1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/param.h> 27 #include <sys/types.h> 28 #include <sys/systm.h> 29 #include <sys/cred.h> 30 #include <sys/proc.h> 31 #include <sys/user.h> 32 #include <sys/time.h> 33 #include <sys/buf.h> 34 #include <sys/vfs.h> 35 #include <sys/vnode.h> 36 #include <sys/socket.h> 37 #include <sys/uio.h> 38 #include <sys/tiuser.h> 39 #include <sys/swap.h> 40 #include <sys/errno.h> 41 #include <sys/debug.h> 42 #include <sys/kmem.h> 43 #include <sys/kstat.h> 44 #include <sys/cmn_err.h> 45 #include <sys/vtrace.h> 46 #include <sys/session.h> 47 #include <sys/dnlc.h> 48 #include <sys/bitmap.h> 49 #include <sys/acl.h> 50 #include <sys/ddi.h> 51 #include <sys/pathname.h> 52 #include <sys/flock.h> 53 #include <sys/dirent.h> 54 #include <sys/flock.h> 55 #include <sys/callb.h> 56 #include <sys/atomic.h> 57 #include <sys/list.h> 58 #include <sys/tsol/tnet.h> 59 #include <sys/priv.h> 60 #include <sys/sdt.h> 61 #include <sys/attr.h> 62 63 #include <inet/ip6.h> 64 65 #include <rpc/types.h> 66 #include <rpc/xdr.h> 67 #include <rpc/auth.h> 68 #include <rpc/clnt.h> 69 70 #include <nfs/nfs.h> 71 #include <nfs/nfs4.h> 72 #include <nfs/nfs_clnt.h> 73 #include <nfs/rnode.h> 74 #include <nfs/nfs_acl.h> 75 76 #include <sys/tsol/label.h> 77 78 /* 79 * The hash queues for the access to active and cached rnodes 80 * are organized as doubly linked lists. A reader/writer lock 81 * for each hash bucket is used to control access and to synchronize 82 * lookups, additions, and deletions from the hash queue. 83 * 84 * The rnode freelist is organized as a doubly linked list with 85 * a head pointer. Additions and deletions are synchronized via 86 * a single mutex. 87 * 88 * In order to add an rnode to the free list, it must be hashed into 89 * a hash queue and the exclusive lock to the hash queue be held. 90 * If an rnode is not hashed into a hash queue, then it is destroyed 91 * because it represents no valuable information that can be reused 92 * about the file. The exclusive lock to the hash queue must be 93 * held in order to prevent a lookup in the hash queue from finding 94 * the rnode and using it and assuming that the rnode is not on the 95 * freelist. The lookup in the hash queue will have the hash queue 96 * locked, either exclusive or shared. 97 * 98 * The vnode reference count for each rnode is not allowed to drop 99 * below 1. This prevents external entities, such as the VM 100 * subsystem, from acquiring references to vnodes already on the 101 * freelist and then trying to place them back on the freelist 102 * when their reference is released. 
This means that when an 103 * rnode is looked up in the hash queues, then either the rnode 104 * is removed from the freelist and that reference is transferred to 105 * the new reference or the vnode reference count must be incremented 106 * accordingly. The mutex for the freelist must be held in order to 107 * accurately test to see if the rnode is on the freelist or not. 108 * The hash queue lock might be held shared and it is possible that 109 * two different threads may race to remove the rnode from the 110 * freelist. This race can be resolved by holding the mutex for the 111 * freelist. Please note that the mutex for the freelist does not 112 * need to be held if the rnode is not on the freelist. It cannot be 113 * placed on the freelist due to the requirement that the thread 114 * putting the rnode on the freelist must hold the exclusive lock 115 * to the hash queue and the thread doing the lookup in the hash 116 * queue is holding either a shared or exclusive lock to the hash 117 * queue. 118 * 119 * The lock ordering is: 120 * 121 * hash bucket lock -> vnode lock 122 * hash bucket lock -> freelist lock 123 */ 124 static rhashq_t *rtable; 125 126 static kmutex_t rpfreelist_lock; 127 static rnode_t *rpfreelist = NULL; 128 static long rnew = 0; 129 long nrnode = 0; 130 131 static int rtablesize; 132 static int rtablemask; 133 134 static int hashlen = 4; 135 136 static struct kmem_cache *rnode_cache; 137 138 /* 139 * Mutex to protect the following variables: 140 * nfs_major 141 * nfs_minor 142 */ 143 kmutex_t nfs_minor_lock; 144 int nfs_major; 145 int nfs_minor; 146 147 /* Do we allow preepoch (negative) time values otw? */ 148 bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */ 149 150 /* 151 * Access cache 152 */ 153 static acache_hash_t *acache; 154 static long nacache; /* used strictly to size the number of hash queues */ 155 156 static int acachesize; 157 static int acachemask; 158 static struct kmem_cache *acache_cache; 159 160 /* 161 * Client side utilities 162 */ 163 164 /* 165 * client side statistics 166 */ 167 static const struct clstat clstat_tmpl = { 168 { "calls", KSTAT_DATA_UINT64 }, 169 { "badcalls", KSTAT_DATA_UINT64 }, 170 { "clgets", KSTAT_DATA_UINT64 }, 171 { "cltoomany", KSTAT_DATA_UINT64 }, 172 #ifdef DEBUG 173 { "clalloc", KSTAT_DATA_UINT64 }, 174 { "noresponse", KSTAT_DATA_UINT64 }, 175 { "failover", KSTAT_DATA_UINT64 }, 176 { "remap", KSTAT_DATA_UINT64 }, 177 #endif 178 }; 179 180 /* 181 * The following are statistics that describe the behavior of the system as a whole 182 * and don't correspond to any one particular zone.
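 * These counters are bumped without locks or atomics (see, for
 * example, the clreclaim counter incremented in clreclaim() below),
 * so their values are approximate under concurrent updates.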
183 */ 184 #ifdef DEBUG 185 static struct clstat_debug { 186 kstat_named_t nrnode; /* number of allocated rnodes */ 187 kstat_named_t access; /* size of access cache */ 188 kstat_named_t dirent; /* size of readdir cache */ 189 kstat_named_t dirents; /* size of readdir buf cache */ 190 kstat_named_t reclaim; /* number of reclaims */ 191 kstat_named_t clreclaim; /* number of cl reclaims */ 192 kstat_named_t f_reclaim; /* number of free reclaims */ 193 kstat_named_t a_reclaim; /* number of active reclaims */ 194 kstat_named_t r_reclaim; /* number of rnode reclaims */ 195 kstat_named_t rpath; /* bytes used to store rpaths */ 196 } clstat_debug = { 197 { "nrnode", KSTAT_DATA_UINT64 }, 198 { "access", KSTAT_DATA_UINT64 }, 199 { "dirent", KSTAT_DATA_UINT64 }, 200 { "dirents", KSTAT_DATA_UINT64 }, 201 { "reclaim", KSTAT_DATA_UINT64 }, 202 { "clreclaim", KSTAT_DATA_UINT64 }, 203 { "f_reclaim", KSTAT_DATA_UINT64 }, 204 { "a_reclaim", KSTAT_DATA_UINT64 }, 205 { "r_reclaim", KSTAT_DATA_UINT64 }, 206 { "r_path", KSTAT_DATA_UINT64 }, 207 }; 208 #endif /* DEBUG */ 209 210 /* 211 * We keep a global list of per-zone client data, so we can clean up all zones 212 * if we get low on memory. 213 */ 214 static list_t nfs_clnt_list; 215 static kmutex_t nfs_clnt_list_lock; 216 static zone_key_t nfsclnt_zone_key; 217 218 static struct kmem_cache *chtab_cache; 219 220 /* 221 * Some servers do not properly update the attributes of the 222 * directory when changes are made. To allow interoperability 223 * with these broken servers, the nfs_disable_rddir_cache 224 * parameter must be set in /etc/system 225 */ 226 int nfs_disable_rddir_cache = 0; 227 228 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **, 229 struct chtab **); 230 void clfree(CLIENT *, struct chtab *); 231 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 232 struct chtab **, struct nfs_clnt *); 233 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 234 struct chtab **, struct nfs_clnt *); 235 static void clreclaim(void *); 236 static int nfs_feedback(int, int, mntinfo_t *); 237 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 238 caddr_t, cred_t *, int *, enum clnt_stat *, int, 239 failinfo_t *); 240 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 241 caddr_t, cred_t *, int *, int, failinfo_t *); 242 static void rinactive(rnode_t *, cred_t *); 243 static int rtablehash(nfs_fhandle *); 244 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *, 245 struct vnodeops *, 246 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 247 cred_t *), 248 int (*)(const void *, const void *), int *, cred_t *, 249 char *, char *); 250 static void rp_rmfree(rnode_t *); 251 static void rp_addhash(rnode_t *); 252 static void rp_rmhash_locked(rnode_t *); 253 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *); 254 static void destroy_rnode(rnode_t *); 255 static void rddir_cache_free(rddir_cache *); 256 static int nfs_free_data_reclaim(rnode_t *); 257 static int nfs_active_data_reclaim(rnode_t *); 258 static int nfs_free_reclaim(void); 259 static int nfs_active_reclaim(void); 260 static int nfs_rnode_reclaim(void); 261 static void nfs_reclaim(void *); 262 static int failover_safe(failinfo_t *); 263 static void failover_newserver(mntinfo_t *mi); 264 static void failover_thread(mntinfo_t *mi); 265 static int failover_wait(mntinfo_t *); 266 static int failover_remap(failinfo_t *); 267 static int failover_lookup(char *, vnode_t *, 268 int (*)(vnode_t *, 
char *, vnode_t **, 269 struct pathname *, int, vnode_t *, cred_t *, int), 270 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 271 vnode_t **); 272 static void nfs_free_r_path(rnode_t *); 273 static void nfs_set_vroot(vnode_t *); 274 static char *nfs_getsrvnames(mntinfo_t *, size_t *); 275 276 /* 277 * from rpcsec module (common/rpcsec) 278 */ 279 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); 280 extern void sec_clnt_freeh(AUTH *); 281 extern void sec_clnt_freeinfo(struct sec_data *); 282 283 /* 284 * used in mount policy 285 */ 286 extern ts_label_t *getflabel_cipso(vfs_t *); 287 288 /* 289 * EIO or EINTR are not recoverable errors. 290 */ 291 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) 292 293 #ifdef DEBUG 294 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n" 295 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n" 296 #else 297 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n" 298 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n" 299 #endif 300 /* 301 * Common handle get program for NFS, NFS ACL, and NFS AUTH client. 302 */ 303 static int 304 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 305 struct chtab **chp, struct nfs_clnt *nfscl) 306 { 307 struct chhead *ch, *newch; 308 struct chhead **plistp; 309 struct chtab *cp; 310 int error; 311 k_sigset_t smask; 312 313 if (newcl == NULL || chp == NULL || ci == NULL) 314 return (EINVAL); 315 316 *newcl = NULL; 317 *chp = NULL; 318 319 /* 320 * Find an unused handle or create one 321 */ 322 newch = NULL; 323 nfscl->nfscl_stat.clgets.value.ui64++; 324 top: 325 /* 326 * Find the correct entry in the cache to check for free 327 * client handles. The search is based on the RPC program 328 * number, program version number, dev_t for the transport 329 * device, and the protocol family. 330 */ 331 mutex_enter(&nfscl->nfscl_chtable_lock); 332 plistp = &nfscl->nfscl_chtable; 333 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 334 if (ch->ch_prog == ci->cl_prog && 335 ch->ch_vers == ci->cl_vers && 336 ch->ch_dev == svp->sv_knconf->knc_rdev && 337 (strcmp(ch->ch_protofmly, 338 svp->sv_knconf->knc_protofmly) == 0)) 339 break; 340 plistp = &ch->ch_next; 341 } 342 343 /* 344 * If we didn't find a cache entry for this quadruple, then 345 * create one. If we don't have one already preallocated, 346 * then drop the cache lock, create one, and then start over. 347 * If we did have a preallocated entry, then just add it to 348 * the front of the list. 349 */ 350 if (ch == NULL) { 351 if (newch == NULL) { 352 mutex_exit(&nfscl->nfscl_chtable_lock); 353 newch = kmem_alloc(sizeof (*newch), KM_SLEEP); 354 newch->ch_timesused = 0; 355 newch->ch_prog = ci->cl_prog; 356 newch->ch_vers = ci->cl_vers; 357 newch->ch_dev = svp->sv_knconf->knc_rdev; 358 newch->ch_protofmly = kmem_alloc( 359 strlen(svp->sv_knconf->knc_protofmly) + 1, 360 KM_SLEEP); 361 (void) strcpy(newch->ch_protofmly, 362 svp->sv_knconf->knc_protofmly); 363 newch->ch_list = NULL; 364 goto top; 365 } 366 ch = newch; 367 newch = NULL; 368 ch->ch_next = nfscl->nfscl_chtable; 369 nfscl->nfscl_chtable = ch; 370 /* 371 * We found a cache entry, but if it isn't on the front of the 372 * list, then move it to the front of the list to try to take 373 * advantage of locality of operations. 
374 */ 375 } else if (ch != nfscl->nfscl_chtable) { 376 *plistp = ch->ch_next; 377 ch->ch_next = nfscl->nfscl_chtable; 378 nfscl->nfscl_chtable = ch; 379 } 380 381 /* 382 * If there was a free client handle cached, then remove it 383 * from the list, init it, and use it. 384 */ 385 if (ch->ch_list != NULL) { 386 cp = ch->ch_list; 387 ch->ch_list = cp->ch_list; 388 mutex_exit(&nfscl->nfscl_chtable_lock); 389 if (newch != NULL) { 390 kmem_free(newch->ch_protofmly, 391 strlen(newch->ch_protofmly) + 1); 392 kmem_free(newch, sizeof (*newch)); 393 } 394 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, 395 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); 396 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 397 &cp->ch_client->cl_auth); 398 if (error || cp->ch_client->cl_auth == NULL) { 399 CLNT_DESTROY(cp->ch_client); 400 kmem_cache_free(chtab_cache, cp); 401 return ((error != 0) ? error : EINTR); 402 } 403 ch->ch_timesused++; 404 *newcl = cp->ch_client; 405 *chp = cp; 406 return (0); 407 } 408 409 /* 410 * There weren't any free client handles which fit, so allocate 411 * a new one and use that. 412 */ 413 #ifdef DEBUG 414 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1); 415 #endif 416 mutex_exit(&nfscl->nfscl_chtable_lock); 417 418 nfscl->nfscl_stat.cltoomany.value.ui64++; 419 if (newch != NULL) { 420 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); 421 kmem_free(newch, sizeof (*newch)); 422 } 423 424 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); 425 cp->ch_head = ch; 426 427 sigintr(&smask, (int)ci->cl_flags & MI_INT); 428 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, 429 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); 430 sigunintr(&smask); 431 432 if (error != 0) { 433 kmem_cache_free(chtab_cache, cp); 434 #ifdef DEBUG 435 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 436 #endif 437 /* 438 * Warning is unnecessary if error is EINTR. 439 */ 440 if (error != EINTR) { 441 nfs_cmn_err(error, CE_WARN, 442 "clget: couldn't create handle: %m\n"); 443 } 444 return (error); 445 } 446 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); 447 auth_destroy(cp->ch_client->cl_auth); 448 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 449 &cp->ch_client->cl_auth); 450 if (error || cp->ch_client->cl_auth == NULL) { 451 CLNT_DESTROY(cp->ch_client); 452 kmem_cache_free(chtab_cache, cp); 453 #ifdef DEBUG 454 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 455 #endif 456 return ((error != 0) ? error : EINTR); 457 } 458 ch->ch_timesused++; 459 *newcl = cp->ch_client; 460 ASSERT(cp->ch_client->cl_nosignal == FALSE); 461 *chp = cp; 462 return (0); 463 } 464 465 int 466 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 467 struct chtab **chp) 468 { 469 struct nfs_clnt *nfscl; 470 471 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 472 ASSERT(nfscl != NULL); 473 474 return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); 475 } 476 477 static int 478 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 479 struct chtab **chp, struct nfs_clnt *nfscl) 480 { 481 clinfo_t ci; 482 int error; 483 484 /* 485 * Set read buffer size to rsize 486 * and add room for RPC headers. 487 */ 488 ci.cl_readsize = mi->mi_tsize; 489 if (ci.cl_readsize != 0) 490 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 491 492 /* 493 * If soft mount and server is down just try once. 494 * meaning: do not retransmit. 
495 */ 496 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 497 ci.cl_retrans = 0; 498 else 499 ci.cl_retrans = mi->mi_retrans; 500 501 ci.cl_prog = NFS_ACL_PROGRAM; 502 ci.cl_vers = mi->mi_vers; 503 ci.cl_flags = mi->mi_flags; 504 505 /* 506 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 507 * security flavor, the client tries to establish a security context 508 * by contacting the server. If the connection is timed out or reset, 509 * e.g. server reboot, we will try again. 510 */ 511 do { 512 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 513 514 if (error == 0) 515 break; 516 517 /* 518 * For forced unmount or zone shutdown, bail out, no retry. 519 */ 520 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 521 error = EIO; 522 break; 523 } 524 525 /* do not retry for softmount */ 526 if (!(mi->mi_flags & MI_HARD)) 527 break; 528 529 /* let the caller deal with the failover case */ 530 if (FAILOVER_MOUNT(mi)) 531 break; 532 533 } while (error == ETIMEDOUT || error == ECONNRESET); 534 535 return (error); 536 } 537 538 static int 539 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 540 struct chtab **chp, struct nfs_clnt *nfscl) 541 { 542 clinfo_t ci; 543 int error; 544 545 /* 546 * Set read buffer size to rsize 547 * and add room for RPC headers. 548 */ 549 ci.cl_readsize = mi->mi_tsize; 550 if (ci.cl_readsize != 0) 551 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 552 553 /* 554 * If soft mount and server is down just try once. 555 * meaning: do not retransmit. 556 */ 557 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 558 ci.cl_retrans = 0; 559 else 560 ci.cl_retrans = mi->mi_retrans; 561 562 ci.cl_prog = mi->mi_prog; 563 ci.cl_vers = mi->mi_vers; 564 ci.cl_flags = mi->mi_flags; 565 566 /* 567 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 568 * security flavor, the client tries to establish a security context 569 * by contacting the server. If the connection is timed out or reset, 570 * e.g. server reboot, we will try again. 571 */ 572 do { 573 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 574 575 if (error == 0) 576 break; 577 578 /* 579 * For forced unmount or zone shutdown, bail out, no retry. 580 */ 581 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 582 error = EIO; 583 break; 584 } 585 586 /* do not retry for softmount */ 587 if (!(mi->mi_flags & MI_HARD)) 588 break; 589 590 /* let the caller deal with the failover case */ 591 if (FAILOVER_MOUNT(mi)) 592 break; 593 594 } while (error == ETIMEDOUT || error == ECONNRESET); 595 596 return (error); 597 } 598 599 static void 600 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) 601 { 602 if (cl->cl_auth != NULL) { 603 sec_clnt_freeh(cl->cl_auth); 604 cl->cl_auth = NULL; 605 } 606 607 /* 608 * Timestamp this cache entry so that we know when it was last 609 * used. 610 */ 611 cp->ch_freed = gethrestime_sec(); 612 613 /* 614 * Add the free client handle to the front of the list. 615 * This way, the list will be sorted in youngest to oldest 616 * order. 
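 * clreclaim_zone() relies on this ordering: once it finds the first
 * entry on a list that is old enough to reclaim, it can take that
 * entry and everything after it without examining their timestamps.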
617 */ 618 mutex_enter(&nfscl->nfscl_chtable_lock); 619 cp->ch_list = cp->ch_head->ch_list; 620 cp->ch_head->ch_list = cp; 621 mutex_exit(&nfscl->nfscl_chtable_lock); 622 } 623 624 void 625 clfree(CLIENT *cl, struct chtab *cp) 626 { 627 struct nfs_clnt *nfscl; 628 629 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 630 ASSERT(nfscl != NULL); 631 632 clfree_impl(cl, cp, nfscl); 633 } 634 635 #define CL_HOLDTIME 60 /* time to hold client handles */ 636 637 static void 638 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime) 639 { 640 struct chhead *ch; 641 struct chtab *cp; /* list of objects that can be reclaimed */ 642 struct chtab *cpe; 643 struct chtab *cpl; 644 struct chtab **cpp; 645 #ifdef DEBUG 646 int n = 0; 647 #endif 648 649 /* 650 * Need to reclaim some memory, so step through the cache 651 * looking through the lists for entries which can be freed. 652 */ 653 cp = NULL; 654 655 mutex_enter(&nfscl->nfscl_chtable_lock); 656 657 /* 658 * Here we step through each non-NULL quadruple and start to 659 * construct the reclaim list pointed to by cp. Note that 660 * cp will contain all eligible chtab entries. When this traversal 661 * completes, chtab entries from the last quadruple will be at the 662 * front of cp and entries from previously inspected quadruples have 663 * been appended to the rear of cp. 664 */ 665 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 666 if (ch->ch_list == NULL) 667 continue; 668 /* 669 * Search each list for entries older than 670 * cl_holdtime seconds. The lists are maintained 671 * in youngest to oldest order so that when the 672 * first entry is found which is old enough, then 673 * all of the rest of the entries on the list will 674 * be old enough as well. 675 */ 676 cpl = ch->ch_list; 677 cpp = &ch->ch_list; 678 while (cpl != NULL && 679 cpl->ch_freed + cl_holdtime > gethrestime_sec()) { 680 cpp = &cpl->ch_list; 681 cpl = cpl->ch_list; 682 } 683 if (cpl != NULL) { 684 *cpp = NULL; 685 if (cp != NULL) { 686 cpe = cpl; 687 while (cpe->ch_list != NULL) 688 cpe = cpe->ch_list; 689 cpe->ch_list = cp; 690 } 691 cp = cpl; 692 } 693 } 694 695 mutex_exit(&nfscl->nfscl_chtable_lock); 696 697 /* 698 * If cp is empty, then there is nothing to reclaim here. 699 */ 700 if (cp == NULL) 701 return; 702 703 /* 704 * Step through the list of entries to free, destroying each client 705 * handle and kmem_cache_free'ing the memory for each entry. 706 */ 707 while (cp != NULL) { 708 #ifdef DEBUG 709 n++; 710 #endif 711 CLNT_DESTROY(cp->ch_client); 712 cpl = cp->ch_list; 713 kmem_cache_free(chtab_cache, cp); 714 cp = cpl; 715 } 716 717 #ifdef DEBUG 718 /* 719 * Update clalloc so that nfsstat shows the current number 720 * of allocated client handles. 721 */ 722 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n); 723 #endif 724 } 725 726 /* ARGSUSED */ 727 static void 728 clreclaim(void *all) 729 { 730 struct nfs_clnt *nfscl; 731 732 #ifdef DEBUG 733 clstat_debug.clreclaim.value.ui64++; 734 #endif 735 /* 736 * The system is low on memory; go through and try to reclaim some from 737 * every zone on the system.
738 */ 739 mutex_enter(&nfs_clnt_list_lock); 740 nfscl = list_head(&nfs_clnt_list); 741 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) 742 clreclaim_zone(nfscl, CL_HOLDTIME); 743 mutex_exit(&nfs_clnt_list_lock); 744 } 745 746 /* 747 * Minimum time-out values indexed by call type 748 * These units are in "eights" of a second to avoid multiplies 749 */ 750 static unsigned int minimum_timeo[] = { 751 6, 7, 10 752 }; 753 754 /* 755 * Back off for retransmission timeout, MAXTIMO is in hz of a sec 756 */ 757 #define MAXTIMO (20*hz) 758 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 759 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 760 761 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ 762 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ 763 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ 764 765 /* 766 * Function called when rfscall notices that we have been 767 * re-transmitting, or when we get a response without retransmissions. 768 * Return 1 if the transfer size was adjusted down - 0 if no change. 769 */ 770 static int 771 nfs_feedback(int flag, int which, mntinfo_t *mi) 772 { 773 int kind; 774 int r = 0; 775 776 mutex_enter(&mi->mi_lock); 777 if (flag == FEEDBACK_REXMIT1) { 778 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && 779 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) 780 goto done; 781 if (mi->mi_curread > MIN_NFS_TSIZE) { 782 mi->mi_curread /= 2; 783 if (mi->mi_curread < MIN_NFS_TSIZE) 784 mi->mi_curread = MIN_NFS_TSIZE; 785 r = 1; 786 } 787 788 if (mi->mi_curwrite > MIN_NFS_TSIZE) { 789 mi->mi_curwrite /= 2; 790 if (mi->mi_curwrite < MIN_NFS_TSIZE) 791 mi->mi_curwrite = MIN_NFS_TSIZE; 792 r = 1; 793 } 794 } else if (flag == FEEDBACK_OK) { 795 kind = mi->mi_timer_type[which]; 796 if (kind == 0 || 797 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) 798 goto done; 799 if (kind == 1) { 800 if (mi->mi_curread >= mi->mi_tsize) 801 goto done; 802 mi->mi_curread += MIN_NFS_TSIZE; 803 if (mi->mi_curread > mi->mi_tsize/2) 804 mi->mi_curread = mi->mi_tsize; 805 } else if (kind == 2) { 806 if (mi->mi_curwrite >= mi->mi_stsize) 807 goto done; 808 mi->mi_curwrite += MIN_NFS_TSIZE; 809 if (mi->mi_curwrite > mi->mi_stsize/2) 810 mi->mi_curwrite = mi->mi_stsize; 811 } 812 } 813 done: 814 mutex_exit(&mi->mi_lock); 815 return (r); 816 } 817 818 #ifdef DEBUG 819 static int rfs2call_hits = 0; 820 static int rfs2call_misses = 0; 821 #endif 822 823 int 824 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 825 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 826 enum nfsstat *statusp, int flags, failinfo_t *fi) 827 { 828 int rpcerror; 829 enum clnt_stat rpc_status; 830 831 ASSERT(statusp != NULL); 832 833 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 834 cr, douprintf, &rpc_status, flags, fi); 835 if (!rpcerror) { 836 /* 837 * See crnetadjust() for comments. 
838 */ 839 if (*statusp == NFSERR_ACCES && 840 (cr = crnetadjust(cr)) != NULL) { 841 #ifdef DEBUG 842 rfs2call_hits++; 843 #endif 844 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, 845 resp, cr, douprintf, NULL, flags, fi); 846 crfree(cr); 847 #ifdef DEBUG 848 if (*statusp == NFSERR_ACCES) 849 rfs2call_misses++; 850 #endif 851 } 852 } else if (rpc_status == RPC_PROCUNAVAIL) { 853 *statusp = NFSERR_OPNOTSUPP; 854 rpcerror = 0; 855 } 856 857 return (rpcerror); 858 } 859 860 #define NFS3_JUKEBOX_DELAY 10 * hz 861 862 static clock_t nfs3_jukebox_delay = 0; 863 864 #ifdef DEBUG 865 static int rfs3call_hits = 0; 866 static int rfs3call_misses = 0; 867 #endif 868 869 int 870 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 871 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 872 nfsstat3 *statusp, int flags, failinfo_t *fi) 873 { 874 int rpcerror; 875 int user_informed; 876 877 user_informed = 0; 878 do { 879 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 880 cr, douprintf, NULL, flags, fi); 881 if (!rpcerror) { 882 cred_t *crr; 883 if (*statusp == NFS3ERR_JUKEBOX) { 884 if (ttoproc(curthread) == &p0) { 885 rpcerror = EAGAIN; 886 break; 887 } 888 if (!user_informed) { 889 user_informed = 1; 890 uprintf( 891 "file temporarily unavailable on the server, retrying...\n"); 892 } 893 delay(nfs3_jukebox_delay); 894 } 895 /* 896 * See crnetadjust() for comments. 897 */ 898 else if (*statusp == NFS3ERR_ACCES && 899 (crr = crnetadjust(cr)) != NULL) { 900 #ifdef DEBUG 901 rfs3call_hits++; 902 #endif 903 rpcerror = rfscall(mi, which, xdrargs, argsp, 904 xdrres, resp, crr, douprintf, 905 NULL, flags, fi); 906 907 crfree(crr); 908 #ifdef DEBUG 909 if (*statusp == NFS3ERR_ACCES) 910 rfs3call_misses++; 911 #endif 912 } 913 } 914 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 915 916 return (rpcerror); 917 } 918 919 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 920 #define INC_READERS(mi) { \ 921 mi->mi_readers++; \ 922 } 923 #define DEC_READERS(mi) { \ 924 mi->mi_readers--; \ 925 if (mi->mi_readers == 0) \ 926 cv_broadcast(&mi->mi_failover_cv); \ 927 } 928 929 static int 930 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 931 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 932 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 933 { 934 CLIENT *client; 935 struct chtab *ch; 936 cred_t *cr = icr; 937 enum clnt_stat status; 938 struct rpc_err rpcerr, rpcerr_tmp; 939 struct timeval wait; 940 int timeo; /* in units of hz */ 941 int my_rsize, my_wsize; 942 bool_t tryagain; 943 bool_t cred_cloned = FALSE; 944 k_sigset_t smask; 945 servinfo_t *svp; 946 struct nfs_clnt *nfscl; 947 zoneid_t zoneid = getzoneid(); 948 char *msg; 949 #ifdef DEBUG 950 char *bufp; 951 #endif 952 953 954 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 955 "rfscall_start:which %d mi %p", which, mi); 956 957 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 958 ASSERT(nfscl != NULL); 959 960 nfscl->nfscl_stat.calls.value.ui64++; 961 mi->mi_reqs[which].value.ui64++; 962 963 rpcerr.re_status = RPC_SUCCESS; 964 965 /* 966 * In case of forced unmount or zone shutdown, return EIO. 967 */ 968 969 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 970 rpcerr.re_status = RPC_FAILED; 971 rpcerr.re_errno = EIO; 972 return (rpcerr.re_errno); 973 } 974 975 /* 976 * Remember the transfer sizes in case 977 * nfs_feedback changes them underneath us. 
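 * On MI_DYNAMIC mounts, if a retransmission becomes necessary and the
 * saved sizes no longer match mi_curread/mi_curwrite (or nfs_feedback()
 * shrinks them), rfscall returns ENFS_TRYAGAIN so the read or write can
 * be redone at the vnops level with the new transfer size.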
978 */ 979 my_rsize = mi->mi_curread; 980 my_wsize = mi->mi_curwrite; 981 982 /* 983 * NFS client failover support 984 * 985 * If this rnode is not in sync with the current server (VALID_FH), 986 * we'd like to do a remap to get in sync. We can be interrupted 987 * in failover_remap(), and if so we'll bail. Otherwise, we'll 988 * use the best info we have to try the RPC. Part of that is 989 * unconditionally updating the filehandle copy kept for V3. 990 * 991 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible 992 * rw_enter(); we're trying to keep the current server from being 993 * changed on us until we're done with the remapping and have a 994 * matching client handle. We don't want to send a filehandle 995 * to the wrong host. 996 */ 997 failoverretry: 998 if (FAILOVER_MOUNT(mi)) { 999 mutex_enter(&mi->mi_lock); 1000 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1001 if (failover_wait(mi)) { 1002 mutex_exit(&mi->mi_lock); 1003 return (EINTR); 1004 } 1005 } 1006 INC_READERS(mi); 1007 mutex_exit(&mi->mi_lock); 1008 if (fi) { 1009 if (!VALID_FH(fi) && 1010 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1011 int remaperr; 1012 1013 svp = mi->mi_curr_serv; 1014 remaperr = failover_remap(fi); 1015 if (remaperr != 0) { 1016 #ifdef DEBUG 1017 if (remaperr != EINTR) 1018 nfs_cmn_err(remaperr, CE_WARN, 1019 "rfscall couldn't failover: %m"); 1020 #endif 1021 mutex_enter(&mi->mi_lock); 1022 DEC_READERS(mi); 1023 mutex_exit(&mi->mi_lock); 1024 /* 1025 * If failover_remap returns ETIMEDOUT 1026 * and the filesystem is hard mounted 1027 * we have to retry the call with a new 1028 * server. 1029 */ 1030 if ((mi->mi_flags & MI_HARD) && 1031 IS_RECOVERABLE_ERROR(remaperr)) { 1032 if (svp == mi->mi_curr_serv) 1033 failover_newserver(mi); 1034 rpcerr.re_status = RPC_SUCCESS; 1035 goto failoverretry; 1036 } 1037 rpcerr.re_errno = remaperr; 1038 return (remaperr); 1039 } 1040 } 1041 if (fi->fhp && fi->copyproc) 1042 (*fi->copyproc)(fi->fhp, fi->vp); 1043 } 1044 } 1045 1046 /* For TSOL, use a new cred which has net_mac_aware flag */ 1047 if (!cred_cloned && is_system_labeled()) { 1048 cred_cloned = TRUE; 1049 cr = crdup(icr); 1050 (void) setpflags(NET_MAC_AWARE, 1, cr); 1051 } 1052 1053 /* 1054 * clget() calls clnt_tli_kinit() which clears the xid, so we 1055 * are guaranteed to reprocess the retry as a new request. 1056 */ 1057 svp = mi->mi_curr_serv; 1058 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); 1059 1060 if (FAILOVER_MOUNT(mi)) { 1061 mutex_enter(&mi->mi_lock); 1062 DEC_READERS(mi); 1063 mutex_exit(&mi->mi_lock); 1064 1065 if ((rpcerr.re_errno == ETIMEDOUT || 1066 rpcerr.re_errno == ECONNRESET) && 1067 failover_safe(fi)) { 1068 if (svp == mi->mi_curr_serv) 1069 failover_newserver(mi); 1070 goto failoverretry; 1071 } 1072 } 1073 if (rpcerr.re_errno != 0) 1074 return (rpcerr.re_errno); 1075 1076 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1077 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1078 timeo = (mi->mi_timeo * hz) / 10; 1079 } else { 1080 mutex_enter(&mi->mi_lock); 1081 timeo = CLNT_SETTIMERS(client, 1082 &(mi->mi_timers[mi->mi_timer_type[which]]), 1083 &(mi->mi_timers[NFS_CALLTYPES]), 1084 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, 1085 (void (*)())NULL, (caddr_t)mi, 0); 1086 mutex_exit(&mi->mi_lock); 1087 } 1088 1089 /* 1090 * If hard mounted fs, retry call forever unless hard error occurs.
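 * Each timed-out attempt backs the timeout off (doubling via backoff(),
 * capped at MAXTIMO) before retrying. Soft mounts and RFSCALL_SOFT
 * callers generally break out of the loop instead of retrying, and
 * failover mounts, when failover_safe(), jump back to failoverretry
 * to select a new server.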
1091 */ 1092 do { 1093 tryagain = FALSE; 1094 1095 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1096 status = RPC_FAILED; 1097 rpcerr.re_status = RPC_FAILED; 1098 rpcerr.re_errno = EIO; 1099 break; 1100 } 1101 1102 TICK_TO_TIMEVAL(timeo, &wait); 1103 1104 /* 1105 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1106 * and SIGTERM. (Preserving the existing masks). 1107 * Mask out SIGINT if mount option nointr is specified. 1108 */ 1109 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1110 if (!(mi->mi_flags & MI_INT)) 1111 client->cl_nosignal = TRUE; 1112 1113 /* 1114 * If there is a current signal, then don't bother 1115 * even trying to send out the request because we 1116 * won't be able to block waiting for the response. 1117 * Simply assume RPC_INTR and get on with it. 1118 */ 1119 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1120 status = RPC_INTR; 1121 else { 1122 status = CLNT_CALL(client, which, xdrargs, argsp, 1123 xdrres, resp, wait); 1124 } 1125 1126 if (!(mi->mi_flags & MI_INT)) 1127 client->cl_nosignal = FALSE; 1128 /* 1129 * restore original signal mask 1130 */ 1131 sigunintr(&smask); 1132 1133 switch (status) { 1134 case RPC_SUCCESS: 1135 if ((mi->mi_flags & MI_DYNAMIC) && 1136 mi->mi_timer_type[which] != 0 && 1137 (mi->mi_curread != my_rsize || 1138 mi->mi_curwrite != my_wsize)) 1139 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1140 break; 1141 1142 case RPC_INTR: 1143 /* 1144 * There is no way to recover from this error, 1145 * even if mount option nointr is specified. 1146 * SIGKILL, for example, cannot be blocked. 1147 */ 1148 rpcerr.re_status = RPC_INTR; 1149 rpcerr.re_errno = EINTR; 1150 break; 1151 1152 case RPC_UDERROR: 1153 /* 1154 * If the NFS server is local (vold) and 1155 * it goes away then we get RPC_UDERROR. 1156 * This is a retryable error, so we would 1157 * loop, so check to see if the specific 1158 * error was ECONNRESET, indicating that 1159 * target did not exist at all. If so, 1160 * return with RPC_PROGUNAVAIL and 1161 * ECONNRESET to indicate why. 1162 */ 1163 CLNT_GETERR(client, &rpcerr); 1164 if (rpcerr.re_errno == ECONNRESET) { 1165 rpcerr.re_status = RPC_PROGUNAVAIL; 1166 rpcerr.re_errno = ECONNRESET; 1167 break; 1168 } 1169 /*FALLTHROUGH*/ 1170 1171 default: /* probably RPC_TIMEDOUT */ 1172 if (IS_UNRECOVERABLE_RPC(status)) 1173 break; 1174 1175 /* 1176 * increment server not responding count 1177 */ 1178 mutex_enter(&mi->mi_lock); 1179 mi->mi_noresponse++; 1180 mutex_exit(&mi->mi_lock); 1181 #ifdef DEBUG 1182 nfscl->nfscl_stat.noresponse.value.ui64++; 1183 #endif 1184 1185 if (!(mi->mi_flags & MI_HARD)) { 1186 if (!(mi->mi_flags & MI_SEMISOFT) || 1187 (mi->mi_ss_call_type[which] == 0)) 1188 break; 1189 } 1190 1191 /* 1192 * The call is in progress (over COTS). 1193 * Try the CLNT_CALL again, but don't 1194 * print a noisy error message. 1195 */ 1196 if (status == RPC_INPROGRESS) { 1197 tryagain = TRUE; 1198 break; 1199 } 1200 1201 if (flags & RFSCALL_SOFT) 1202 break; 1203 1204 /* 1205 * On zone shutdown, just move on. 1206 */ 1207 if (zone_status_get(curproc->p_zone) >= 1208 ZONE_IS_SHUTTING_DOWN) { 1209 rpcerr.re_status = RPC_FAILED; 1210 rpcerr.re_errno = EIO; 1211 break; 1212 } 1213 1214 /* 1215 * NFS client failover support 1216 * 1217 * If the current server just failed us, we'll 1218 * start the process of finding a new server. 1219 * After that, we can just retry. 
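 * The client handle is released with clfree_impl() before jumping
 * back, so that the next pass through failoverretry obtains a handle
 * bound to whichever server failover selects.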
1220 */ 1221 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1222 if (svp == mi->mi_curr_serv) 1223 failover_newserver(mi); 1224 clfree_impl(client, ch, nfscl); 1225 goto failoverretry; 1226 } 1227 1228 tryagain = TRUE; 1229 timeo = backoff(timeo); 1230 1231 CLNT_GETERR(client, &rpcerr_tmp); 1232 if ((status == RPC_CANTSEND) && 1233 (rpcerr_tmp.re_errno == ENOBUFS)) 1234 msg = SRV_QFULL_MSG; 1235 else 1236 msg = SRV_NOTRESP_MSG; 1237 1238 mutex_enter(&mi->mi_lock); 1239 if (!(mi->mi_flags & MI_PRINTED)) { 1240 mi->mi_flags |= MI_PRINTED; 1241 mutex_exit(&mi->mi_lock); 1242 #ifdef DEBUG 1243 zprintf(zoneid, msg, mi->mi_vers, 1244 svp->sv_hostname); 1245 #else 1246 zprintf(zoneid, msg, svp->sv_hostname); 1247 #endif 1248 } else 1249 mutex_exit(&mi->mi_lock); 1250 if (*douprintf && nfs_has_ctty()) { 1251 *douprintf = 0; 1252 if (!(mi->mi_flags & MI_NOPRINT)) 1253 #ifdef DEBUG 1254 uprintf(msg, mi->mi_vers, 1255 svp->sv_hostname); 1256 #else 1257 uprintf(msg, svp->sv_hostname); 1258 #endif 1259 } 1260 1261 /* 1262 * If doing dynamic adjustment of transfer 1263 * size and if it's a read or write call 1264 * and if the transfer size changed while 1265 * retransmitting or if the feedback routine 1266 * changed the transfer size, 1267 * then exit rfscall so that the transfer 1268 * size can be adjusted at the vnops level. 1269 */ 1270 if ((mi->mi_flags & MI_DYNAMIC) && 1271 mi->mi_timer_type[which] != 0 && 1272 (mi->mi_curread != my_rsize || 1273 mi->mi_curwrite != my_wsize || 1274 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1275 /* 1276 * On read or write calls, return 1277 * back to the vnode ops level if 1278 * the transfer size changed. 1279 */ 1280 clfree_impl(client, ch, nfscl); 1281 if (cred_cloned) 1282 crfree(cr); 1283 return (ENFS_TRYAGAIN); 1284 } 1285 } 1286 } while (tryagain); 1287 1288 if (status != RPC_SUCCESS) { 1289 /* 1290 * Let soft mounts use the timed out message. 1291 */ 1292 if (status == RPC_INPROGRESS) 1293 status = RPC_TIMEDOUT; 1294 nfscl->nfscl_stat.badcalls.value.ui64++; 1295 if (status != RPC_INTR) { 1296 mutex_enter(&mi->mi_lock); 1297 mi->mi_flags |= MI_DOWN; 1298 mutex_exit(&mi->mi_lock); 1299 CLNT_GETERR(client, &rpcerr); 1300 #ifdef DEBUG 1301 bufp = clnt_sperror(client, svp->sv_hostname); 1302 zprintf(zoneid, "NFS%d %s failed for %s\n", 1303 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1304 if (nfs_has_ctty()) { 1305 if (!(mi->mi_flags & MI_NOPRINT)) { 1306 uprintf("NFS%d %s failed for %s\n", 1307 mi->mi_vers, mi->mi_rfsnames[which], 1308 bufp); 1309 } 1310 } 1311 kmem_free(bufp, MAXPATHLEN); 1312 #else 1313 zprintf(zoneid, 1314 "NFS %s failed for server %s: error %d (%s)\n", 1315 mi->mi_rfsnames[which], svp->sv_hostname, 1316 status, clnt_sperrno(status)); 1317 if (nfs_has_ctty()) { 1318 if (!(mi->mi_flags & MI_NOPRINT)) { 1319 uprintf( 1320 "NFS %s failed for server %s: error %d (%s)\n", 1321 mi->mi_rfsnames[which], 1322 svp->sv_hostname, status, 1323 clnt_sperrno(status)); 1324 } 1325 } 1326 #endif 1327 /* 1328 * when CLNT_CALL() fails with RPC_AUTHERROR, 1329 * re_errno is set appropriately depending on 1330 * the authentication error 1331 */ 1332 if (status == RPC_VERSMISMATCH || 1333 status == RPC_PROGVERSMISMATCH) 1334 rpcerr.re_errno = EIO; 1335 } 1336 } else { 1337 /* 1338 * Test the value of mi_down and mi_printed without 1339 * holding the mi_lock mutex. If they are both zero, 1340 * then it is okay to skip the down and printed 1341 * processing. This saves on a mutex_enter and 1342 * mutex_exit pair for a normal, successful RPC. 
1343 * This was just complete overhead. 1344 */ 1345 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1346 mutex_enter(&mi->mi_lock); 1347 mi->mi_flags &= ~MI_DOWN; 1348 if (mi->mi_flags & MI_PRINTED) { 1349 mi->mi_flags &= ~MI_PRINTED; 1350 mutex_exit(&mi->mi_lock); 1351 #ifdef DEBUG 1352 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1353 zprintf(zoneid, "NFS%d server %s ok\n", 1354 mi->mi_vers, svp->sv_hostname); 1355 #else 1356 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1357 zprintf(zoneid, "NFS server %s ok\n", 1358 svp->sv_hostname); 1359 #endif 1360 } else 1361 mutex_exit(&mi->mi_lock); 1362 } 1363 1364 if (*douprintf == 0) { 1365 if (!(mi->mi_flags & MI_NOPRINT)) 1366 #ifdef DEBUG 1367 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1368 uprintf("NFS%d server %s ok\n", 1369 mi->mi_vers, svp->sv_hostname); 1370 #else 1371 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1372 uprintf("NFS server %s ok\n", svp->sv_hostname); 1373 #endif 1374 *douprintf = 1; 1375 } 1376 } 1377 1378 clfree_impl(client, ch, nfscl); 1379 if (cred_cloned) 1380 crfree(cr); 1381 1382 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1383 1384 if (rpc_status != NULL) 1385 *rpc_status = rpcerr.re_status; 1386 1387 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1388 rpcerr.re_errno); 1389 1390 return (rpcerr.re_errno); 1391 } 1392 1393 #ifdef DEBUG 1394 static int acl2call_hits = 0; 1395 static int acl2call_misses = 0; 1396 #endif 1397 1398 int 1399 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1400 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1401 enum nfsstat *statusp, int flags, failinfo_t *fi) 1402 { 1403 int rpcerror; 1404 1405 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1406 cr, douprintf, flags, fi); 1407 if (!rpcerror) { 1408 /* 1409 * See comments with crnetadjust(). 1410 */ 1411 if (*statusp == NFSERR_ACCES && 1412 (cr = crnetadjust(cr)) != NULL) { 1413 #ifdef DEBUG 1414 acl2call_hits++; 1415 #endif 1416 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1417 resp, cr, douprintf, flags, fi); 1418 crfree(cr); 1419 #ifdef DEBUG 1420 if (*statusp == NFSERR_ACCES) 1421 acl2call_misses++; 1422 #endif 1423 } 1424 } 1425 1426 return (rpcerror); 1427 } 1428 1429 #ifdef DEBUG 1430 static int acl3call_hits = 0; 1431 static int acl3call_misses = 0; 1432 #endif 1433 1434 int 1435 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1436 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1437 nfsstat3 *statusp, int flags, failinfo_t *fi) 1438 { 1439 int rpcerror; 1440 int user_informed; 1441 1442 user_informed = 0; 1443 1444 do { 1445 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1446 cr, douprintf, flags, fi); 1447 if (!rpcerror) { 1448 cred_t *crr; 1449 if (*statusp == NFS3ERR_JUKEBOX) { 1450 if (!user_informed) { 1451 user_informed = 1; 1452 uprintf( 1453 "file temporarily unavailable on the server, retrying...\n"); 1454 } 1455 delay(nfs3_jukebox_delay); 1456 } 1457 /* 1458 * See crnetadjust() for comments. 
1459 */ 1460 else if (*statusp == NFS3ERR_ACCES && 1461 (crr = crnetadjust(cr)) != NULL) { 1462 #ifdef DEBUG 1463 acl3call_hits++; 1464 #endif 1465 rpcerror = aclcall(mi, which, xdrargs, argsp, 1466 xdrres, resp, crr, douprintf, flags, fi); 1467 1468 crfree(crr); 1469 #ifdef DEBUG 1470 if (*statusp == NFS3ERR_ACCES) 1471 acl3call_misses++; 1472 #endif 1473 } 1474 } 1475 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 1476 1477 return (rpcerror); 1478 } 1479 1480 static int 1481 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1482 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 1483 int flags, failinfo_t *fi) 1484 { 1485 CLIENT *client; 1486 struct chtab *ch; 1487 cred_t *cr = icr; 1488 bool_t cred_cloned = FALSE; 1489 enum clnt_stat status; 1490 struct rpc_err rpcerr; 1491 struct timeval wait; 1492 int timeo; /* in units of hz */ 1493 #if 0 /* notyet */ 1494 int my_rsize, my_wsize; 1495 #endif 1496 bool_t tryagain; 1497 k_sigset_t smask; 1498 servinfo_t *svp; 1499 struct nfs_clnt *nfscl; 1500 zoneid_t zoneid = getzoneid(); 1501 #ifdef DEBUG 1502 char *bufp; 1503 #endif 1504 1505 #if 0 /* notyet */ 1506 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 1507 "rfscall_start:which %d mi %p", which, mi); 1508 #endif 1509 1510 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 1511 ASSERT(nfscl != NULL); 1512 1513 nfscl->nfscl_stat.calls.value.ui64++; 1514 mi->mi_aclreqs[which].value.ui64++; 1515 1516 rpcerr.re_status = RPC_SUCCESS; 1517 1518 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1519 rpcerr.re_status = RPC_FAILED; 1520 rpcerr.re_errno = EIO; 1521 return (rpcerr.re_errno); 1522 } 1523 1524 #if 0 /* notyet */ 1525 /* 1526 * Remember the transfer sizes in case 1527 * nfs_feedback changes them underneath us. 1528 */ 1529 my_rsize = mi->mi_curread; 1530 my_wsize = mi->mi_curwrite; 1531 #endif 1532 1533 /* 1534 * NFS client failover support 1535 * 1536 * If this rnode is not in sync with the current server (VALID_FH), 1537 * we'd like to do a remap to get in sync. We can be interrupted 1538 * in failover_remap(), and if so we'll bail. Otherwise, we'll 1539 * use the best info we have to try the RPC. Part of that is 1540 * unconditionally updating the filehandle copy kept for V3. 1541 * 1542 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible 1543 * rw_enter(); we're trying to keep the current server from being 1544 * changed on us until we're done with the remapping and have a 1545 * matching client handle. We don't want to send a filehandle 1546 * to the wrong host. 1547 */ 1548 failoverretry: 1549 if (FAILOVER_MOUNT(mi)) { 1550 mutex_enter(&mi->mi_lock); 1551 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1552 if (failover_wait(mi)) { 1553 mutex_exit(&mi->mi_lock); 1554 return (EINTR); 1555 } 1556 } 1557 INC_READERS(mi); 1558 mutex_exit(&mi->mi_lock); 1559 if (fi) { 1560 if (!VALID_FH(fi) && 1561 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1562 int remaperr; 1563 1564 svp = mi->mi_curr_serv; 1565 remaperr = failover_remap(fi); 1566 if (remaperr != 0) { 1567 #ifdef DEBUG 1568 if (remaperr != EINTR) 1569 nfs_cmn_err(remaperr, CE_WARN, 1570 "aclcall couldn't failover: %m"); 1571 #endif 1572 mutex_enter(&mi->mi_lock); 1573 DEC_READERS(mi); 1574 mutex_exit(&mi->mi_lock); 1575 1576 /* 1577 * If failover_remap returns ETIMEDOUT 1578 * and the filesystem is hard mounted 1579 * we have to retry the call with a new 1580 * server.
1581 */ 1582 if ((mi->mi_flags & MI_HARD) && 1583 IS_RECOVERABLE_ERROR(remaperr)) { 1584 if (svp == mi->mi_curr_serv) 1585 failover_newserver(mi); 1586 rpcerr.re_status = RPC_SUCCESS; 1587 goto failoverretry; 1588 } 1589 return (remaperr); 1590 } 1591 } 1592 if (fi->fhp && fi->copyproc) 1593 (*fi->copyproc)(fi->fhp, fi->vp); 1594 } 1595 } 1596 1597 /* For TSOL, use a new cred which has net_mac_aware flag */ 1598 if (!cred_cloned && is_system_labeled()) { 1599 cred_cloned = TRUE; 1600 cr = crdup(icr); 1601 (void) setpflags(NET_MAC_AWARE, 1, cr); 1602 } 1603 1604 /* 1605 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1606 * are guaranteed to reprocess the retry as a new request. 1607 */ 1608 svp = mi->mi_curr_serv; 1609 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1610 if (FAILOVER_MOUNT(mi)) { 1611 mutex_enter(&mi->mi_lock); 1612 DEC_READERS(mi); 1613 mutex_exit(&mi->mi_lock); 1614 1615 if ((rpcerr.re_errno == ETIMEDOUT || 1616 rpcerr.re_errno == ECONNRESET) && 1617 failover_safe(fi)) { 1618 if (svp == mi->mi_curr_serv) 1619 failover_newserver(mi); 1620 goto failoverretry; 1621 } 1622 } 1623 if (rpcerr.re_errno != 0) { 1624 if (cred_cloned) 1625 crfree(cr); 1626 return (rpcerr.re_errno); 1627 } 1628 1629 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1630 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1631 timeo = (mi->mi_timeo * hz) / 10; 1632 } else { 1633 mutex_enter(&mi->mi_lock); 1634 timeo = CLNT_SETTIMERS(client, 1635 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1636 &(mi->mi_timers[NFS_CALLTYPES]), 1637 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1638 (void (*)()) 0, (caddr_t)mi, 0); 1639 mutex_exit(&mi->mi_lock); 1640 } 1641 1642 /* 1643 * If hard mounted fs, retry call forever unless hard error occurs. 1644 */ 1645 do { 1646 tryagain = FALSE; 1647 1648 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1649 status = RPC_FAILED; 1650 rpcerr.re_status = RPC_FAILED; 1651 rpcerr.re_errno = EIO; 1652 break; 1653 } 1654 1655 TICK_TO_TIMEVAL(timeo, &wait); 1656 1657 /* 1658 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1659 * and SIGTERM. (Preserving the existing masks). 1660 * Mask out SIGINT if mount option nointr is specified. 1661 */ 1662 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1663 if (!(mi->mi_flags & MI_INT)) 1664 client->cl_nosignal = TRUE; 1665 1666 /* 1667 * If there is a current signal, then don't bother 1668 * even trying to send out the request because we 1669 * won't be able to block waiting for the response. 1670 * Simply assume RPC_INTR and get on with it. 1671 */ 1672 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1673 status = RPC_INTR; 1674 else { 1675 status = CLNT_CALL(client, which, xdrargs, argsp, 1676 xdrres, resp, wait); 1677 } 1678 1679 if (!(mi->mi_flags & MI_INT)) 1680 client->cl_nosignal = FALSE; 1681 /* 1682 * restore original signal mask 1683 */ 1684 sigunintr(&smask); 1685 1686 switch (status) { 1687 case RPC_SUCCESS: 1688 #if 0 /* notyet */ 1689 if ((mi->mi_flags & MI_DYNAMIC) && 1690 mi->mi_timer_type[which] != 0 && 1691 (mi->mi_curread != my_rsize || 1692 mi->mi_curwrite != my_wsize)) 1693 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1694 #endif 1695 break; 1696 1697 /* 1698 * Unfortunately, there are servers in the world which 1699 * are not coded correctly. They are not prepared to 1700 * handle RPC requests to the NFS port which are not 1701 * NFS requests. Thus, they may try to process the 1702 * NFS_ACL request as if it were an NFS request. This 1703 * does not work. 
Generally, an error will be generated 1704 * on the client because it will not be able to decode 1705 * the response from the server. However, it seems 1706 * possible that the server may not be able to decode 1707 * the arguments. Thus, the criteria for deciding 1708 * whether the server supports NFS_ACL or not is whether 1709 * the following RPC errors are returned from CLNT_CALL. 1710 */ 1711 case RPC_CANTDECODERES: 1712 case RPC_PROGUNAVAIL: 1713 case RPC_CANTDECODEARGS: 1714 case RPC_PROGVERSMISMATCH: 1715 mutex_enter(&mi->mi_lock); 1716 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1717 mutex_exit(&mi->mi_lock); 1718 break; 1719 1720 /* 1721 * If the server supports NFS_ACL but not the new ops 1722 * for extended attributes, make sure we don't retry. 1723 */ 1724 case RPC_PROCUNAVAIL: 1725 mutex_enter(&mi->mi_lock); 1726 mi->mi_flags &= ~MI_EXTATTR; 1727 mutex_exit(&mi->mi_lock); 1728 break; 1729 1730 case RPC_INTR: 1731 /* 1732 * There is no way to recover from this error, 1733 * even if mount option nointr is specified. 1734 * SIGKILL, for example, cannot be blocked. 1735 */ 1736 rpcerr.re_status = RPC_INTR; 1737 rpcerr.re_errno = EINTR; 1738 break; 1739 1740 case RPC_UDERROR: 1741 /* 1742 * If the NFS server is local (vold) and 1743 * it goes away then we get RPC_UDERROR. 1744 * This is a retryable error, so we would 1745 * loop, so check to see if the specific 1746 * error was ECONNRESET, indicating that 1747 * target did not exist at all. If so, 1748 * return with RPC_PROGUNAVAIL and 1749 * ECONNRESET to indicate why. 1750 */ 1751 CLNT_GETERR(client, &rpcerr); 1752 if (rpcerr.re_errno == ECONNRESET) { 1753 rpcerr.re_status = RPC_PROGUNAVAIL; 1754 rpcerr.re_errno = ECONNRESET; 1755 break; 1756 } 1757 /*FALLTHROUGH*/ 1758 1759 default: /* probably RPC_TIMEDOUT */ 1760 if (IS_UNRECOVERABLE_RPC(status)) 1761 break; 1762 1763 /* 1764 * increment server not responding count 1765 */ 1766 mutex_enter(&mi->mi_lock); 1767 mi->mi_noresponse++; 1768 mutex_exit(&mi->mi_lock); 1769 #ifdef DEBUG 1770 nfscl->nfscl_stat.noresponse.value.ui64++; 1771 #endif 1772 1773 if (!(mi->mi_flags & MI_HARD)) { 1774 if (!(mi->mi_flags & MI_SEMISOFT) || 1775 (mi->mi_acl_ss_call_type[which] == 0)) 1776 break; 1777 } 1778 1779 /* 1780 * The call is in progress (over COTS). 1781 * Try the CLNT_CALL again, but don't 1782 * print a noisy error message. 1783 */ 1784 if (status == RPC_INPROGRESS) { 1785 tryagain = TRUE; 1786 break; 1787 } 1788 1789 if (flags & RFSCALL_SOFT) 1790 break; 1791 1792 /* 1793 * On zone shutdown, just move on. 1794 */ 1795 if (zone_status_get(curproc->p_zone) >= 1796 ZONE_IS_SHUTTING_DOWN) { 1797 rpcerr.re_status = RPC_FAILED; 1798 rpcerr.re_errno = EIO; 1799 break; 1800 } 1801 1802 /* 1803 * NFS client failover support 1804 * 1805 * If the current server just failed us, we'll 1806 * start the process of finding a new server. 1807 * After that, we can just retry. 
1808 */ 1809 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1810 if (svp == mi->mi_curr_serv) 1811 failover_newserver(mi); 1812 clfree_impl(client, ch, nfscl); 1813 goto failoverretry; 1814 } 1815 1816 tryagain = TRUE; 1817 timeo = backoff(timeo); 1818 mutex_enter(&mi->mi_lock); 1819 if (!(mi->mi_flags & MI_PRINTED)) { 1820 mi->mi_flags |= MI_PRINTED; 1821 mutex_exit(&mi->mi_lock); 1822 #ifdef DEBUG 1823 zprintf(zoneid, 1824 "NFS_ACL%d server %s not responding still trying\n", 1825 mi->mi_vers, svp->sv_hostname); 1826 #else 1827 zprintf(zoneid, 1828 "NFS server %s not responding still trying\n", 1829 svp->sv_hostname); 1830 #endif 1831 } else 1832 mutex_exit(&mi->mi_lock); 1833 if (*douprintf && nfs_has_ctty()) { 1834 *douprintf = 0; 1835 if (!(mi->mi_flags & MI_NOPRINT)) 1836 #ifdef DEBUG 1837 uprintf( 1838 "NFS_ACL%d server %s not responding still trying\n", 1839 mi->mi_vers, svp->sv_hostname); 1840 #else 1841 uprintf( 1842 "NFS server %s not responding still trying\n", 1843 svp->sv_hostname); 1844 #endif 1845 } 1846 1847 #if 0 /* notyet */ 1848 /* 1849 * If doing dynamic adjustment of transfer 1850 * size and if it's a read or write call 1851 * and if the transfer size changed while 1852 * retransmitting or if the feedback routine 1853 * changed the transfer size, 1854 * then exit rfscall so that the transfer 1855 * size can be adjusted at the vnops level. 1856 */ 1857 if ((mi->mi_flags & MI_DYNAMIC) && 1858 mi->mi_acl_timer_type[which] != 0 && 1859 (mi->mi_curread != my_rsize || 1860 mi->mi_curwrite != my_wsize || 1861 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1862 /* 1863 * On read or write calls, return 1864 * back to the vnode ops level if 1865 * the transfer size changed. 1866 */ 1867 clfree_impl(client, ch, nfscl); 1868 if (cred_cloned) 1869 crfree(cr); 1870 return (ENFS_TRYAGAIN); 1871 } 1872 #endif 1873 } 1874 } while (tryagain); 1875 1876 if (status != RPC_SUCCESS) { 1877 /* 1878 * Let soft mounts use the timed out message. 
1879 */ 1880 if (status == RPC_INPROGRESS) 1881 status = RPC_TIMEDOUT; 1882 nfscl->nfscl_stat.badcalls.value.ui64++; 1883 if (status == RPC_CANTDECODERES || 1884 status == RPC_PROGUNAVAIL || 1885 status == RPC_PROCUNAVAIL || 1886 status == RPC_CANTDECODEARGS || 1887 status == RPC_PROGVERSMISMATCH) 1888 CLNT_GETERR(client, &rpcerr); 1889 else if (status != RPC_INTR) { 1890 mutex_enter(&mi->mi_lock); 1891 mi->mi_flags |= MI_DOWN; 1892 mutex_exit(&mi->mi_lock); 1893 CLNT_GETERR(client, &rpcerr); 1894 #ifdef DEBUG 1895 bufp = clnt_sperror(client, svp->sv_hostname); 1896 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1897 mi->mi_vers, mi->mi_aclnames[which], bufp); 1898 if (nfs_has_ctty()) { 1899 if (!(mi->mi_flags & MI_NOPRINT)) { 1900 uprintf("NFS_ACL%d %s failed for %s\n", 1901 mi->mi_vers, mi->mi_aclnames[which], 1902 bufp); 1903 } 1904 } 1905 kmem_free(bufp, MAXPATHLEN); 1906 #else 1907 zprintf(zoneid, 1908 "NFS %s failed for server %s: error %d (%s)\n", 1909 mi->mi_aclnames[which], svp->sv_hostname, 1910 status, clnt_sperrno(status)); 1911 if (nfs_has_ctty()) { 1912 if (!(mi->mi_flags & MI_NOPRINT)) 1913 uprintf( 1914 "NFS %s failed for server %s: error %d (%s)\n", 1915 mi->mi_aclnames[which], 1916 svp->sv_hostname, status, 1917 clnt_sperrno(status)); 1918 } 1919 #endif 1920 /* 1921 * when CLNT_CALL() fails with RPC_AUTHERROR, 1922 * re_errno is set appropriately depending on 1923 * the authentication error 1924 */ 1925 if (status == RPC_VERSMISMATCH || 1926 status == RPC_PROGVERSMISMATCH) 1927 rpcerr.re_errno = EIO; 1928 } 1929 } else { 1930 /* 1931 * Test the value of mi_down and mi_printed without 1932 * holding the mi_lock mutex. If they are both zero, 1933 * then it is okay to skip the down and printed 1934 * processing. This saves on a mutex_enter and 1935 * mutex_exit pair for a normal, successful RPC. 1936 * This was just complete overhead. 
1937 */ 1938 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1939 mutex_enter(&mi->mi_lock); 1940 mi->mi_flags &= ~MI_DOWN; 1941 if (mi->mi_flags & MI_PRINTED) { 1942 mi->mi_flags &= ~MI_PRINTED; 1943 mutex_exit(&mi->mi_lock); 1944 #ifdef DEBUG 1945 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1946 mi->mi_vers, svp->sv_hostname); 1947 #else 1948 zprintf(zoneid, "NFS server %s ok\n", 1949 svp->sv_hostname); 1950 #endif 1951 } else 1952 mutex_exit(&mi->mi_lock); 1953 } 1954 1955 if (*douprintf == 0) { 1956 if (!(mi->mi_flags & MI_NOPRINT)) 1957 #ifdef DEBUG 1958 uprintf("NFS_ACL%d server %s ok\n", 1959 mi->mi_vers, svp->sv_hostname); 1960 #else 1961 uprintf("NFS server %s ok\n", svp->sv_hostname); 1962 #endif 1963 *douprintf = 1; 1964 } 1965 } 1966 1967 clfree_impl(client, ch, nfscl); 1968 if (cred_cloned) 1969 crfree(cr); 1970 1971 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1972 1973 #if 0 /* notyet */ 1974 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1975 rpcerr.re_errno); 1976 #endif 1977 1978 return (rpcerr.re_errno); 1979 } 1980 1981 int 1982 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1983 { 1984 uint_t mask = vap->va_mask; 1985 1986 if (!(mask & AT_MODE)) 1987 sa->sa_mode = (uint32_t)-1; 1988 else 1989 sa->sa_mode = vap->va_mode; 1990 if (!(mask & AT_UID)) 1991 sa->sa_uid = (uint32_t)-1; 1992 else 1993 sa->sa_uid = (uint32_t)vap->va_uid; 1994 if (!(mask & AT_GID)) 1995 sa->sa_gid = (uint32_t)-1; 1996 else 1997 sa->sa_gid = (uint32_t)vap->va_gid; 1998 if (!(mask & AT_SIZE)) 1999 sa->sa_size = (uint32_t)-1; 2000 else 2001 sa->sa_size = (uint32_t)vap->va_size; 2002 if (!(mask & AT_ATIME)) 2003 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 2004 else { 2005 /* check time validity */ 2006 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2007 return (EOVERFLOW); 2008 } 2009 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 2010 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2011 } 2012 if (!(mask & AT_MTIME)) 2013 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 2014 else { 2015 /* check time validity */ 2016 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2017 return (EOVERFLOW); 2018 } 2019 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 2020 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2021 } 2022 return (0); 2023 } 2024 2025 int 2026 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 2027 { 2028 uint_t mask = vap->va_mask; 2029 2030 if (!(mask & AT_MODE)) 2031 sa->mode.set_it = FALSE; 2032 else { 2033 sa->mode.set_it = TRUE; 2034 sa->mode.mode = (mode3)vap->va_mode; 2035 } 2036 if (!(mask & AT_UID)) 2037 sa->uid.set_it = FALSE; 2038 else { 2039 sa->uid.set_it = TRUE; 2040 sa->uid.uid = (uid3)vap->va_uid; 2041 } 2042 if (!(mask & AT_GID)) 2043 sa->gid.set_it = FALSE; 2044 else { 2045 sa->gid.set_it = TRUE; 2046 sa->gid.gid = (gid3)vap->va_gid; 2047 } 2048 if (!(mask & AT_SIZE)) 2049 sa->size.set_it = FALSE; 2050 else { 2051 sa->size.set_it = TRUE; 2052 sa->size.size = (size3)vap->va_size; 2053 } 2054 if (!(mask & AT_ATIME)) 2055 sa->atime.set_it = DONT_CHANGE; 2056 else { 2057 /* check time validity */ 2058 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2059 return (EOVERFLOW); 2060 } 2061 sa->atime.set_it = SET_TO_CLIENT_TIME; 2062 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2063 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2064 } 2065 if (!(mask & AT_MTIME)) 2066 sa->mtime.set_it = DONT_CHANGE; 2067 else { 2068 /* check time validity */ 2069 if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2070 return (EOVERFLOW); 2071 } 2072 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2073 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2074 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2075 } 2076 return (0); 2077 } 2078 2079 void 2080 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2081 { 2082 2083 da->da_fhandle = VTOFH(dvp); 2084 da->da_name = nm; 2085 da->da_flags = 0; 2086 } 2087 2088 void 2089 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2090 { 2091 2092 da->dirp = VTOFH3(dvp); 2093 da->name = nm; 2094 } 2095 2096 int 2097 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2098 { 2099 int error; 2100 rnode_t *rp; 2101 struct vattr va; 2102 2103 va.va_mask = AT_MODE | AT_GID; 2104 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2105 if (error) 2106 return (error); 2107 2108 /* 2109 * To determine the expected group-id of the created file: 2110 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2111 * GRPID option, and the directory's set-gid bit is clear, 2112 * then use the process's gid. 2113 * 2) Otherwise, set the group-id to the gid of the parent directory. 2114 */ 2115 rp = VTOR(dvp); 2116 mutex_enter(&rp->r_statelock); 2117 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2118 *gidp = crgetgid(cr); 2119 else 2120 *gidp = va.va_gid; 2121 mutex_exit(&rp->r_statelock); 2122 return (0); 2123 } 2124 2125 int 2126 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2127 { 2128 int error; 2129 struct vattr va; 2130 2131 va.va_mask = AT_MODE; 2132 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2133 if (error) 2134 return (error); 2135 2136 /* 2137 * Modify the expected mode (om) so that the set-gid bit matches 2138 * that of the parent directory (dvp). 2139 */ 2140 if (va.va_mode & VSGID) 2141 *omp |= VSGID; 2142 else 2143 *omp &= ~VSGID; 2144 return (0); 2145 } 2146 2147 void 2148 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2149 { 2150 2151 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2152 if (!(vp->v_flag & VSWAPLIKE)) { 2153 mutex_enter(&vp->v_lock); 2154 vp->v_flag |= VSWAPLIKE; 2155 mutex_exit(&vp->v_lock); 2156 } 2157 } else { 2158 if (vp->v_flag & VSWAPLIKE) { 2159 mutex_enter(&vp->v_lock); 2160 vp->v_flag &= ~VSWAPLIKE; 2161 mutex_exit(&vp->v_lock); 2162 } 2163 } 2164 } 2165 2166 /* 2167 * Free the resources associated with an rnode. 2168 */ 2169 static void 2170 rinactive(rnode_t *rp, cred_t *cr) 2171 { 2172 vnode_t *vp; 2173 cred_t *cred; 2174 char *contents; 2175 int size; 2176 vsecattr_t *vsp; 2177 int error; 2178 nfs3_pathconf_info *info; 2179 2180 /* 2181 * Before freeing anything, wait until all asynchronous 2182 * activity is done on this rnode. This will allow all 2183 * asynchronous read ahead and write behind i/o's to 2184 * finish. 2185 */ 2186 mutex_enter(&rp->r_statelock); 2187 while (rp->r_count > 0) 2188 cv_wait(&rp->r_cv, &rp->r_statelock); 2189 mutex_exit(&rp->r_statelock); 2190 2191 /* 2192 * Flush and invalidate all pages associated with the vnode. 
2193 */ 2194 vp = RTOV(rp); 2195 if (vn_has_cached_data(vp)) { 2196 ASSERT(vp->v_type != VCHR); 2197 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2198 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 2199 if (error && (error == ENOSPC || error == EDQUOT)) { 2200 mutex_enter(&rp->r_statelock); 2201 if (!rp->r_error) 2202 rp->r_error = error; 2203 mutex_exit(&rp->r_statelock); 2204 } 2205 } 2206 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2207 } 2208 2209 /* 2210 * Free any held credentials and caches which may be associated 2211 * with this rnode. 2212 */ 2213 mutex_enter(&rp->r_statelock); 2214 cred = rp->r_cred; 2215 rp->r_cred = NULL; 2216 contents = rp->r_symlink.contents; 2217 size = rp->r_symlink.size; 2218 rp->r_symlink.contents = NULL; 2219 vsp = rp->r_secattr; 2220 rp->r_secattr = NULL; 2221 info = rp->r_pathconf; 2222 rp->r_pathconf = NULL; 2223 mutex_exit(&rp->r_statelock); 2224 2225 /* 2226 * Free the held credential. 2227 */ 2228 if (cred != NULL) 2229 crfree(cred); 2230 2231 /* 2232 * Free the access cache entries. 2233 */ 2234 (void) nfs_access_purge_rp(rp); 2235 2236 /* 2237 * Free the readdir cache entries. 2238 */ 2239 if (HAVE_RDDIR_CACHE(rp)) 2240 nfs_purge_rddir_cache(vp); 2241 2242 /* 2243 * Free the symbolic link cache. 2244 */ 2245 if (contents != NULL) { 2246 2247 kmem_free((void *)contents, size); 2248 } 2249 2250 /* 2251 * Free any cached ACL. 2252 */ 2253 if (vsp != NULL) 2254 nfs_acl_free(vsp); 2255 2256 /* 2257 * Free any cached pathconf information. 2258 */ 2259 if (info != NULL) 2260 kmem_free(info, sizeof (*info)); 2261 } 2262 2263 /* 2264 * Return a vnode for the given NFS Version 2 file handle. 2265 * If no rnode exists for this fhandle, create one and put it 2266 * into the hash queues. If the rnode for this fhandle 2267 * already exists, return it. 2268 * 2269 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2270 */ 2271 vnode_t * 2272 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2273 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2274 { 2275 int newnode; 2276 int index; 2277 vnode_t *vp; 2278 nfs_fhandle nfh; 2279 vattr_t va; 2280 2281 nfh.fh_len = NFS_FHSIZE; 2282 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2283 2284 index = rtablehash(&nfh); 2285 rw_enter(&rtable[index].r_lock, RW_READER); 2286 2287 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2288 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2289 2290 if (attr != NULL) { 2291 if (!newnode) { 2292 rw_exit(&rtable[index].r_lock); 2293 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2294 } else { 2295 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2296 vp->v_type = VBAD; 2297 else 2298 vp->v_type = n2v_type(attr); 2299 /* 2300 * A translation here seems to be necessary 2301 * because this function can be called 2302 * with `attr' that has come from the wire, 2303 * and been operated on by vattr_to_nattr(). 2304 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2305 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2306 * ->makenfsnode(). 2307 */ 2308 if ((attr->na_rdev & 0xffff0000) == 0) 2309 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2310 else 2311 vp->v_rdev = expldev(n2v_rdev(attr)); 2312 nfs_attrcache(vp, attr, t); 2313 rw_exit(&rtable[index].r_lock); 2314 } 2315 } else { 2316 if (newnode) { 2317 PURGE_ATTRCACHE(vp); 2318 } 2319 rw_exit(&rtable[index].r_lock); 2320 } 2321 2322 return (vp); 2323 } 2324 2325 /* 2326 * Return a vnode for the given NFS Version 3 file handle. 
2327 * If no rnode exists for this fhandle, create one and put it 2328 * into the hash queues. If the rnode for this fhandle 2329 * already exists, return it. 2330 * 2331 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2332 */ 2333 vnode_t * 2334 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2335 cred_t *cr, char *dnm, char *nm) 2336 { 2337 int newnode; 2338 int index; 2339 vnode_t *vp; 2340 2341 index = rtablehash((nfs_fhandle *)fh); 2342 rw_enter(&rtable[index].r_lock, RW_READER); 2343 2344 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2345 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2346 dnm, nm); 2347 2348 if (vap == NULL) { 2349 if (newnode) { 2350 PURGE_ATTRCACHE(vp); 2351 } 2352 rw_exit(&rtable[index].r_lock); 2353 return (vp); 2354 } 2355 2356 if (!newnode) { 2357 rw_exit(&rtable[index].r_lock); 2358 nfs_attr_cache(vp, vap, t, cr); 2359 } else { 2360 rnode_t *rp = VTOR(vp); 2361 2362 vp->v_type = vap->va_type; 2363 vp->v_rdev = vap->va_rdev; 2364 2365 mutex_enter(&rp->r_statelock); 2366 if (rp->r_mtime <= t) 2367 nfs_attrcache_va(vp, vap); 2368 mutex_exit(&rp->r_statelock); 2369 rw_exit(&rtable[index].r_lock); 2370 } 2371 2372 return (vp); 2373 } 2374 2375 vnode_t * 2376 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2377 cred_t *cr, char *dnm, char *nm) 2378 { 2379 int newnode; 2380 int index; 2381 vnode_t *vp; 2382 vattr_t va; 2383 2384 index = rtablehash((nfs_fhandle *)fh); 2385 rw_enter(&rtable[index].r_lock, RW_READER); 2386 2387 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2388 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2389 dnm, nm); 2390 2391 if (attr == NULL) { 2392 if (newnode) { 2393 PURGE_ATTRCACHE(vp); 2394 } 2395 rw_exit(&rtable[index].r_lock); 2396 return (vp); 2397 } 2398 2399 if (!newnode) { 2400 rw_exit(&rtable[index].r_lock); 2401 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2402 } else { 2403 if (attr->type < NF3REG || attr->type > NF3FIFO) 2404 vp->v_type = VBAD; 2405 else 2406 vp->v_type = nf3_to_vt[attr->type]; 2407 vp->v_rdev = makedevice(attr->rdev.specdata1, 2408 attr->rdev.specdata2); 2409 nfs3_attrcache(vp, attr, t); 2410 rw_exit(&rtable[index].r_lock); 2411 } 2412 2413 return (vp); 2414 } 2415 2416 /* 2417 * Read this comment before making changes to rtablehash()! 2418 * This is a hash function in which seemingly obvious and harmless 2419 * changes can cause escalations costing million dollars! 2420 * Know what you are doing. 2421 * 2422 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2423 * algorithm is currently detailed here: 2424 * 2425 * http://burtleburtle.net/bob/hash/doobs.html 2426 * 2427 * Of course, the above link may not be valid by the time you are reading 2428 * this, but suffice it to say that the one-at-a-time algorithm works well in 2429 * almost all cases. If you are changing the algorithm be sure to verify that 2430 * the hash algorithm still provides even distribution in all cases and with 2431 * any server returning filehandles in whatever order (sequential or random). 
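 *
 * As a sketch of how to do that verification (the harness below is
 * hypothetical; it only assumes a Solaris-style <sys/types.h> for
 * ulong_t), the loop can be lifted into a small user-level program:
 *
 *	static ulong_t
 *	oaat_bucket(const char *key, ulong_t len, ulong_t mask)
 *	{
 *		ulong_t hash, i;
 *
 *		for (hash = 0, i = 0; i < len; i++) {
 *			hash += key[i];
 *			hash += (hash << 10);
 *			hash ^= (hash >> 6);
 *		}
 *		hash += (hash << 3);
 *		hash ^= (hash >> 11);
 *		hash += (hash << 15);
 *		return (hash & mask);
 *	}
 *
 * Feed it a large set of filehandles, both sequential and random,
 * count hits per bucket with the table mask you intend to use, and
 * compare the spread against the current code before committing to
 * a change.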
2432 */ 2433 static int 2434 rtablehash(nfs_fhandle *fh) 2435 { 2436 ulong_t hash, len, i; 2437 char *key; 2438 2439 key = fh->fh_buf; 2440 len = (ulong_t)fh->fh_len; 2441 for (hash = 0, i = 0; i < len; i++) { 2442 hash += key[i]; 2443 hash += (hash << 10); 2444 hash ^= (hash >> 6); 2445 } 2446 hash += (hash << 3); 2447 hash ^= (hash >> 11); 2448 hash += (hash << 15); 2449 return (hash & rtablemask); 2450 } 2451 2452 static vnode_t * 2453 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2454 struct vnodeops *vops, 2455 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2456 int (*compar)(const void *, const void *), 2457 int *newnode, cred_t *cr, char *dnm, char *nm) 2458 { 2459 rnode_t *rp; 2460 rnode_t *trp; 2461 vnode_t *vp; 2462 mntinfo_t *mi; 2463 2464 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2465 2466 mi = VFTOMI(vfsp); 2467 start: 2468 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2469 vp = RTOV(rp); 2470 nfs_set_vroot(vp); 2471 *newnode = 0; 2472 return (vp); 2473 } 2474 rw_exit(&rhtp->r_lock); 2475 2476 mutex_enter(&rpfreelist_lock); 2477 if (rpfreelist != NULL && rnew >= nrnode) { 2478 rp = rpfreelist; 2479 rp_rmfree(rp); 2480 mutex_exit(&rpfreelist_lock); 2481 2482 vp = RTOV(rp); 2483 2484 if (rp->r_flags & RHASHED) { 2485 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2486 mutex_enter(&vp->v_lock); 2487 if (vp->v_count > 1) { 2488 vp->v_count--; 2489 mutex_exit(&vp->v_lock); 2490 rw_exit(&rp->r_hashq->r_lock); 2491 rw_enter(&rhtp->r_lock, RW_READER); 2492 goto start; 2493 } 2494 mutex_exit(&vp->v_lock); 2495 rp_rmhash_locked(rp); 2496 rw_exit(&rp->r_hashq->r_lock); 2497 } 2498 2499 rinactive(rp, cr); 2500 2501 mutex_enter(&vp->v_lock); 2502 if (vp->v_count > 1) { 2503 vp->v_count--; 2504 mutex_exit(&vp->v_lock); 2505 rw_enter(&rhtp->r_lock, RW_READER); 2506 goto start; 2507 } 2508 mutex_exit(&vp->v_lock); 2509 vn_invalid(vp); 2510 /* 2511 * destroy old locks before bzero'ing and 2512 * recreating the locks below. 2513 */ 2514 nfs_rw_destroy(&rp->r_rwlock); 2515 nfs_rw_destroy(&rp->r_lkserlock); 2516 mutex_destroy(&rp->r_statelock); 2517 cv_destroy(&rp->r_cv); 2518 cv_destroy(&rp->r_commit.c_cv); 2519 nfs_free_r_path(rp); 2520 avl_destroy(&rp->r_dir); 2521 /* 2522 * Make sure that if rnode is recycled then 2523 * VFS count is decremented properly before 2524 * reuse. 
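 *
 * (The VFS_RELE() below drops the hold taken on the old filesystem's
 * vfs when this rnode was last set up; the VFS_HOLD(vfsp) further
 * down in this function takes a fresh hold for the filesystem the
 * recycled rnode is being attached to, so the holds stay balanced
 * across reuse.)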
2525 */ 2526 VFS_RELE(vp->v_vfsp); 2527 vn_reinit(vp); 2528 } else { 2529 vnode_t *new_vp; 2530 2531 mutex_exit(&rpfreelist_lock); 2532 2533 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2534 new_vp = vn_alloc(KM_SLEEP); 2535 2536 atomic_add_long((ulong_t *)&rnew, 1); 2537 #ifdef DEBUG 2538 clstat_debug.nrnode.value.ui64++; 2539 #endif 2540 vp = new_vp; 2541 } 2542 2543 bzero(rp, sizeof (*rp)); 2544 rp->r_vnode = vp; 2545 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2546 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2547 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2548 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2549 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2550 rp->r_fh.fh_len = fh->fh_len; 2551 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2552 rp->r_server = mi->mi_curr_serv; 2553 if (FAILOVER_MOUNT(mi)) { 2554 /* 2555 * If replicated servers, stash pathnames 2556 */ 2557 if (dnm != NULL && nm != NULL) { 2558 char *s, *p; 2559 uint_t len; 2560 2561 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2562 rp->r_path = kmem_alloc(len, KM_SLEEP); 2563 #ifdef DEBUG 2564 clstat_debug.rpath.value.ui64 += len; 2565 #endif 2566 s = rp->r_path; 2567 for (p = dnm; *p; p++) 2568 *s++ = *p; 2569 *s++ = '/'; 2570 for (p = nm; *p; p++) 2571 *s++ = *p; 2572 *s = '\0'; 2573 } else { 2574 /* special case for root */ 2575 rp->r_path = kmem_alloc(2, KM_SLEEP); 2576 #ifdef DEBUG 2577 clstat_debug.rpath.value.ui64 += 2; 2578 #endif 2579 *rp->r_path = '.'; 2580 *(rp->r_path + 1) = '\0'; 2581 } 2582 } 2583 VFS_HOLD(vfsp); 2584 rp->r_putapage = putapage; 2585 rp->r_hashq = rhtp; 2586 rp->r_flags = RREADDIRPLUS; 2587 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2588 offsetof(rddir_cache, tree)); 2589 vn_setops(vp, vops); 2590 vp->v_data = (caddr_t)rp; 2591 vp->v_vfsp = vfsp; 2592 vp->v_type = VNON; 2593 vp->v_flag |= VMODSORT; 2594 nfs_set_vroot(vp); 2595 2596 /* 2597 * There is a race condition if someone else 2598 * alloc's the rnode while no locks are held, so we 2599 * check again and recover if found. 2600 */ 2601 rw_enter(&rhtp->r_lock, RW_WRITER); 2602 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2603 vp = RTOV(trp); 2604 nfs_set_vroot(vp); 2605 *newnode = 0; 2606 rw_exit(&rhtp->r_lock); 2607 rp_addfree(rp, cr); 2608 rw_enter(&rhtp->r_lock, RW_READER); 2609 return (vp); 2610 } 2611 rp_addhash(rp); 2612 *newnode = 1; 2613 return (vp); 2614 } 2615 2616 /* 2617 * Callback function to check if the page should be marked as 2618 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT. 2619 */ 2620 int 2621 nfs_setmod_check(page_t *pp) 2622 { 2623 if (pp->p_fsdata != C_NOCOMMIT) { 2624 pp->p_fsdata = C_NOCOMMIT; 2625 return (1); 2626 } 2627 return (0); 2628 } 2629 2630 static void 2631 nfs_set_vroot(vnode_t *vp) 2632 { 2633 rnode_t *rp; 2634 nfs_fhandle *rootfh; 2635 2636 rp = VTOR(vp); 2637 rootfh = &rp->r_server->sv_fhandle; 2638 if (rootfh->fh_len == rp->r_fh.fh_len && 2639 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2640 if (!(vp->v_flag & VROOT)) { 2641 mutex_enter(&vp->v_lock); 2642 vp->v_flag |= VROOT; 2643 mutex_exit(&vp->v_lock); 2644 } 2645 } 2646 } 2647 2648 static void 2649 nfs_free_r_path(rnode_t *rp) 2650 { 2651 char *path; 2652 size_t len; 2653 2654 path = rp->r_path; 2655 if (path) { 2656 rp->r_path = NULL; 2657 len = strlen(path) + 1; 2658 kmem_free(path, len); 2659 #ifdef DEBUG 2660 clstat_debug.rpath.value.ui64 -= len; 2661 #endif 2662 } 2663 } 2664 2665 /* 2666 * Put an rnode on the free list. 
2667  *
2668  * Rnodes which were allocated above and beyond the normal limit
2669  * are immediately freed.
2670  */
2671 void
2672 rp_addfree(rnode_t *rp, cred_t *cr)
2673 {
2674 	vnode_t *vp;
2675 	struct vfs *vfsp;
2676 
2677 	vp = RTOV(rp);
2678 	ASSERT(vp->v_count >= 1);
2679 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2680 
2681 	/*
2682 	 * If we have too many rnodes allocated and there are no
2683 	 * references to this rnode, or if the rnode is no longer
2684 	 * accessible because it does not reside in the hash queues,
2685 	 * or if an i/o error occurred while writing to the file,
2686 	 * then just free it instead of putting it on the rnode
2687 	 * freelist.
2688 	 */
2689 	vfsp = vp->v_vfsp;
2690 	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2691 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2692 		if (rp->r_flags & RHASHED) {
2693 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2694 			mutex_enter(&vp->v_lock);
2695 			if (vp->v_count > 1) {
2696 				vp->v_count--;
2697 				mutex_exit(&vp->v_lock);
2698 				rw_exit(&rp->r_hashq->r_lock);
2699 				return;
2700 			}
2701 			mutex_exit(&vp->v_lock);
2702 			rp_rmhash_locked(rp);
2703 			rw_exit(&rp->r_hashq->r_lock);
2704 		}
2705 
2706 		rinactive(rp, cr);
2707 
2708 		/*
2709 		 * Recheck the vnode reference count. We need to
2710 		 * make sure that another reference has not been
2711 		 * acquired while we were not holding v_lock. The
2712 		 * rnode is not in the rnode hash queues, so the
2713 		 * only way for a reference to have been acquired
2714 		 * is for a VOP_PUTPAGE because the rnode was marked
2715 		 * with RDIRTY or for a modified page. This
2716 		 * reference may have been acquired before our call
2717 		 * to rinactive. The i/o may have been completed,
2718 		 * thus allowing rinactive to complete, but the
2719 		 * reference to the vnode may not have been released
2720 		 * yet. In any case, the rnode can not be destroyed
2721 		 * until the other references to this vnode have been
2722 		 * released. The other references will take care of
2723 		 * either destroying the rnode or placing it on the
2724 		 * rnode freelist. If there are no other references,
2725 		 * then the rnode may be safely destroyed.
2726 		 */
2727 		mutex_enter(&vp->v_lock);
2728 		if (vp->v_count > 1) {
2729 			vp->v_count--;
2730 			mutex_exit(&vp->v_lock);
2731 			return;
2732 		}
2733 		mutex_exit(&vp->v_lock);
2734 
2735 		destroy_rnode(rp);
2736 		return;
2737 	}
2738 
2739 	/*
2740 	 * Lock the hash queue and then recheck the reference count
2741 	 * to ensure that no other threads have acquired a reference
2742 	 * to indicate that the rnode should not be placed on the
2743 	 * freelist. If another reference has been acquired, then
2744 	 * just release this one and let the other thread complete
2745 	 * the processing of adding this rnode to the freelist.
2746 	 */
2747 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2748 
2749 	mutex_enter(&vp->v_lock);
2750 	if (vp->v_count > 1) {
2751 		vp->v_count--;
2752 		mutex_exit(&vp->v_lock);
2753 		rw_exit(&rp->r_hashq->r_lock);
2754 		return;
2755 	}
2756 	mutex_exit(&vp->v_lock);
2757 
2758 	/*
2759 	 * If there is no cached data or metadata for this file, then
2760 	 * put the rnode on the front of the freelist so that it will
2761 	 * be reused before other rnodes which may have cached data or
2762 	 * metadata associated with them.
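	 *
	 * The freelist is circular and rpfreelist points at the rnode
	 * that will be reclaimed first. The code below always links
	 * the new entry in just ahead of the current head, i.e. at
	 * the tail:
	 *
	 *	rp->r_freef = rpfreelist;
	 *	rp->r_freeb = rpfreelist->r_freeb;
	 *	rpfreelist->r_freeb->r_freef = rp;
	 *	rpfreelist->r_freeb = rp;
	 *
	 * and then, when the rnode carries no cached state, simply
	 * repoints rpfreelist at rp, which turns that same link-in
	 * into a "front of the list" insertion.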
2763 */ 2764 mutex_enter(&rpfreelist_lock); 2765 if (rpfreelist == NULL) { 2766 rp->r_freef = rp; 2767 rp->r_freeb = rp; 2768 rpfreelist = rp; 2769 } else { 2770 rp->r_freef = rpfreelist; 2771 rp->r_freeb = rpfreelist->r_freeb; 2772 rpfreelist->r_freeb->r_freef = rp; 2773 rpfreelist->r_freeb = rp; 2774 if (!vn_has_cached_data(vp) && 2775 !HAVE_RDDIR_CACHE(rp) && 2776 rp->r_symlink.contents == NULL && 2777 rp->r_secattr == NULL && 2778 rp->r_pathconf == NULL) 2779 rpfreelist = rp; 2780 } 2781 mutex_exit(&rpfreelist_lock); 2782 2783 rw_exit(&rp->r_hashq->r_lock); 2784 } 2785 2786 /* 2787 * Remove an rnode from the free list. 2788 * 2789 * The caller must be holding rpfreelist_lock and the rnode 2790 * must be on the freelist. 2791 */ 2792 static void 2793 rp_rmfree(rnode_t *rp) 2794 { 2795 2796 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2797 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2798 2799 if (rp == rpfreelist) { 2800 rpfreelist = rp->r_freef; 2801 if (rp == rpfreelist) 2802 rpfreelist = NULL; 2803 } 2804 2805 rp->r_freeb->r_freef = rp->r_freef; 2806 rp->r_freef->r_freeb = rp->r_freeb; 2807 2808 rp->r_freef = rp->r_freeb = NULL; 2809 } 2810 2811 /* 2812 * Put a rnode in the hash table. 2813 * 2814 * The caller must be holding the exclusive hash queue lock. 2815 */ 2816 static void 2817 rp_addhash(rnode_t *rp) 2818 { 2819 2820 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2821 ASSERT(!(rp->r_flags & RHASHED)); 2822 2823 rp->r_hashf = rp->r_hashq->r_hashf; 2824 rp->r_hashq->r_hashf = rp; 2825 rp->r_hashb = (rnode_t *)rp->r_hashq; 2826 rp->r_hashf->r_hashb = rp; 2827 2828 mutex_enter(&rp->r_statelock); 2829 rp->r_flags |= RHASHED; 2830 mutex_exit(&rp->r_statelock); 2831 } 2832 2833 /* 2834 * Remove a rnode from the hash table. 2835 * 2836 * The caller must be holding the hash queue lock. 2837 */ 2838 static void 2839 rp_rmhash_locked(rnode_t *rp) 2840 { 2841 2842 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2843 ASSERT(rp->r_flags & RHASHED); 2844 2845 rp->r_hashb->r_hashf = rp->r_hashf; 2846 rp->r_hashf->r_hashb = rp->r_hashb; 2847 2848 mutex_enter(&rp->r_statelock); 2849 rp->r_flags &= ~RHASHED; 2850 mutex_exit(&rp->r_statelock); 2851 } 2852 2853 /* 2854 * Remove a rnode from the hash table. 2855 * 2856 * The caller must not be holding the hash queue lock. 2857 */ 2858 void 2859 rp_rmhash(rnode_t *rp) 2860 { 2861 2862 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2863 rp_rmhash_locked(rp); 2864 rw_exit(&rp->r_hashq->r_lock); 2865 } 2866 2867 /* 2868 * Lookup a rnode by fhandle. 2869 * 2870 * The caller must be holding the hash queue lock, either shared or exclusive. 2871 */ 2872 static rnode_t * 2873 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2874 { 2875 rnode_t *rp; 2876 vnode_t *vp; 2877 2878 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2879 2880 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2881 vp = RTOV(rp); 2882 if (vp->v_vfsp == vfsp && 2883 rp->r_fh.fh_len == fh->fh_len && 2884 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2885 /* 2886 * remove rnode from free list, if necessary. 2887 */ 2888 if (rp->r_freef != NULL) { 2889 mutex_enter(&rpfreelist_lock); 2890 /* 2891 * If the rnode is on the freelist, 2892 * then remove it and use that reference 2893 * as the new reference. Otherwise, 2894 * need to increment the reference count. 
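 *
 * r_freef was already tested once above without the freelist mutex
 * as a cheap filter. It is tested again here under rpfreelist_lock
 * because the hash queue lock may only be held as a reader, so
 * another thread can race us and pull the rnode off the freelist
 * first; only the locked test is authoritative.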
2895 */ 2896 if (rp->r_freef != NULL) { 2897 rp_rmfree(rp); 2898 mutex_exit(&rpfreelist_lock); 2899 } else { 2900 mutex_exit(&rpfreelist_lock); 2901 VN_HOLD(vp); 2902 } 2903 } else 2904 VN_HOLD(vp); 2905 return (rp); 2906 } 2907 } 2908 return (NULL); 2909 } 2910 2911 /* 2912 * Return 1 if there is a active vnode belonging to this vfs in the 2913 * rtable cache. 2914 * 2915 * Several of these checks are done without holding the usual 2916 * locks. This is safe because destroy_rtable(), rp_addfree(), 2917 * etc. will redo the necessary checks before actually destroying 2918 * any rnodes. 2919 */ 2920 int 2921 check_rtable(struct vfs *vfsp) 2922 { 2923 int index; 2924 rnode_t *rp; 2925 vnode_t *vp; 2926 2927 for (index = 0; index < rtablesize; index++) { 2928 rw_enter(&rtable[index].r_lock, RW_READER); 2929 for (rp = rtable[index].r_hashf; 2930 rp != (rnode_t *)(&rtable[index]); 2931 rp = rp->r_hashf) { 2932 vp = RTOV(rp); 2933 if (vp->v_vfsp == vfsp) { 2934 if (rp->r_freef == NULL || 2935 (vn_has_cached_data(vp) && 2936 (rp->r_flags & RDIRTY)) || 2937 rp->r_count > 0) { 2938 rw_exit(&rtable[index].r_lock); 2939 return (1); 2940 } 2941 } 2942 } 2943 rw_exit(&rtable[index].r_lock); 2944 } 2945 return (0); 2946 } 2947 2948 /* 2949 * Destroy inactive vnodes from the hash queues which belong to this 2950 * vfs. It is essential that we destroy all inactive vnodes during a 2951 * forced unmount as well as during a normal unmount. 2952 */ 2953 void 2954 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2955 { 2956 int index; 2957 rnode_t *rp; 2958 rnode_t *rlist; 2959 rnode_t *r_hashf; 2960 vnode_t *vp; 2961 2962 rlist = NULL; 2963 2964 for (index = 0; index < rtablesize; index++) { 2965 rw_enter(&rtable[index].r_lock, RW_WRITER); 2966 for (rp = rtable[index].r_hashf; 2967 rp != (rnode_t *)(&rtable[index]); 2968 rp = r_hashf) { 2969 /* save the hash pointer before destroying */ 2970 r_hashf = rp->r_hashf; 2971 vp = RTOV(rp); 2972 if (vp->v_vfsp == vfsp) { 2973 mutex_enter(&rpfreelist_lock); 2974 if (rp->r_freef != NULL) { 2975 rp_rmfree(rp); 2976 mutex_exit(&rpfreelist_lock); 2977 rp_rmhash_locked(rp); 2978 rp->r_hashf = rlist; 2979 rlist = rp; 2980 } else 2981 mutex_exit(&rpfreelist_lock); 2982 } 2983 } 2984 rw_exit(&rtable[index].r_lock); 2985 } 2986 2987 for (rp = rlist; rp != NULL; rp = rlist) { 2988 rlist = rp->r_hashf; 2989 /* 2990 * This call to rp_addfree will end up destroying the 2991 * rnode, but in a safe way with the appropriate set 2992 * of checks done. 2993 */ 2994 rp_addfree(rp, cr); 2995 } 2996 2997 } 2998 2999 /* 3000 * This routine destroys all the resources associated with the rnode 3001 * and then the rnode itself. 
3002 */ 3003 static void 3004 destroy_rnode(rnode_t *rp) 3005 { 3006 vnode_t *vp; 3007 vfs_t *vfsp; 3008 3009 vp = RTOV(rp); 3010 vfsp = vp->v_vfsp; 3011 3012 ASSERT(vp->v_count == 1); 3013 ASSERT(rp->r_count == 0); 3014 ASSERT(rp->r_lmpl == NULL); 3015 ASSERT(rp->r_mapcnt == 0); 3016 ASSERT(!(rp->r_flags & RHASHED)); 3017 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 3018 atomic_add_long((ulong_t *)&rnew, -1); 3019 #ifdef DEBUG 3020 clstat_debug.nrnode.value.ui64--; 3021 #endif 3022 nfs_rw_destroy(&rp->r_rwlock); 3023 nfs_rw_destroy(&rp->r_lkserlock); 3024 mutex_destroy(&rp->r_statelock); 3025 cv_destroy(&rp->r_cv); 3026 cv_destroy(&rp->r_commit.c_cv); 3027 if (rp->r_flags & RDELMAPLIST) 3028 list_destroy(&rp->r_indelmap); 3029 nfs_free_r_path(rp); 3030 avl_destroy(&rp->r_dir); 3031 vn_invalid(vp); 3032 vn_free(vp); 3033 kmem_cache_free(rnode_cache, rp); 3034 VFS_RELE(vfsp); 3035 } 3036 3037 /* 3038 * Flush all vnodes in this (or every) vfs. 3039 * Used by nfs_sync and by nfs_unmount. 3040 */ 3041 void 3042 rflush(struct vfs *vfsp, cred_t *cr) 3043 { 3044 int index; 3045 rnode_t *rp; 3046 vnode_t *vp, **vplist; 3047 long num, cnt; 3048 3049 /* 3050 * Check to see whether there is anything to do. 3051 */ 3052 num = rnew; 3053 if (num == 0) 3054 return; 3055 3056 /* 3057 * Allocate a slot for all currently active rnodes on the 3058 * supposition that they all may need flushing. 3059 */ 3060 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 3061 cnt = 0; 3062 3063 /* 3064 * Walk the hash queues looking for rnodes with page 3065 * lists associated with them. Make a list of these 3066 * files. 3067 */ 3068 for (index = 0; index < rtablesize; index++) { 3069 rw_enter(&rtable[index].r_lock, RW_READER); 3070 for (rp = rtable[index].r_hashf; 3071 rp != (rnode_t *)(&rtable[index]); 3072 rp = rp->r_hashf) { 3073 vp = RTOV(rp); 3074 /* 3075 * Don't bother sync'ing a vp if it 3076 * is part of virtual swap device or 3077 * if VFS is read-only 3078 */ 3079 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 3080 continue; 3081 /* 3082 * If flushing all mounted file systems or 3083 * the vnode belongs to this vfs, has pages 3084 * and is marked as either dirty or mmap'd, 3085 * hold and add this vnode to the list of 3086 * vnodes to flush. 3087 */ 3088 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 3089 vn_has_cached_data(vp) && 3090 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3091 VN_HOLD(vp); 3092 vplist[cnt++] = vp; 3093 if (cnt == num) { 3094 rw_exit(&rtable[index].r_lock); 3095 goto toomany; 3096 } 3097 } 3098 } 3099 rw_exit(&rtable[index].r_lock); 3100 } 3101 toomany: 3102 3103 /* 3104 * Flush and release all of the files on the list. 3105 */ 3106 while (cnt-- > 0) { 3107 vp = vplist[cnt]; 3108 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 3109 VN_RELE(vp); 3110 } 3111 3112 /* 3113 * Free the space allocated to hold the list. 3114 */ 3115 kmem_free(vplist, num * sizeof (*vplist)); 3116 } 3117 3118 /* 3119 * This probably needs to be larger than or equal to 3120 * log2(sizeof (struct rnode)) due to the way that rnodes are 3121 * allocated. 
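 *
 * For scale: 2^ACACHE_SHIFT_BITS = 2^9 = 512, so the current value
 * assumes sizeof (struct rnode) does not exceed roughly 512 bytes.
 * acachehash() below discards that many low-order pointer bits,
 * which carry the least information for cache-allocated rnodes,
 * before folding in the caller's uid:
 *
 *	index = (((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
 *	    acachemask;
 *
 * If rnode_t ever grows past 512 bytes, log2 of its size exceeds 9
 * and this define should grow with it.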
3122 */ 3123 #define ACACHE_SHIFT_BITS 9 3124 3125 static int 3126 acachehash(rnode_t *rp, cred_t *cr) 3127 { 3128 3129 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3130 acachemask); 3131 } 3132 3133 #ifdef DEBUG 3134 static long nfs_access_cache_hits = 0; 3135 static long nfs_access_cache_misses = 0; 3136 #endif 3137 3138 nfs_access_type_t 3139 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3140 { 3141 vnode_t *vp; 3142 acache_t *ap; 3143 acache_hash_t *hp; 3144 nfs_access_type_t all; 3145 3146 vp = RTOV(rp); 3147 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3148 return (NFS_ACCESS_UNKNOWN); 3149 3150 if (rp->r_acache != NULL) { 3151 hp = &acache[acachehash(rp, cr)]; 3152 rw_enter(&hp->lock, RW_READER); 3153 ap = hp->next; 3154 while (ap != (acache_t *)hp) { 3155 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3156 if ((ap->known & acc) == acc) { 3157 #ifdef DEBUG 3158 nfs_access_cache_hits++; 3159 #endif 3160 if ((ap->allowed & acc) == acc) 3161 all = NFS_ACCESS_ALLOWED; 3162 else 3163 all = NFS_ACCESS_DENIED; 3164 } else { 3165 #ifdef DEBUG 3166 nfs_access_cache_misses++; 3167 #endif 3168 all = NFS_ACCESS_UNKNOWN; 3169 } 3170 rw_exit(&hp->lock); 3171 return (all); 3172 } 3173 ap = ap->next; 3174 } 3175 rw_exit(&hp->lock); 3176 } 3177 3178 #ifdef DEBUG 3179 nfs_access_cache_misses++; 3180 #endif 3181 return (NFS_ACCESS_UNKNOWN); 3182 } 3183 3184 void 3185 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3186 { 3187 acache_t *ap; 3188 acache_t *nap; 3189 acache_hash_t *hp; 3190 3191 hp = &acache[acachehash(rp, cr)]; 3192 3193 /* 3194 * Allocate now assuming that mostly an allocation will be 3195 * required. This allows the allocation to happen without 3196 * holding the hash bucket locked. 3197 */ 3198 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3199 if (nap != NULL) { 3200 nap->known = acc; 3201 nap->allowed = resacc; 3202 nap->rnode = rp; 3203 crhold(cr); 3204 nap->cred = cr; 3205 nap->hashq = hp; 3206 } 3207 3208 rw_enter(&hp->lock, RW_WRITER); 3209 3210 if (rp->r_acache != NULL) { 3211 ap = hp->next; 3212 while (ap != (acache_t *)hp) { 3213 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3214 ap->known |= acc; 3215 ap->allowed &= ~acc; 3216 ap->allowed |= resacc; 3217 rw_exit(&hp->lock); 3218 if (nap != NULL) { 3219 crfree(nap->cred); 3220 kmem_cache_free(acache_cache, nap); 3221 } 3222 return; 3223 } 3224 ap = ap->next; 3225 } 3226 } 3227 3228 if (nap != NULL) { 3229 #ifdef DEBUG 3230 clstat_debug.access.value.ui64++; 3231 #endif 3232 nap->next = hp->next; 3233 hp->next = nap; 3234 nap->next->prev = nap; 3235 nap->prev = (acache_t *)hp; 3236 3237 mutex_enter(&rp->r_statelock); 3238 nap->list = rp->r_acache; 3239 rp->r_acache = nap; 3240 mutex_exit(&rp->r_statelock); 3241 } 3242 3243 rw_exit(&hp->lock); 3244 } 3245 3246 int 3247 nfs_access_purge_rp(rnode_t *rp) 3248 { 3249 acache_t *ap; 3250 acache_t *tmpap; 3251 acache_t *rplist; 3252 3253 /* 3254 * If there aren't any cached entries, then there is nothing 3255 * to free. 3256 */ 3257 if (rp->r_acache == NULL) 3258 return (0); 3259 3260 mutex_enter(&rp->r_statelock); 3261 rplist = rp->r_acache; 3262 rp->r_acache = NULL; 3263 mutex_exit(&rp->r_statelock); 3264 3265 /* 3266 * Loop through each entry in the list pointed to in the 3267 * rnode. Remove each of these entries from the hash 3268 * queue that it is on and remove it from the list in 3269 * the rnode. 
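 *
 * Note that each entry sits on two chains at once: the doubly
 * linked per-bucket hash queue through ap->next/ap->prev and the
 * singly linked per-rnode list through ap->list. The per-rnode
 * list was detached from the rnode above under r_statelock, so the
 * loop only needs each bucket's writer lock for the hash unlink and
 * can then follow ap->list along the now-private chain.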
3270 */ 3271 for (ap = rplist; ap != NULL; ap = tmpap) { 3272 rw_enter(&ap->hashq->lock, RW_WRITER); 3273 ap->prev->next = ap->next; 3274 ap->next->prev = ap->prev; 3275 rw_exit(&ap->hashq->lock); 3276 3277 tmpap = ap->list; 3278 crfree(ap->cred); 3279 kmem_cache_free(acache_cache, ap); 3280 #ifdef DEBUG 3281 clstat_debug.access.value.ui64--; 3282 #endif 3283 } 3284 3285 return (1); 3286 } 3287 3288 static const char prefix[] = ".nfs"; 3289 3290 static kmutex_t newnum_lock; 3291 3292 int 3293 newnum(void) 3294 { 3295 static uint_t newnum = 0; 3296 uint_t id; 3297 3298 mutex_enter(&newnum_lock); 3299 if (newnum == 0) 3300 newnum = gethrestime_sec() & 0xffff; 3301 id = newnum++; 3302 mutex_exit(&newnum_lock); 3303 return (id); 3304 } 3305 3306 char * 3307 newname(void) 3308 { 3309 char *news; 3310 char *s; 3311 const char *p; 3312 uint_t id; 3313 3314 id = newnum(); 3315 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3316 s = news; 3317 p = prefix; 3318 while (*p != '\0') 3319 *s++ = *p++; 3320 while (id != 0) { 3321 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3322 id >>= 4; 3323 } 3324 *s = '\0'; 3325 return (news); 3326 } 3327 3328 /* 3329 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3330 * framework. 3331 */ 3332 static int 3333 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3334 { 3335 ksp->ks_snaptime = gethrtime(); 3336 if (rw == KSTAT_WRITE) { 3337 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3338 #ifdef DEBUG 3339 /* 3340 * Currently only the global zone can write to kstats, but we 3341 * add the check just for paranoia. 3342 */ 3343 if (INGLOBALZONE(curproc)) 3344 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3345 sizeof (clstat_debug)); 3346 #endif 3347 } else { 3348 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3349 #ifdef DEBUG 3350 /* 3351 * If we're displaying the "global" debug kstat values, we 3352 * display them as-is to all zones since in fact they apply to 3353 * the system as a whole. 
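 *
 * For reference, the snapshot buffer is laid out as the per-zone
 * named kstats followed directly by the global DEBUG-only counters,
 *
 *	buf:  [ nfscl_stat, sizeof (clstat_tmpl) bytes ][ clstat_debug ]
 *
 * which is why clinit_zone() sizes ndata as the sum of the two
 * structures on DEBUG kernels.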
3354 */ 3355 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3356 sizeof (clstat_debug)); 3357 #endif 3358 } 3359 return (0); 3360 } 3361 3362 static void * 3363 clinit_zone(zoneid_t zoneid) 3364 { 3365 kstat_t *nfs_client_kstat; 3366 struct nfs_clnt *nfscl; 3367 uint_t ndata; 3368 3369 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3370 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3371 nfscl->nfscl_chtable = NULL; 3372 nfscl->nfscl_zoneid = zoneid; 3373 3374 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3375 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3376 #ifdef DEBUG 3377 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3378 #endif 3379 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3380 "misc", KSTAT_TYPE_NAMED, ndata, 3381 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3382 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3383 nfs_client_kstat->ks_snapshot = cl_snapshot; 3384 kstat_install(nfs_client_kstat); 3385 } 3386 mutex_enter(&nfs_clnt_list_lock); 3387 list_insert_head(&nfs_clnt_list, nfscl); 3388 mutex_exit(&nfs_clnt_list_lock); 3389 return (nfscl); 3390 } 3391 3392 /*ARGSUSED*/ 3393 static void 3394 clfini_zone(zoneid_t zoneid, void *arg) 3395 { 3396 struct nfs_clnt *nfscl = arg; 3397 chhead_t *chp, *next; 3398 3399 if (nfscl == NULL) 3400 return; 3401 mutex_enter(&nfs_clnt_list_lock); 3402 list_remove(&nfs_clnt_list, nfscl); 3403 mutex_exit(&nfs_clnt_list_lock); 3404 clreclaim_zone(nfscl, 0); 3405 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 3406 ASSERT(chp->ch_list == NULL); 3407 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3408 next = chp->ch_next; 3409 kmem_free(chp, sizeof (*chp)); 3410 } 3411 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3412 mutex_destroy(&nfscl->nfscl_chtable_lock); 3413 kmem_free(nfscl, sizeof (*nfscl)); 3414 } 3415 3416 /* 3417 * Called by endpnt_destructor to make sure the client handles are 3418 * cleaned up before the RPC endpoints. This becomes a no-op if 3419 * clfini_zone (above) is called first. This function is needed 3420 * (rather than relying on clfini_zone to clean up) because the ZSD 3421 * callbacks have no ordering mechanism, so we have no way to ensure 3422 * that clfini_zone is called before endpnt_destructor. 
3423 */ 3424 void 3425 clcleanup_zone(zoneid_t zoneid) 3426 { 3427 struct nfs_clnt *nfscl; 3428 3429 mutex_enter(&nfs_clnt_list_lock); 3430 nfscl = list_head(&nfs_clnt_list); 3431 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3432 if (nfscl->nfscl_zoneid == zoneid) { 3433 clreclaim_zone(nfscl, 0); 3434 break; 3435 } 3436 } 3437 mutex_exit(&nfs_clnt_list_lock); 3438 } 3439 3440 int 3441 nfs_subrinit(void) 3442 { 3443 int i; 3444 ulong_t nrnode_max; 3445 3446 /* 3447 * Allocate and initialize the rnode hash queues 3448 */ 3449 if (nrnode <= 0) 3450 nrnode = ncsize; 3451 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3452 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3453 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3454 "setting nrnode to max value of %ld", nrnode_max); 3455 nrnode = nrnode_max; 3456 } 3457 3458 rtablesize = 1 << highbit(nrnode / hashlen); 3459 rtablemask = rtablesize - 1; 3460 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3461 for (i = 0; i < rtablesize; i++) { 3462 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3463 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3464 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3465 } 3466 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 3467 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3468 3469 /* 3470 * Allocate and initialize the access cache 3471 */ 3472 3473 /* 3474 * Initial guess is one access cache entry per rnode unless 3475 * nacache is set to a non-zero value and then it is used to 3476 * indicate a guess at the number of access cache entries. 3477 */ 3478 if (nacache > 0) 3479 acachesize = 1 << highbit(nacache / hashlen); 3480 else 3481 acachesize = rtablesize; 3482 acachemask = acachesize - 1; 3483 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3484 for (i = 0; i < acachesize; i++) { 3485 acache[i].next = (acache_t *)&acache[i]; 3486 acache[i].prev = (acache_t *)&acache[i]; 3487 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3488 } 3489 acache_cache = kmem_cache_create("nfs_access_cache", 3490 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3491 /* 3492 * Allocate and initialize the client handle cache 3493 */ 3494 chtab_cache = kmem_cache_create("client_handle_cache", 3495 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0); 3496 /* 3497 * Initialize the list of per-zone client handles (and associated data). 3498 * This needs to be done before we call zone_key_create(). 3499 */ 3500 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3501 offsetof(struct nfs_clnt, nfscl_node)); 3502 /* 3503 * Initialize the zone_key for per-zone client handle lists. 
3504 */ 3505 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3506 /* 3507 * Initialize the various mutexes and reader/writer locks 3508 */ 3509 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3510 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3511 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3512 3513 /* 3514 * Assign unique major number for all nfs mounts 3515 */ 3516 if ((nfs_major = getudev()) == -1) { 3517 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3518 "nfs: init: can't get unique device number"); 3519 nfs_major = 0; 3520 } 3521 nfs_minor = 0; 3522 3523 if (nfs3_jukebox_delay == 0) 3524 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3525 3526 return (0); 3527 } 3528 3529 void 3530 nfs_subrfini(void) 3531 { 3532 int i; 3533 3534 /* 3535 * Deallocate the rnode hash queues 3536 */ 3537 kmem_cache_destroy(rnode_cache); 3538 3539 for (i = 0; i < rtablesize; i++) 3540 rw_destroy(&rtable[i].r_lock); 3541 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3542 3543 /* 3544 * Deallocated the access cache 3545 */ 3546 kmem_cache_destroy(acache_cache); 3547 3548 for (i = 0; i < acachesize; i++) 3549 rw_destroy(&acache[i].lock); 3550 kmem_free(acache, acachesize * sizeof (*acache)); 3551 3552 /* 3553 * Deallocate the client handle cache 3554 */ 3555 kmem_cache_destroy(chtab_cache); 3556 3557 /* 3558 * Destroy the various mutexes and reader/writer locks 3559 */ 3560 mutex_destroy(&rpfreelist_lock); 3561 mutex_destroy(&newnum_lock); 3562 mutex_destroy(&nfs_minor_lock); 3563 (void) zone_key_delete(nfsclnt_zone_key); 3564 } 3565 3566 enum nfsstat 3567 puterrno(int error) 3568 { 3569 3570 switch (error) { 3571 case EOPNOTSUPP: 3572 return (NFSERR_OPNOTSUPP); 3573 case ENAMETOOLONG: 3574 return (NFSERR_NAMETOOLONG); 3575 case ENOTEMPTY: 3576 return (NFSERR_NOTEMPTY); 3577 case EDQUOT: 3578 return (NFSERR_DQUOT); 3579 case ESTALE: 3580 return (NFSERR_STALE); 3581 case EREMOTE: 3582 return (NFSERR_REMOTE); 3583 case ENOSYS: 3584 return (NFSERR_OPNOTSUPP); 3585 case EOVERFLOW: 3586 return (NFSERR_INVAL); 3587 default: 3588 return ((enum nfsstat)error); 3589 } 3590 /* NOTREACHED */ 3591 } 3592 3593 int 3594 geterrno(enum nfsstat status) 3595 { 3596 3597 switch (status) { 3598 case NFSERR_OPNOTSUPP: 3599 return (EOPNOTSUPP); 3600 case NFSERR_NAMETOOLONG: 3601 return (ENAMETOOLONG); 3602 case NFSERR_NOTEMPTY: 3603 return (ENOTEMPTY); 3604 case NFSERR_DQUOT: 3605 return (EDQUOT); 3606 case NFSERR_STALE: 3607 return (ESTALE); 3608 case NFSERR_REMOTE: 3609 return (EREMOTE); 3610 case NFSERR_WFLUSH: 3611 return (EIO); 3612 default: 3613 return ((int)status); 3614 } 3615 /* NOTREACHED */ 3616 } 3617 3618 enum nfsstat3 3619 puterrno3(int error) 3620 { 3621 3622 #ifdef DEBUG 3623 switch (error) { 3624 case 0: 3625 return (NFS3_OK); 3626 case EPERM: 3627 return (NFS3ERR_PERM); 3628 case ENOENT: 3629 return (NFS3ERR_NOENT); 3630 case EIO: 3631 return (NFS3ERR_IO); 3632 case ENXIO: 3633 return (NFS3ERR_NXIO); 3634 case EACCES: 3635 return (NFS3ERR_ACCES); 3636 case EEXIST: 3637 return (NFS3ERR_EXIST); 3638 case EXDEV: 3639 return (NFS3ERR_XDEV); 3640 case ENODEV: 3641 return (NFS3ERR_NODEV); 3642 case ENOTDIR: 3643 return (NFS3ERR_NOTDIR); 3644 case EISDIR: 3645 return (NFS3ERR_ISDIR); 3646 case EINVAL: 3647 return (NFS3ERR_INVAL); 3648 case EFBIG: 3649 return (NFS3ERR_FBIG); 3650 case ENOSPC: 3651 return (NFS3ERR_NOSPC); 3652 case EROFS: 3653 return (NFS3ERR_ROFS); 3654 case EMLINK: 3655 return (NFS3ERR_MLINK); 3656 case ENAMETOOLONG: 3657 return (NFS3ERR_NAMETOOLONG); 3658 case 
ENOTEMPTY: 3659 return (NFS3ERR_NOTEMPTY); 3660 case EDQUOT: 3661 return (NFS3ERR_DQUOT); 3662 case ESTALE: 3663 return (NFS3ERR_STALE); 3664 case EREMOTE: 3665 return (NFS3ERR_REMOTE); 3666 case ENOSYS: 3667 case EOPNOTSUPP: 3668 return (NFS3ERR_NOTSUPP); 3669 case EOVERFLOW: 3670 return (NFS3ERR_INVAL); 3671 default: 3672 zcmn_err(getzoneid(), CE_WARN, 3673 "puterrno3: got error %d", error); 3674 return ((enum nfsstat3)error); 3675 } 3676 #else 3677 switch (error) { 3678 case ENAMETOOLONG: 3679 return (NFS3ERR_NAMETOOLONG); 3680 case ENOTEMPTY: 3681 return (NFS3ERR_NOTEMPTY); 3682 case EDQUOT: 3683 return (NFS3ERR_DQUOT); 3684 case ESTALE: 3685 return (NFS3ERR_STALE); 3686 case ENOSYS: 3687 case EOPNOTSUPP: 3688 return (NFS3ERR_NOTSUPP); 3689 case EREMOTE: 3690 return (NFS3ERR_REMOTE); 3691 case EOVERFLOW: 3692 return (NFS3ERR_INVAL); 3693 default: 3694 return ((enum nfsstat3)error); 3695 } 3696 #endif 3697 } 3698 3699 int 3700 geterrno3(enum nfsstat3 status) 3701 { 3702 3703 #ifdef DEBUG 3704 switch (status) { 3705 case NFS3_OK: 3706 return (0); 3707 case NFS3ERR_PERM: 3708 return (EPERM); 3709 case NFS3ERR_NOENT: 3710 return (ENOENT); 3711 case NFS3ERR_IO: 3712 return (EIO); 3713 case NFS3ERR_NXIO: 3714 return (ENXIO); 3715 case NFS3ERR_ACCES: 3716 return (EACCES); 3717 case NFS3ERR_EXIST: 3718 return (EEXIST); 3719 case NFS3ERR_XDEV: 3720 return (EXDEV); 3721 case NFS3ERR_NODEV: 3722 return (ENODEV); 3723 case NFS3ERR_NOTDIR: 3724 return (ENOTDIR); 3725 case NFS3ERR_ISDIR: 3726 return (EISDIR); 3727 case NFS3ERR_INVAL: 3728 return (EINVAL); 3729 case NFS3ERR_FBIG: 3730 return (EFBIG); 3731 case NFS3ERR_NOSPC: 3732 return (ENOSPC); 3733 case NFS3ERR_ROFS: 3734 return (EROFS); 3735 case NFS3ERR_MLINK: 3736 return (EMLINK); 3737 case NFS3ERR_NAMETOOLONG: 3738 return (ENAMETOOLONG); 3739 case NFS3ERR_NOTEMPTY: 3740 return (ENOTEMPTY); 3741 case NFS3ERR_DQUOT: 3742 return (EDQUOT); 3743 case NFS3ERR_STALE: 3744 return (ESTALE); 3745 case NFS3ERR_REMOTE: 3746 return (EREMOTE); 3747 case NFS3ERR_BADHANDLE: 3748 return (ESTALE); 3749 case NFS3ERR_NOT_SYNC: 3750 return (EINVAL); 3751 case NFS3ERR_BAD_COOKIE: 3752 return (ENOENT); 3753 case NFS3ERR_NOTSUPP: 3754 return (EOPNOTSUPP); 3755 case NFS3ERR_TOOSMALL: 3756 return (EINVAL); 3757 case NFS3ERR_SERVERFAULT: 3758 return (EIO); 3759 case NFS3ERR_BADTYPE: 3760 return (EINVAL); 3761 case NFS3ERR_JUKEBOX: 3762 return (ENXIO); 3763 default: 3764 zcmn_err(getzoneid(), CE_WARN, 3765 "geterrno3: got status %d", status); 3766 return ((int)status); 3767 } 3768 #else 3769 switch (status) { 3770 case NFS3ERR_NAMETOOLONG: 3771 return (ENAMETOOLONG); 3772 case NFS3ERR_NOTEMPTY: 3773 return (ENOTEMPTY); 3774 case NFS3ERR_DQUOT: 3775 return (EDQUOT); 3776 case NFS3ERR_STALE: 3777 case NFS3ERR_BADHANDLE: 3778 return (ESTALE); 3779 case NFS3ERR_NOTSUPP: 3780 return (EOPNOTSUPP); 3781 case NFS3ERR_REMOTE: 3782 return (EREMOTE); 3783 case NFS3ERR_NOT_SYNC: 3784 case NFS3ERR_TOOSMALL: 3785 case NFS3ERR_BADTYPE: 3786 return (EINVAL); 3787 case NFS3ERR_BAD_COOKIE: 3788 return (ENOENT); 3789 case NFS3ERR_SERVERFAULT: 3790 return (EIO); 3791 case NFS3ERR_JUKEBOX: 3792 return (ENXIO); 3793 default: 3794 return ((int)status); 3795 } 3796 #endif 3797 } 3798 3799 rddir_cache * 3800 rddir_cache_alloc(int flags) 3801 { 3802 rddir_cache *rc; 3803 3804 rc = kmem_alloc(sizeof (*rc), flags); 3805 if (rc != NULL) { 3806 rc->entries = NULL; 3807 rc->flags = RDDIR; 3808 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3809 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3810 
rc->count = 1; 3811 #ifdef DEBUG 3812 atomic_add_64(&clstat_debug.dirent.value.ui64, 1); 3813 #endif 3814 } 3815 return (rc); 3816 } 3817 3818 static void 3819 rddir_cache_free(rddir_cache *rc) 3820 { 3821 3822 #ifdef DEBUG 3823 atomic_add_64(&clstat_debug.dirent.value.ui64, -1); 3824 #endif 3825 if (rc->entries != NULL) { 3826 #ifdef DEBUG 3827 rddir_cache_buf_free(rc->entries, rc->buflen); 3828 #else 3829 kmem_free(rc->entries, rc->buflen); 3830 #endif 3831 } 3832 cv_destroy(&rc->cv); 3833 mutex_destroy(&rc->lock); 3834 kmem_free(rc, sizeof (*rc)); 3835 } 3836 3837 void 3838 rddir_cache_hold(rddir_cache *rc) 3839 { 3840 3841 mutex_enter(&rc->lock); 3842 rc->count++; 3843 mutex_exit(&rc->lock); 3844 } 3845 3846 void 3847 rddir_cache_rele(rddir_cache *rc) 3848 { 3849 3850 mutex_enter(&rc->lock); 3851 ASSERT(rc->count > 0); 3852 if (--rc->count == 0) { 3853 mutex_exit(&rc->lock); 3854 rddir_cache_free(rc); 3855 } else 3856 mutex_exit(&rc->lock); 3857 } 3858 3859 #ifdef DEBUG 3860 char * 3861 rddir_cache_buf_alloc(size_t size, int flags) 3862 { 3863 char *rc; 3864 3865 rc = kmem_alloc(size, flags); 3866 if (rc != NULL) 3867 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3868 return (rc); 3869 } 3870 3871 void 3872 rddir_cache_buf_free(void *addr, size_t size) 3873 { 3874 3875 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3876 kmem_free(addr, size); 3877 } 3878 #endif 3879 3880 static int 3881 nfs_free_data_reclaim(rnode_t *rp) 3882 { 3883 char *contents; 3884 int size; 3885 vsecattr_t *vsp; 3886 nfs3_pathconf_info *info; 3887 int freed; 3888 cred_t *cred; 3889 3890 /* 3891 * Free any held credentials and caches which 3892 * may be associated with this rnode. 3893 */ 3894 mutex_enter(&rp->r_statelock); 3895 cred = rp->r_cred; 3896 rp->r_cred = NULL; 3897 contents = rp->r_symlink.contents; 3898 size = rp->r_symlink.size; 3899 rp->r_symlink.contents = NULL; 3900 vsp = rp->r_secattr; 3901 rp->r_secattr = NULL; 3902 info = rp->r_pathconf; 3903 rp->r_pathconf = NULL; 3904 mutex_exit(&rp->r_statelock); 3905 3906 if (cred != NULL) 3907 crfree(cred); 3908 3909 /* 3910 * Free the access cache entries. 3911 */ 3912 freed = nfs_access_purge_rp(rp); 3913 3914 if (!HAVE_RDDIR_CACHE(rp) && 3915 contents == NULL && 3916 vsp == NULL && 3917 info == NULL) 3918 return (freed); 3919 3920 /* 3921 * Free the readdir cache entries 3922 */ 3923 if (HAVE_RDDIR_CACHE(rp)) 3924 nfs_purge_rddir_cache(RTOV(rp)); 3925 3926 /* 3927 * Free the symbolic link cache. 3928 */ 3929 if (contents != NULL) { 3930 3931 kmem_free((void *)contents, size); 3932 } 3933 3934 /* 3935 * Free any cached ACL. 3936 */ 3937 if (vsp != NULL) 3938 nfs_acl_free(vsp); 3939 3940 /* 3941 * Free any cached pathconf information. 3942 */ 3943 if (info != NULL) 3944 kmem_free(info, sizeof (*info)); 3945 3946 return (1); 3947 } 3948 3949 static int 3950 nfs_active_data_reclaim(rnode_t *rp) 3951 { 3952 char *contents; 3953 int size; 3954 vsecattr_t *vsp; 3955 nfs3_pathconf_info *info; 3956 int freed; 3957 3958 /* 3959 * Free any held credentials and caches which 3960 * may be associated with this rnode. 3961 */ 3962 if (!mutex_tryenter(&rp->r_statelock)) 3963 return (0); 3964 contents = rp->r_symlink.contents; 3965 size = rp->r_symlink.size; 3966 rp->r_symlink.contents = NULL; 3967 vsp = rp->r_secattr; 3968 rp->r_secattr = NULL; 3969 info = rp->r_pathconf; 3970 rp->r_pathconf = NULL; 3971 mutex_exit(&rp->r_statelock); 3972 3973 /* 3974 * Free the access cache entries. 
3975 */ 3976 freed = nfs_access_purge_rp(rp); 3977 3978 if (!HAVE_RDDIR_CACHE(rp) && 3979 contents == NULL && 3980 vsp == NULL && 3981 info == NULL) 3982 return (freed); 3983 3984 /* 3985 * Free the readdir cache entries 3986 */ 3987 if (HAVE_RDDIR_CACHE(rp)) 3988 nfs_purge_rddir_cache(RTOV(rp)); 3989 3990 /* 3991 * Free the symbolic link cache. 3992 */ 3993 if (contents != NULL) { 3994 3995 kmem_free((void *)contents, size); 3996 } 3997 3998 /* 3999 * Free any cached ACL. 4000 */ 4001 if (vsp != NULL) 4002 nfs_acl_free(vsp); 4003 4004 /* 4005 * Free any cached pathconf information. 4006 */ 4007 if (info != NULL) 4008 kmem_free(info, sizeof (*info)); 4009 4010 return (1); 4011 } 4012 4013 static int 4014 nfs_free_reclaim(void) 4015 { 4016 int freed; 4017 rnode_t *rp; 4018 4019 #ifdef DEBUG 4020 clstat_debug.f_reclaim.value.ui64++; 4021 #endif 4022 freed = 0; 4023 mutex_enter(&rpfreelist_lock); 4024 rp = rpfreelist; 4025 if (rp != NULL) { 4026 do { 4027 if (nfs_free_data_reclaim(rp)) 4028 freed = 1; 4029 } while ((rp = rp->r_freef) != rpfreelist); 4030 } 4031 mutex_exit(&rpfreelist_lock); 4032 return (freed); 4033 } 4034 4035 static int 4036 nfs_active_reclaim(void) 4037 { 4038 int freed; 4039 int index; 4040 rnode_t *rp; 4041 4042 #ifdef DEBUG 4043 clstat_debug.a_reclaim.value.ui64++; 4044 #endif 4045 freed = 0; 4046 for (index = 0; index < rtablesize; index++) { 4047 rw_enter(&rtable[index].r_lock, RW_READER); 4048 for (rp = rtable[index].r_hashf; 4049 rp != (rnode_t *)(&rtable[index]); 4050 rp = rp->r_hashf) { 4051 if (nfs_active_data_reclaim(rp)) 4052 freed = 1; 4053 } 4054 rw_exit(&rtable[index].r_lock); 4055 } 4056 return (freed); 4057 } 4058 4059 static int 4060 nfs_rnode_reclaim(void) 4061 { 4062 int freed; 4063 rnode_t *rp; 4064 vnode_t *vp; 4065 4066 #ifdef DEBUG 4067 clstat_debug.r_reclaim.value.ui64++; 4068 #endif 4069 freed = 0; 4070 mutex_enter(&rpfreelist_lock); 4071 while ((rp = rpfreelist) != NULL) { 4072 rp_rmfree(rp); 4073 mutex_exit(&rpfreelist_lock); 4074 if (rp->r_flags & RHASHED) { 4075 vp = RTOV(rp); 4076 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4077 mutex_enter(&vp->v_lock); 4078 if (vp->v_count > 1) { 4079 vp->v_count--; 4080 mutex_exit(&vp->v_lock); 4081 rw_exit(&rp->r_hashq->r_lock); 4082 mutex_enter(&rpfreelist_lock); 4083 continue; 4084 } 4085 mutex_exit(&vp->v_lock); 4086 rp_rmhash_locked(rp); 4087 rw_exit(&rp->r_hashq->r_lock); 4088 } 4089 /* 4090 * This call to rp_addfree will end up destroying the 4091 * rnode, but in a safe way with the appropriate set 4092 * of checks done. 
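		 *
		 * rpfreelist_lock was dropped earlier in this loop
		 * iteration because rp_addfree() takes the hash
		 * bucket lock and then the freelist lock; calling it
		 * with rpfreelist_lock still held would invert that
		 * ordering.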
4093 */ 4094 rp_addfree(rp, CRED()); 4095 mutex_enter(&rpfreelist_lock); 4096 } 4097 mutex_exit(&rpfreelist_lock); 4098 return (freed); 4099 } 4100 4101 /*ARGSUSED*/ 4102 static void 4103 nfs_reclaim(void *cdrarg) 4104 { 4105 4106 #ifdef DEBUG 4107 clstat_debug.reclaim.value.ui64++; 4108 #endif 4109 if (nfs_free_reclaim()) 4110 return; 4111 4112 if (nfs_active_reclaim()) 4113 return; 4114 4115 (void) nfs_rnode_reclaim(); 4116 } 4117 4118 /* 4119 * NFS client failover support 4120 * 4121 * Routines to copy filehandles 4122 */ 4123 void 4124 nfscopyfh(caddr_t fhp, vnode_t *vp) 4125 { 4126 fhandle_t *dest = (fhandle_t *)fhp; 4127 4128 if (dest != NULL) 4129 *dest = *VTOFH(vp); 4130 } 4131 4132 void 4133 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4134 { 4135 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4136 4137 if (dest != NULL) 4138 *dest = *VTOFH3(vp); 4139 } 4140 4141 /* 4142 * NFS client failover support 4143 * 4144 * failover_safe() will test various conditions to ensure that 4145 * failover is permitted for this vnode. It will be denied 4146 * if: 4147 * 1) the operation in progress does not support failover (NULL fi) 4148 * 2) there are no available replicas (NULL mi_servers->sv_next) 4149 * 3) any locks are outstanding on this file 4150 */ 4151 static int 4152 failover_safe(failinfo_t *fi) 4153 { 4154 4155 /* 4156 * Does this op permit failover? 4157 */ 4158 if (fi == NULL || fi->vp == NULL) 4159 return (0); 4160 4161 /* 4162 * Are there any alternates to failover to? 4163 */ 4164 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4165 return (0); 4166 4167 /* 4168 * Disable check; we've forced local locking 4169 * 4170 * if (flk_has_remote_locks(fi->vp)) 4171 * return (0); 4172 */ 4173 4174 /* 4175 * If we have no partial path, we can't do anything 4176 */ 4177 if (VTOR(fi->vp)->r_path == NULL) 4178 return (0); 4179 4180 return (1); 4181 } 4182 4183 #include <sys/thread.h> 4184 4185 /* 4186 * NFS client failover support 4187 * 4188 * failover_newserver() will start a search for a new server, 4189 * preferably by starting an async thread to do the work. If 4190 * someone is already doing this (recognizable by MI_BINDINPROG 4191 * being set), it will simply return and the calling thread 4192 * will queue on the mi_failover_cv condition variable. 4193 */ 4194 static void 4195 failover_newserver(mntinfo_t *mi) 4196 { 4197 /* 4198 * Check if someone else is doing this already 4199 */ 4200 mutex_enter(&mi->mi_lock); 4201 if (mi->mi_flags & MI_BINDINPROG) { 4202 mutex_exit(&mi->mi_lock); 4203 return; 4204 } 4205 mi->mi_flags |= MI_BINDINPROG; 4206 4207 /* 4208 * Need to hold the vfs struct so that it can't be released 4209 * while the failover thread is selecting a new server. 4210 */ 4211 VFS_HOLD(mi->mi_vfsp); 4212 4213 /* 4214 * Start a thread to do the real searching. 4215 */ 4216 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4217 4218 mutex_exit(&mi->mi_lock); 4219 } 4220 4221 /* 4222 * NFS client failover support 4223 * 4224 * failover_thread() will find a new server to replace the one 4225 * currently in use, wake up other threads waiting on this mount 4226 * point, and die. It will start at the head of the server list 4227 * and poll servers until it finds one with an NFS server which is 4228 * registered and responds to a NULL procedure ping. 4229 * 4230 * XXX failover_thread is unsafe within the scope of the 4231 * present model defined for cpr to suspend the system. 4232 * Specifically, over-the-wire calls made by the thread 4233 * are unsafe. 
The thread needs to be reevaluated in case of
4234  * future updates to the cpr suspend model.
4235  */
4236 static void
4237 failover_thread(mntinfo_t *mi)
4238 {
4239 	servinfo_t *svp = NULL;
4240 	CLIENT *cl;
4241 	enum clnt_stat status;
4242 	struct timeval tv;
4243 	int error;
4244 	int oncethru = 0;
4245 	callb_cpr_t cprinfo;
4246 	rnode_t *rp;
4247 	int index;
4248 	char *srvnames;
4249 	size_t srvnames_len;
4250 	struct nfs_clnt *nfscl = NULL;
4251 	zoneid_t zoneid = getzoneid();
4252 
4253 #ifdef DEBUG
4254 	/*
4255 	 * This is currently only needed to access counters which exist on
4256 	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4257 	 * on non-DEBUG kernels.
4258 	 */
4259 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4260 	ASSERT(nfscl != NULL);
4261 #endif
4262 
4263 	/*
4264 	 * It's safe to piggyback on the mi_lock since failover_newserver()
4265 	 * code guarantees that there will be only one failover thread
4266 	 * per mountinfo at any given time.
4267 	 */
4268 	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4269 	    "failover_thread");
4270 
4271 	mutex_enter(&mi->mi_lock);
4272 	while (mi->mi_readers) {
4273 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4274 		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4275 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4276 	}
4277 	mutex_exit(&mi->mi_lock);
4278 
4279 	tv.tv_sec = 2;
4280 	tv.tv_usec = 0;
4281 
4282 	/*
4283 	 * Ping the null NFS procedure of every server in
4284 	 * the list until one responds. We always start
4285 	 * at the head of the list and always skip the one
4286 	 * that is current, since it's caused us a problem.
4287 	 */
4288 	while (svp == NULL) {
4289 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4290 			if (!oncethru && svp == mi->mi_curr_serv)
4291 				continue;
4292 
4293 			/*
4294 			 * If the file system was forcibly umounted
4295 			 * while trying to do a failover, then just
4296 			 * give up on the failover. It won't matter
4297 			 * what the server is.
4298 */ 4299 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 4300 svp = NULL; 4301 goto done; 4302 } 4303 4304 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 4305 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl); 4306 if (error) 4307 continue; 4308 4309 if (!(mi->mi_flags & MI_INT)) 4310 cl->cl_nosignal = TRUE; 4311 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 4312 xdr_void, NULL, tv); 4313 if (!(mi->mi_flags & MI_INT)) 4314 cl->cl_nosignal = FALSE; 4315 AUTH_DESTROY(cl->cl_auth); 4316 CLNT_DESTROY(cl); 4317 if (status == RPC_SUCCESS) { 4318 if (svp == mi->mi_curr_serv) { 4319 #ifdef DEBUG 4320 zcmn_err(zoneid, CE_NOTE, 4321 "NFS%d: failing over: selecting original server %s", 4322 mi->mi_vers, svp->sv_hostname); 4323 #else 4324 zcmn_err(zoneid, CE_NOTE, 4325 "NFS: failing over: selecting original server %s", 4326 svp->sv_hostname); 4327 #endif 4328 } else { 4329 #ifdef DEBUG 4330 zcmn_err(zoneid, CE_NOTE, 4331 "NFS%d: failing over from %s to %s", 4332 mi->mi_vers, 4333 mi->mi_curr_serv->sv_hostname, 4334 svp->sv_hostname); 4335 #else 4336 zcmn_err(zoneid, CE_NOTE, 4337 "NFS: failing over from %s to %s", 4338 mi->mi_curr_serv->sv_hostname, 4339 svp->sv_hostname); 4340 #endif 4341 } 4342 break; 4343 } 4344 } 4345 4346 if (svp == NULL) { 4347 if (!oncethru) { 4348 srvnames = nfs_getsrvnames(mi, &srvnames_len); 4349 #ifdef DEBUG 4350 zprintf(zoneid, 4351 "NFS%d servers %s not responding " 4352 "still trying\n", mi->mi_vers, srvnames); 4353 #else 4354 zprintf(zoneid, "NFS servers %s not responding " 4355 "still trying\n", srvnames); 4356 #endif 4357 oncethru = 1; 4358 } 4359 mutex_enter(&mi->mi_lock); 4360 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4361 mutex_exit(&mi->mi_lock); 4362 delay(hz); 4363 mutex_enter(&mi->mi_lock); 4364 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4365 mutex_exit(&mi->mi_lock); 4366 } 4367 } 4368 4369 if (oncethru) { 4370 #ifdef DEBUG 4371 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames); 4372 #else 4373 zprintf(zoneid, "NFS servers %s ok\n", srvnames); 4374 #endif 4375 } 4376 4377 if (svp != mi->mi_curr_serv) { 4378 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 4379 index = rtablehash(&mi->mi_curr_serv->sv_fhandle); 4380 rw_enter(&rtable[index].r_lock, RW_WRITER); 4381 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle, 4382 mi->mi_vfsp); 4383 if (rp != NULL) { 4384 if (rp->r_flags & RHASHED) 4385 rp_rmhash_locked(rp); 4386 rw_exit(&rtable[index].r_lock); 4387 rp->r_server = svp; 4388 rp->r_fh = svp->sv_fhandle; 4389 (void) nfs_free_data_reclaim(rp); 4390 index = rtablehash(&rp->r_fh); 4391 rp->r_hashq = &rtable[index]; 4392 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4393 vn_exists(RTOV(rp)); 4394 rp_addhash(rp); 4395 rw_exit(&rp->r_hashq->r_lock); 4396 VN_RELE(RTOV(rp)); 4397 } else 4398 rw_exit(&rtable[index].r_lock); 4399 } 4400 4401 done: 4402 if (oncethru) 4403 kmem_free(srvnames, srvnames_len); 4404 mutex_enter(&mi->mi_lock); 4405 mi->mi_flags &= ~MI_BINDINPROG; 4406 if (svp != NULL) { 4407 mi->mi_curr_serv = svp; 4408 mi->mi_failover++; 4409 #ifdef DEBUG 4410 nfscl->nfscl_stat.failover.value.ui64++; 4411 #endif 4412 } 4413 cv_broadcast(&mi->mi_failover_cv); 4414 CALLB_CPR_EXIT(&cprinfo); 4415 VFS_RELE(mi->mi_vfsp); 4416 zthread_exit(); 4417 /* NOTREACHED */ 4418 } 4419 4420 /* 4421 * NFS client failover support 4422 * 4423 * failover_wait() will put the thread to sleep until MI_BINDINPROG 4424 * is cleared, meaning that failover is complete. Called with 4425 * mi_lock mutex held. 
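 * A caller is expected to use it roughly as follows (sketch only;
 * the real over-the-wire call paths add their own retry logic):
 *
 *	mutex_enter(&mi->mi_lock);
 *	error = failover_wait(mi);	(returns 0, or EINTR if signalled)
 *	mutex_exit(&mi->mi_lock);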
4426 */ 4427 static int 4428 failover_wait(mntinfo_t *mi) 4429 { 4430 k_sigset_t smask; 4431 4432 /* 4433 * If someone else is hunting for a living server, 4434 * sleep until it's done. After our sleep, we may 4435 * be bound to the right server and get off cheaply. 4436 */ 4437 while (mi->mi_flags & MI_BINDINPROG) { 4438 /* 4439 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 4440 * and SIGTERM. (Preserving the existing masks). 4441 * Mask out SIGINT if mount option nointr is specified. 4442 */ 4443 sigintr(&smask, (int)mi->mi_flags & MI_INT); 4444 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) { 4445 /* 4446 * restore original signal mask 4447 */ 4448 sigunintr(&smask); 4449 return (EINTR); 4450 } 4451 /* 4452 * restore original signal mask 4453 */ 4454 sigunintr(&smask); 4455 } 4456 return (0); 4457 } 4458 4459 /* 4460 * NFS client failover support 4461 * 4462 * failover_remap() will do a partial pathname lookup and find the 4463 * desired vnode on the current server. The interim vnode will be 4464 * discarded after we pilfer the new filehandle. 4465 * 4466 * Side effects: 4467 * - This routine will also update the filehandle in the args structure 4468 * pointed to by the fi->fhp pointer if it is non-NULL. 4469 */ 4470 4471 static int 4472 failover_remap(failinfo_t *fi) 4473 { 4474 vnode_t *vp, *nvp, *rootvp; 4475 rnode_t *rp, *nrp; 4476 mntinfo_t *mi; 4477 int error; 4478 #ifdef DEBUG 4479 struct nfs_clnt *nfscl; 4480 4481 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4482 ASSERT(nfscl != NULL); 4483 #endif 4484 /* 4485 * Sanity check 4486 */ 4487 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL) 4488 return (EINVAL); 4489 vp = fi->vp; 4490 rp = VTOR(vp); 4491 mi = VTOMI(vp); 4492 4493 if (!(vp->v_flag & VROOT)) { 4494 /* 4495 * Given the root fh, use the path stored in 4496 * the rnode to find the fh for the new server. 4497 */ 4498 error = VFS_ROOT(mi->mi_vfsp, &rootvp); 4499 if (error) 4500 return (error); 4501 4502 error = failover_lookup(rp->r_path, rootvp, 4503 fi->lookupproc, fi->xattrdirproc, &nvp); 4504 4505 VN_RELE(rootvp); 4506 4507 if (error) 4508 return (error); 4509 4510 /* 4511 * If we found the same rnode, we're done now 4512 */ 4513 if (nvp == vp) { 4514 /* 4515 * Failed and the new server may physically be same 4516 * OR may share a same disk subsystem. In this case 4517 * file handle for a particular file path is not going 4518 * to change, given the same filehandle lookup will 4519 * always locate the same rnode as the existing one. 4520 * All we might need to do is to update the r_server 4521 * with the current servinfo. 4522 */ 4523 if (!VALID_FH(fi)) { 4524 rp->r_server = mi->mi_curr_serv; 4525 } 4526 VN_RELE(nvp); 4527 return (0); 4528 } 4529 4530 /* 4531 * Try to make it so that no one else will find this 4532 * vnode because it is just a temporary to hold the 4533 * new file handle until that file handle can be 4534 * copied to the original vnode/rnode. 4535 */ 4536 nrp = VTOR(nvp); 4537 mutex_enter(&mi->mi_remap_lock); 4538 /* 4539 * Some other thread could have raced in here and could 4540 * have done the remap for this particular rnode before 4541 * this thread here. Check for rp->r_server and 4542 * mi->mi_curr_serv and return if they are same. 
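 * (VALID_FH() is what performs that r_server vs. mi_curr_serv
 * comparison here; if another thread has already done the remap we
 * simply drop the temporary vnode and return success.)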
4543 */ 4544 if (VALID_FH(fi)) { 4545 mutex_exit(&mi->mi_remap_lock); 4546 VN_RELE(nvp); 4547 return (0); 4548 } 4549 4550 if (nrp->r_flags & RHASHED) 4551 rp_rmhash(nrp); 4552 4553 /* 4554 * As a heuristic check on the validity of the new 4555 * file, check that the size and type match against 4556 * that we remember from the old version. 4557 */ 4558 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) { 4559 mutex_exit(&mi->mi_remap_lock); 4560 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 4561 "NFS replicas %s and %s: file %s not same.", 4562 rp->r_server->sv_hostname, 4563 nrp->r_server->sv_hostname, rp->r_path); 4564 VN_RELE(nvp); 4565 return (EINVAL); 4566 } 4567 4568 /* 4569 * snarf the filehandle from the new rnode 4570 * then release it, again while updating the 4571 * hash queues for the rnode. 4572 */ 4573 if (rp->r_flags & RHASHED) 4574 rp_rmhash(rp); 4575 rp->r_server = mi->mi_curr_serv; 4576 rp->r_fh = nrp->r_fh; 4577 rp->r_hashq = nrp->r_hashq; 4578 /* 4579 * Copy the attributes from the new rnode to the old 4580 * rnode. This will help to reduce unnecessary page 4581 * cache flushes. 4582 */ 4583 rp->r_attr = nrp->r_attr; 4584 rp->r_attrtime = nrp->r_attrtime; 4585 rp->r_mtime = nrp->r_mtime; 4586 (void) nfs_free_data_reclaim(rp); 4587 nfs_setswaplike(vp, &rp->r_attr); 4588 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4589 rp_addhash(rp); 4590 rw_exit(&rp->r_hashq->r_lock); 4591 mutex_exit(&mi->mi_remap_lock); 4592 VN_RELE(nvp); 4593 } 4594 4595 /* 4596 * Update successful failover remap count 4597 */ 4598 mutex_enter(&mi->mi_lock); 4599 mi->mi_remap++; 4600 mutex_exit(&mi->mi_lock); 4601 #ifdef DEBUG 4602 nfscl->nfscl_stat.remap.value.ui64++; 4603 #endif 4604 4605 /* 4606 * If we have a copied filehandle to update, do it now. 4607 */ 4608 if (fi->fhp != NULL && fi->copyproc != NULL) 4609 (*fi->copyproc)(fi->fhp, vp); 4610 4611 return (0); 4612 } 4613 4614 /* 4615 * NFS client failover support 4616 * 4617 * We want a simple pathname lookup routine to parse the pieces 4618 * of path in rp->r_path. We know that the path was a created 4619 * as rnodes were made, so we know we have only to deal with 4620 * paths that look like: 4621 * dir1/dir2/dir3/file 4622 * Any evidence of anything like .., symlinks, and ENOTDIR 4623 * are hard errors, because they mean something in this filesystem 4624 * is different from the one we came from, or has changed under 4625 * us in some way. If this is true, we want the failure. 4626 * 4627 * Extended attributes: if the filesystem is mounted with extended 4628 * attributes enabled (-o xattr), the attribute directory will be 4629 * represented in the r_path as the magic name XATTR_RPATH. So if 4630 * we see that name in the pathname, is must be because this node 4631 * is an extended attribute. Therefore, look it up that way. 
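 * For example, an r_path of the form
 *	dir1/dir2/<XATTR_RPATH>/attrfile
 * makes the <XATTR_RPATH> component go through *xattrdirproc below
 * instead of *lookupproc.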
4632 */ 4633 static int 4634 failover_lookup(char *path, vnode_t *root, 4635 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int, 4636 vnode_t *, cred_t *, int), 4637 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 4638 vnode_t **new) 4639 { 4640 vnode_t *dvp, *nvp; 4641 int error = EINVAL; 4642 char *s, *p, *tmppath; 4643 size_t len; 4644 mntinfo_t *mi; 4645 bool_t xattr; 4646 4647 /* Make local copy of path */ 4648 len = strlen(path) + 1; 4649 tmppath = kmem_alloc(len, KM_SLEEP); 4650 (void) strcpy(tmppath, path); 4651 s = tmppath; 4652 4653 dvp = root; 4654 VN_HOLD(dvp); 4655 mi = VTOMI(root); 4656 xattr = mi->mi_flags & MI_EXTATTR; 4657 4658 do { 4659 p = strchr(s, '/'); 4660 if (p != NULL) 4661 *p = '\0'; 4662 if (xattr && strcmp(s, XATTR_RPATH) == 0) { 4663 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(), 4664 RFSCALL_SOFT); 4665 } else { 4666 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL, 4667 CRED(), RFSCALL_SOFT); 4668 } 4669 if (p != NULL) 4670 *p++ = '/'; 4671 if (error) { 4672 VN_RELE(dvp); 4673 kmem_free(tmppath, len); 4674 return (error); 4675 } 4676 s = p; 4677 VN_RELE(dvp); 4678 dvp = nvp; 4679 } while (p != NULL); 4680 4681 if (nvp != NULL && new != NULL) 4682 *new = nvp; 4683 kmem_free(tmppath, len); 4684 return (0); 4685 } 4686 4687 /* 4688 * NFS client failover support 4689 * 4690 * sv_free() frees the malloc'd portion of a "servinfo_t". 4691 */ 4692 void 4693 sv_free(servinfo_t *svp) 4694 { 4695 servinfo_t *next; 4696 struct knetconfig *knconf; 4697 4698 while (svp != NULL) { 4699 next = svp->sv_next; 4700 if (svp->sv_secdata) 4701 sec_clnt_freeinfo(svp->sv_secdata); 4702 if (svp->sv_hostname && svp->sv_hostnamelen > 0) 4703 kmem_free(svp->sv_hostname, svp->sv_hostnamelen); 4704 knconf = svp->sv_knconf; 4705 if (knconf != NULL) { 4706 if (knconf->knc_protofmly != NULL) 4707 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4708 if (knconf->knc_proto != NULL) 4709 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4710 kmem_free(knconf, sizeof (*knconf)); 4711 } 4712 knconf = svp->sv_origknconf; 4713 if (knconf != NULL) { 4714 if (knconf->knc_protofmly != NULL) 4715 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4716 if (knconf->knc_proto != NULL) 4717 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4718 kmem_free(knconf, sizeof (*knconf)); 4719 } 4720 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0) 4721 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen); 4722 mutex_destroy(&svp->sv_lock); 4723 kmem_free(svp, sizeof (*svp)); 4724 svp = next; 4725 } 4726 } 4727 4728 /* 4729 * Only can return non-zero if intr != 0. 4730 */ 4731 int 4732 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr) 4733 { 4734 4735 mutex_enter(&l->lock); 4736 4737 /* 4738 * If this is a nested enter, then allow it. There 4739 * must be as many exits as enters through. 4740 */ 4741 if (l->owner == curthread) { 4742 /* lock is held for writing by current thread */ 4743 ASSERT(rw == RW_READER || rw == RW_WRITER); 4744 l->count--; 4745 } else if (rw == RW_READER) { 4746 /* 4747 * While there is a writer active or writers waiting, 4748 * then wait for them to finish up and move on. Then, 4749 * increment the count to indicate that a reader is 4750 * active. 
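 * (The sign convention used throughout nfs_rwlock_t is: count > 0
 * means that many readers hold the lock, count < 0 means a writer
 * holds it, possibly recursively, and owner records the writing
 * thread.)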
4751 */ 4752 while (l->count < 0 || l->waiters > 0) { 4753 if (intr) { 4754 klwp_t *lwp = ttolwp(curthread); 4755 4756 if (lwp != NULL) 4757 lwp->lwp_nostop++; 4758 if (!cv_wait_sig(&l->cv, &l->lock)) { 4759 if (lwp != NULL) 4760 lwp->lwp_nostop--; 4761 mutex_exit(&l->lock); 4762 return (EINTR); 4763 } 4764 if (lwp != NULL) 4765 lwp->lwp_nostop--; 4766 } else 4767 cv_wait(&l->cv, &l->lock); 4768 } 4769 ASSERT(l->count < INT_MAX); 4770 #ifdef DEBUG 4771 if ((l->count % 10000) == 9999) 4772 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on" 4773 "rwlock @ %p\n", l->count, (void *)&l); 4774 #endif 4775 l->count++; 4776 } else { 4777 ASSERT(rw == RW_WRITER); 4778 /* 4779 * While there are readers active or a writer 4780 * active, then wait for all of the readers 4781 * to finish or for the writer to finish. 4782 * Then, set the owner field to curthread and 4783 * decrement count to indicate that a writer 4784 * is active. 4785 */ 4786 while (l->count > 0 || l->owner != NULL) { 4787 l->waiters++; 4788 if (intr) { 4789 klwp_t *lwp = ttolwp(curthread); 4790 4791 if (lwp != NULL) 4792 lwp->lwp_nostop++; 4793 if (!cv_wait_sig(&l->cv, &l->lock)) { 4794 if (lwp != NULL) 4795 lwp->lwp_nostop--; 4796 l->waiters--; 4797 cv_broadcast(&l->cv); 4798 mutex_exit(&l->lock); 4799 return (EINTR); 4800 } 4801 if (lwp != NULL) 4802 lwp->lwp_nostop--; 4803 } else 4804 cv_wait(&l->cv, &l->lock); 4805 l->waiters--; 4806 } 4807 l->owner = curthread; 4808 l->count--; 4809 } 4810 4811 mutex_exit(&l->lock); 4812 4813 return (0); 4814 } 4815 4816 /* 4817 * If the lock is available, obtain it and return non-zero. If there is 4818 * already a conflicting lock, return 0 immediately. 4819 */ 4820 4821 int 4822 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw) 4823 { 4824 mutex_enter(&l->lock); 4825 4826 /* 4827 * If this is a nested enter, then allow it. There 4828 * must be as many exits as enters through. 4829 */ 4830 if (l->owner == curthread) { 4831 /* lock is held for writing by current thread */ 4832 ASSERT(rw == RW_READER || rw == RW_WRITER); 4833 l->count--; 4834 } else if (rw == RW_READER) { 4835 /* 4836 * If there is a writer active or writers waiting, deny the 4837 * lock. Otherwise, bump the count of readers. 4838 */ 4839 if (l->count < 0 || l->waiters > 0) { 4840 mutex_exit(&l->lock); 4841 return (0); 4842 } 4843 l->count++; 4844 } else { 4845 ASSERT(rw == RW_WRITER); 4846 /* 4847 * If there are readers active or a writer active, deny the 4848 * lock. Otherwise, set the owner field to curthread and 4849 * decrement count to indicate that a writer is active. 4850 */ 4851 if (l->count > 0 || l->owner != NULL) { 4852 mutex_exit(&l->lock); 4853 return (0); 4854 } 4855 l->owner = curthread; 4856 l->count--; 4857 } 4858 4859 mutex_exit(&l->lock); 4860 4861 return (1); 4862 } 4863 4864 void 4865 nfs_rw_exit(nfs_rwlock_t *l) 4866 { 4867 4868 mutex_enter(&l->lock); 4869 /* 4870 * If this is releasing a writer lock, then increment count to 4871 * indicate that there is one less writer active. If this was 4872 * the last of possibly nested writer locks, then clear the owner 4873 * field as well to indicate that there is no writer active 4874 * and wakeup any possible waiting writers or readers. 4875 * 4876 * If releasing a reader lock, then just decrement count to 4877 * indicate that there is one less reader active. If this was 4878 * the last active reader and there are writer(s) waiting, 4879 * then wake up the first. 
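 * (The wakeup below uses cv_broadcast(), so every waiter is woken
 * and they re-arbitrate in nfs_rw_enter_sig(); "the first" is simply
 * whichever waiter wins that race.)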
4880 */ 4881 if (l->owner != NULL) { 4882 ASSERT(l->owner == curthread); 4883 l->count++; 4884 if (l->count == 0) { 4885 l->owner = NULL; 4886 cv_broadcast(&l->cv); 4887 } 4888 } else { 4889 ASSERT(l->count > 0); 4890 l->count--; 4891 if (l->count == 0 && l->waiters > 0) 4892 cv_broadcast(&l->cv); 4893 } 4894 mutex_exit(&l->lock); 4895 } 4896 4897 int 4898 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw) 4899 { 4900 4901 if (rw == RW_READER) 4902 return (l->count > 0); 4903 ASSERT(rw == RW_WRITER); 4904 return (l->count < 0); 4905 } 4906 4907 /* ARGSUSED */ 4908 void 4909 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg) 4910 { 4911 4912 l->count = 0; 4913 l->waiters = 0; 4914 l->owner = NULL; 4915 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL); 4916 cv_init(&l->cv, NULL, CV_DEFAULT, NULL); 4917 } 4918 4919 void 4920 nfs_rw_destroy(nfs_rwlock_t *l) 4921 { 4922 4923 mutex_destroy(&l->lock); 4924 cv_destroy(&l->cv); 4925 } 4926 4927 int 4928 nfs3_rddir_compar(const void *x, const void *y) 4929 { 4930 rddir_cache *a = (rddir_cache *)x; 4931 rddir_cache *b = (rddir_cache *)y; 4932 4933 if (a->nfs3_cookie == b->nfs3_cookie) { 4934 if (a->buflen == b->buflen) 4935 return (0); 4936 if (a->buflen < b->buflen) 4937 return (-1); 4938 return (1); 4939 } 4940 4941 if (a->nfs3_cookie < b->nfs3_cookie) 4942 return (-1); 4943 4944 return (1); 4945 } 4946 4947 int 4948 nfs_rddir_compar(const void *x, const void *y) 4949 { 4950 rddir_cache *a = (rddir_cache *)x; 4951 rddir_cache *b = (rddir_cache *)y; 4952 4953 if (a->nfs_cookie == b->nfs_cookie) { 4954 if (a->buflen == b->buflen) 4955 return (0); 4956 if (a->buflen < b->buflen) 4957 return (-1); 4958 return (1); 4959 } 4960 4961 if (a->nfs_cookie < b->nfs_cookie) 4962 return (-1); 4963 4964 return (1); 4965 } 4966 4967 static char * 4968 nfs_getsrvnames(mntinfo_t *mi, size_t *len) 4969 { 4970 servinfo_t *s; 4971 char *srvnames; 4972 char *namep; 4973 size_t length; 4974 4975 /* 4976 * Calculate the length of the string required to hold all 4977 * of the server names plus either a comma or a null 4978 * character following each individual one. 4979 */ 4980 length = 0; 4981 for (s = mi->mi_servers; s != NULL; s = s->sv_next) 4982 length += s->sv_hostnamelen; 4983 4984 srvnames = kmem_alloc(length, KM_SLEEP); 4985 4986 namep = srvnames; 4987 for (s = mi->mi_servers; s != NULL; s = s->sv_next) { 4988 (void) strcpy(namep, s->sv_hostname); 4989 namep += s->sv_hostnamelen - 1; 4990 *namep++ = ','; 4991 } 4992 *--namep = '\0'; 4993 4994 *len = length; 4995 4996 return (srvnames); 4997 } 4998 4999 /* 5000 * These two functions are temporary and designed for the upgrade-workaround 5001 * only. They cannot be used for general zone-crossing NFS client support, and 5002 * will be removed shortly. 5003 * 5004 * When the workaround is enabled, all NFS traffic is forced into the global 5005 * zone. These functions are called when the code needs to refer to the state 5006 * of the underlying network connection. They're not called when the function 5007 * needs to refer to the state of the process that invoked the system call. 5008 * (E.g., when checking whether the zone is shutting down during the mount() 5009 * call.) 5010 */ 5011 5012 struct zone * 5013 nfs_zone(void) 5014 { 5015 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone); 5016 } 5017 5018 zoneid_t 5019 nfs_zoneid(void) 5020 { 5021 return (nfs_global_client_only != 0 ? 
GLOBAL_ZONEID : getzoneid()); 5022 } 5023 5024 /* 5025 * nfs_mount_label_policy: 5026 * Determine whether the mount is allowed according to MAC check, 5027 * by comparing (where appropriate) label of the remote server 5028 * against the label of the zone being mounted into. 5029 * 5030 * Returns: 5031 * 0 : access allowed 5032 * -1 : read-only access allowed (i.e., read-down) 5033 * >0 : error code, such as EACCES 5034 */ 5035 int 5036 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr, 5037 struct knetconfig *knconf, cred_t *cr) 5038 { 5039 int addr_type; 5040 void *ipaddr; 5041 bslabel_t *server_sl, *mntlabel; 5042 zone_t *mntzone = NULL; 5043 ts_label_t *zlabel; 5044 tsol_tpc_t *tp; 5045 ts_label_t *tsl = NULL; 5046 int retv; 5047 5048 /* 5049 * Get the zone's label. Each zone on a labeled system has a label. 5050 */ 5051 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 5052 zlabel = mntzone->zone_slabel; 5053 ASSERT(zlabel != NULL); 5054 label_hold(zlabel); 5055 5056 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) { 5057 addr_type = IPV4_VERSION; 5058 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr; 5059 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) { 5060 addr_type = IPV6_VERSION; 5061 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr; 5062 } else { 5063 retv = 0; 5064 goto out; 5065 } 5066 5067 retv = EACCES; /* assume the worst */ 5068 5069 /* 5070 * Next, get the assigned label of the remote server. 5071 */ 5072 tp = find_tpc(ipaddr, addr_type, B_FALSE); 5073 if (tp == NULL) 5074 goto out; /* error getting host entry */ 5075 5076 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi) 5077 goto rel_tpc; /* invalid domain */ 5078 if ((tp->tpc_tp.host_type != SUN_CIPSO) && 5079 (tp->tpc_tp.host_type != UNLABELED)) 5080 goto rel_tpc; /* invalid hosttype */ 5081 5082 if (tp->tpc_tp.host_type == SUN_CIPSO) { 5083 tsl = getflabel_cipso(vfsp); 5084 if (tsl == NULL) 5085 goto rel_tpc; /* error getting server lbl */ 5086 5087 server_sl = label2bslabel(tsl); 5088 } else { /* UNLABELED */ 5089 server_sl = &tp->tpc_tp.tp_def_label; 5090 } 5091 5092 mntlabel = label2bslabel(zlabel); 5093 5094 /* 5095 * Now compare labels to complete the MAC check. If the labels 5096 * are equal or if the requestor is in the global zone and has 5097 * NET_MAC_AWARE, then allow read-write access. (Except for 5098 * mounts into the global zone itself; restrict these to 5099 * read-only.) 5100 * 5101 * If the requestor is in some other zone, but his label 5102 * dominates the server, then allow read-down. 5103 * 5104 * Otherwise, access is denied. 
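 * Summarizing the checks below:
 *	labels equal, mount not into the global zone	-> 0	(read-write)
 *	labels equal, mount into the global zone	-> -1	(read-only)
 *	labels unequal, global-zone caller with
 *	    NET_MAC_AWARE				-> -1	(read-only)
 *	labels unequal, zone label dominates server	-> -1	(read-down)
 *	anything else					-> EACCES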
5105 */ 5106 if (blequal(mntlabel, server_sl) || 5107 (crgetzoneid(cr) == GLOBAL_ZONEID && 5108 getpflags(NET_MAC_AWARE, cr) != 0)) { 5109 if ((mntzone == global_zone) || 5110 !blequal(mntlabel, server_sl)) 5111 retv = -1; /* read-only */ 5112 else 5113 retv = 0; /* access OK */ 5114 } else if (bldominates(mntlabel, server_sl)) { 5115 retv = -1; /* read-only */ 5116 } else { 5117 retv = EACCES; 5118 } 5119 5120 if (tsl != NULL) 5121 label_rele(tsl); 5122 5123 rel_tpc: 5124 TPC_RELE(tp); 5125 out: 5126 if (mntzone) 5127 zone_rele(mntzone); 5128 label_rele(zlabel); 5129 return (retv); 5130 } 5131 5132 boolean_t 5133 nfs_has_ctty(void) 5134 { 5135 boolean_t rv; 5136 mutex_enter(&curproc->p_splock); 5137 rv = (curproc->p_sessp->s_vp != NULL); 5138 mutex_exit(&curproc->p_splock); 5139 return (rv); 5140 } 5141 5142 /* 5143 * See if xattr directory to see if it has any generic user attributes 5144 */ 5145 int 5146 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr) 5147 { 5148 struct uio uio; 5149 struct iovec iov; 5150 char *dbuf; 5151 struct dirent64 *dp; 5152 size_t dlen = 8 * 1024; 5153 size_t dbuflen; 5154 int eof = 0; 5155 int error; 5156 5157 *valp = 0; 5158 dbuf = kmem_alloc(dlen, KM_SLEEP); 5159 uio.uio_iov = &iov; 5160 uio.uio_iovcnt = 1; 5161 uio.uio_segflg = UIO_SYSSPACE; 5162 uio.uio_fmode = 0; 5163 uio.uio_extflg = UIO_COPY_CACHED; 5164 uio.uio_loffset = 0; 5165 uio.uio_resid = dlen; 5166 iov.iov_base = dbuf; 5167 iov.iov_len = dlen; 5168 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); 5169 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0); 5170 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); 5171 5172 dbuflen = dlen - uio.uio_resid; 5173 5174 if (error || dbuflen == 0) { 5175 kmem_free(dbuf, dlen); 5176 return (error); 5177 } 5178 5179 dp = (dirent64_t *)dbuf; 5180 5181 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) { 5182 if (strcmp(dp->d_name, ".") == 0 || 5183 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name, 5184 VIEW_READWRITE) == 0 || strcmp(dp->d_name, 5185 VIEW_READONLY) == 0) { 5186 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen); 5187 continue; 5188 } 5189 5190 *valp = 1; 5191 break; 5192 } 5193 kmem_free(dbuf, dlen); 5194 return (0); 5195 } 5196
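/*
 * Usage sketch for do_xattr_exists_check() (illustrative only; it is
 * presumably reached from the _PC_XATTR_EXISTS pathconf handling in
 * the NFS client).  The caller passes in the already-looked-up
 * extended attribute directory vnode:
 *
 *	ulong_t exists;
 *	int error;
 *
 *	error = do_xattr_exists_check(xattr_dvp, &exists, cr);
 *	if (error == 0 && exists != 0) {
 *		file has at least one generic user attribute
 *	}
 */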