1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 28 */ 29 30 #include <sys/param.h> 31 #include <sys/types.h> 32 #include <sys/systm.h> 33 #include <sys/cred.h> 34 #include <sys/proc.h> 35 #include <sys/user.h> 36 #include <sys/time.h> 37 #include <sys/buf.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/socket.h> 41 #include <sys/uio.h> 42 #include <sys/tiuser.h> 43 #include <sys/swap.h> 44 #include <sys/errno.h> 45 #include <sys/debug.h> 46 #include <sys/kmem.h> 47 #include <sys/kstat.h> 48 #include <sys/cmn_err.h> 49 #include <sys/vtrace.h> 50 #include <sys/session.h> 51 #include <sys/dnlc.h> 52 #include <sys/bitmap.h> 53 #include <sys/acl.h> 54 #include <sys/ddi.h> 55 #include <sys/pathname.h> 56 #include <sys/flock.h> 57 #include <sys/dirent.h> 58 #include <sys/flock.h> 59 #include <sys/callb.h> 60 #include <sys/atomic.h> 61 #include <sys/list.h> 62 #include <sys/tsol/tnet.h> 63 #include <sys/priv.h> 64 #include <sys/sdt.h> 65 #include <sys/attr.h> 66 67 #include <inet/ip6.h> 68 69 #include <rpc/types.h> 70 #include <rpc/xdr.h> 71 #include <rpc/auth.h> 72 #include <rpc/clnt.h> 73 74 #include <nfs/nfs.h> 75 #include <nfs/nfs4.h> 76 #include <nfs/nfs_clnt.h> 77 #include <nfs/rnode.h> 78 #include <nfs/nfs_acl.h> 79 80 #include <sys/tsol/label.h> 81 82 /* 83 * The hash queues for the access to active and cached rnodes 84 * are organized as doubly linked lists. A reader/writer lock 85 * for each hash bucket is used to control access and to synchronize 86 * lookups, additions, and deletions from the hash queue. 87 * 88 * The rnode freelist is organized as a doubly linked list with 89 * a head pointer. Additions and deletions are synchronized via 90 * a single mutex. 91 * 92 * In order to add an rnode to the free list, it must be hashed into 93 * a hash queue and the exclusive lock to the hash queue be held. 94 * If an rnode is not hashed into a hash queue, then it is destroyed 95 * because it represents no valuable information that can be reused 96 * about the file. The exclusive lock to the hash queue must be 97 * held in order to prevent a lookup in the hash queue from finding 98 * the rnode and using it and assuming that the rnode is not on the 99 * freelist. The lookup in the hash queue will have the hash queue 100 * locked, either exclusive or shared. 101 * 102 * The vnode reference count for each rnode is not allowed to drop 103 * below 1. 
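 *
 * As an illustrative sketch only (simplified pseudo-code, not the
 * exact code path -- see rfind() and rp_rmfree() later in this
 * file), a lookup that finds a cached rnode deals with the
 * freelist roughly as follows:
 *
 *	rw_enter(&rtable[index].r_lock, RW_READER);	-- hash bucket lock
 *	rp = match the filehandle in the bucket;
 *	mutex_enter(&rpfreelist_lock);			-- freelist lock
 *	if (rp->r_freef != NULL)			-- still on the freelist
 *		rp_rmfree(rp);				-- reuse the freelist's hold
 *	else
 *		VN_HOLD(RTOV(rp));			-- take a fresh hold
 *	mutex_exit(&rpfreelist_lock);
 *	rw_exit(&rtable[index].r_lock);
 *
 * Either way the caller ends up owning a reference on the vnode,
 * and the count never drops below 1.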
 * This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * client side statistics
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system as a
 * whole and don't correspond to any one particular zone.
187 */ 188 #ifdef DEBUG 189 static struct clstat_debug { 190 kstat_named_t nrnode; /* number of allocated rnodes */ 191 kstat_named_t access; /* size of access cache */ 192 kstat_named_t dirent; /* size of readdir cache */ 193 kstat_named_t dirents; /* size of readdir buf cache */ 194 kstat_named_t reclaim; /* number of reclaims */ 195 kstat_named_t clreclaim; /* number of cl reclaims */ 196 kstat_named_t f_reclaim; /* number of free reclaims */ 197 kstat_named_t a_reclaim; /* number of active reclaims */ 198 kstat_named_t r_reclaim; /* number of rnode reclaims */ 199 kstat_named_t rpath; /* bytes used to store rpaths */ 200 } clstat_debug = { 201 { "nrnode", KSTAT_DATA_UINT64 }, 202 { "access", KSTAT_DATA_UINT64 }, 203 { "dirent", KSTAT_DATA_UINT64 }, 204 { "dirents", KSTAT_DATA_UINT64 }, 205 { "reclaim", KSTAT_DATA_UINT64 }, 206 { "clreclaim", KSTAT_DATA_UINT64 }, 207 { "f_reclaim", KSTAT_DATA_UINT64 }, 208 { "a_reclaim", KSTAT_DATA_UINT64 }, 209 { "r_reclaim", KSTAT_DATA_UINT64 }, 210 { "r_path", KSTAT_DATA_UINT64 }, 211 }; 212 #endif /* DEBUG */ 213 214 /* 215 * We keep a global list of per-zone client data, so we can clean up all zones 216 * if we get low on memory. 217 */ 218 static list_t nfs_clnt_list; 219 static kmutex_t nfs_clnt_list_lock; 220 static zone_key_t nfsclnt_zone_key; 221 222 static struct kmem_cache *chtab_cache; 223 224 /* 225 * Some servers do not properly update the attributes of the 226 * directory when changes are made. To allow interoperability 227 * with these broken servers, the nfs_disable_rddir_cache 228 * parameter must be set in /etc/system 229 */ 230 int nfs_disable_rddir_cache = 0; 231 232 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **, 233 struct chtab **); 234 void clfree(CLIENT *, struct chtab *); 235 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 236 struct chtab **, struct nfs_clnt *); 237 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 238 struct chtab **, struct nfs_clnt *); 239 static void clreclaim(void *); 240 static int nfs_feedback(int, int, mntinfo_t *); 241 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 242 caddr_t, cred_t *, int *, enum clnt_stat *, int, 243 failinfo_t *); 244 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 245 caddr_t, cred_t *, int *, int, failinfo_t *); 246 static void rinactive(rnode_t *, cred_t *); 247 static int rtablehash(nfs_fhandle *); 248 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *, 249 struct vnodeops *, 250 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 251 cred_t *), 252 int (*)(const void *, const void *), int *, cred_t *, 253 char *, char *); 254 static void rp_rmfree(rnode_t *); 255 static void rp_addhash(rnode_t *); 256 static void rp_rmhash_locked(rnode_t *); 257 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *); 258 static void destroy_rnode(rnode_t *); 259 static void rddir_cache_free(rddir_cache *); 260 static int nfs_free_data_reclaim(rnode_t *); 261 static int nfs_active_data_reclaim(rnode_t *); 262 static int nfs_free_reclaim(void); 263 static int nfs_active_reclaim(void); 264 static int nfs_rnode_reclaim(void); 265 static void nfs_reclaim(void *); 266 static int failover_safe(failinfo_t *); 267 static void failover_newserver(mntinfo_t *mi); 268 static void failover_thread(mntinfo_t *mi); 269 static int failover_wait(mntinfo_t *); 270 static int failover_remap(failinfo_t *); 271 static int failover_lookup(char *, vnode_t *, 272 int (*)(vnode_t *, 
char *, vnode_t **, 273 struct pathname *, int, vnode_t *, cred_t *, int), 274 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 275 vnode_t **); 276 static void nfs_free_r_path(rnode_t *); 277 static void nfs_set_vroot(vnode_t *); 278 static char *nfs_getsrvnames(mntinfo_t *, size_t *); 279 280 /* 281 * from rpcsec module (common/rpcsec) 282 */ 283 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); 284 extern void sec_clnt_freeh(AUTH *); 285 extern void sec_clnt_freeinfo(struct sec_data *); 286 287 /* 288 * used in mount policy 289 */ 290 extern ts_label_t *getflabel_cipso(vfs_t *); 291 292 /* 293 * EIO or EINTR are not recoverable errors. 294 */ 295 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) 296 297 #ifdef DEBUG 298 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n" 299 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n" 300 #else 301 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n" 302 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n" 303 #endif 304 /* 305 * Common handle get program for NFS, NFS ACL, and NFS AUTH client. 306 */ 307 static int 308 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 309 struct chtab **chp, struct nfs_clnt *nfscl) 310 { 311 struct chhead *ch, *newch; 312 struct chhead **plistp; 313 struct chtab *cp; 314 int error; 315 k_sigset_t smask; 316 317 if (newcl == NULL || chp == NULL || ci == NULL) 318 return (EINVAL); 319 320 *newcl = NULL; 321 *chp = NULL; 322 323 /* 324 * Find an unused handle or create one 325 */ 326 newch = NULL; 327 nfscl->nfscl_stat.clgets.value.ui64++; 328 top: 329 /* 330 * Find the correct entry in the cache to check for free 331 * client handles. The search is based on the RPC program 332 * number, program version number, dev_t for the transport 333 * device, and the protocol family. 334 */ 335 mutex_enter(&nfscl->nfscl_chtable_lock); 336 plistp = &nfscl->nfscl_chtable; 337 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 338 if (ch->ch_prog == ci->cl_prog && 339 ch->ch_vers == ci->cl_vers && 340 ch->ch_dev == svp->sv_knconf->knc_rdev && 341 (strcmp(ch->ch_protofmly, 342 svp->sv_knconf->knc_protofmly) == 0)) 343 break; 344 plistp = &ch->ch_next; 345 } 346 347 /* 348 * If we didn't find a cache entry for this quadruple, then 349 * create one. If we don't have one already preallocated, 350 * then drop the cache lock, create one, and then start over. 351 * If we did have a preallocated entry, then just add it to 352 * the front of the list. 353 */ 354 if (ch == NULL) { 355 if (newch == NULL) { 356 mutex_exit(&nfscl->nfscl_chtable_lock); 357 newch = kmem_alloc(sizeof (*newch), KM_SLEEP); 358 newch->ch_timesused = 0; 359 newch->ch_prog = ci->cl_prog; 360 newch->ch_vers = ci->cl_vers; 361 newch->ch_dev = svp->sv_knconf->knc_rdev; 362 newch->ch_protofmly = kmem_alloc( 363 strlen(svp->sv_knconf->knc_protofmly) + 1, 364 KM_SLEEP); 365 (void) strcpy(newch->ch_protofmly, 366 svp->sv_knconf->knc_protofmly); 367 newch->ch_list = NULL; 368 goto top; 369 } 370 ch = newch; 371 newch = NULL; 372 ch->ch_next = nfscl->nfscl_chtable; 373 nfscl->nfscl_chtable = ch; 374 /* 375 * We found a cache entry, but if it isn't on the front of the 376 * list, then move it to the front of the list to try to take 377 * advantage of locality of operations. 
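 *
 * For example, if the table currently reads A -> B -> C and this
 * call matches C's (program, version, device, protocol family)
 * quadruple, the list is reshuffled to C -> A -> B, so a burst of
 * calls using the same quadruple finds its entry on the first
 * comparison.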
378 */ 379 } else if (ch != nfscl->nfscl_chtable) { 380 *plistp = ch->ch_next; 381 ch->ch_next = nfscl->nfscl_chtable; 382 nfscl->nfscl_chtable = ch; 383 } 384 385 /* 386 * If there was a free client handle cached, then remove it 387 * from the list, init it, and use it. 388 */ 389 if (ch->ch_list != NULL) { 390 cp = ch->ch_list; 391 ch->ch_list = cp->ch_list; 392 mutex_exit(&nfscl->nfscl_chtable_lock); 393 if (newch != NULL) { 394 kmem_free(newch->ch_protofmly, 395 strlen(newch->ch_protofmly) + 1); 396 kmem_free(newch, sizeof (*newch)); 397 } 398 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, 399 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); 400 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 401 &cp->ch_client->cl_auth); 402 if (error || cp->ch_client->cl_auth == NULL) { 403 CLNT_DESTROY(cp->ch_client); 404 kmem_cache_free(chtab_cache, cp); 405 return ((error != 0) ? error : EINTR); 406 } 407 ch->ch_timesused++; 408 *newcl = cp->ch_client; 409 *chp = cp; 410 return (0); 411 } 412 413 /* 414 * There weren't any free client handles which fit, so allocate 415 * a new one and use that. 416 */ 417 #ifdef DEBUG 418 atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64); 419 #endif 420 mutex_exit(&nfscl->nfscl_chtable_lock); 421 422 nfscl->nfscl_stat.cltoomany.value.ui64++; 423 if (newch != NULL) { 424 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); 425 kmem_free(newch, sizeof (*newch)); 426 } 427 428 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); 429 cp->ch_head = ch; 430 431 sigintr(&smask, (int)ci->cl_flags & MI_INT); 432 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, 433 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); 434 sigunintr(&smask); 435 436 if (error != 0) { 437 kmem_cache_free(chtab_cache, cp); 438 #ifdef DEBUG 439 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64); 440 #endif 441 /* 442 * Warning is unnecessary if error is EINTR. 443 */ 444 if (error != EINTR) { 445 nfs_cmn_err(error, CE_WARN, 446 "clget: couldn't create handle: %m\n"); 447 } 448 return (error); 449 } 450 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); 451 auth_destroy(cp->ch_client->cl_auth); 452 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 453 &cp->ch_client->cl_auth); 454 if (error || cp->ch_client->cl_auth == NULL) { 455 CLNT_DESTROY(cp->ch_client); 456 kmem_cache_free(chtab_cache, cp); 457 #ifdef DEBUG 458 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64); 459 #endif 460 return ((error != 0) ? error : EINTR); 461 } 462 ch->ch_timesused++; 463 *newcl = cp->ch_client; 464 ASSERT(cp->ch_client->cl_nosignal == FALSE); 465 *chp = cp; 466 return (0); 467 } 468 469 int 470 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 471 struct chtab **chp) 472 { 473 struct nfs_clnt *nfscl; 474 475 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 476 ASSERT(nfscl != NULL); 477 478 return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); 479 } 480 481 static int 482 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 483 struct chtab **chp, struct nfs_clnt *nfscl) 484 { 485 clinfo_t ci; 486 int error; 487 488 /* 489 * Set read buffer size to rsize 490 * and add room for RPC headers. 491 */ 492 ci.cl_readsize = mi->mi_tsize; 493 if (ci.cl_readsize != 0) 494 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 495 496 /* 497 * If soft mount and server is down just try once. 498 * meaning: do not retransmit. 
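 *
 * That is, once MI_DOWN has been set on a soft mount, each ACL
 * request is transmitted exactly once (cl_retrans is forced to 0
 * below); there is little point in retransmitting to a server that
 * is already known to be unresponsive.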
499 */ 500 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 501 ci.cl_retrans = 0; 502 else 503 ci.cl_retrans = mi->mi_retrans; 504 505 ci.cl_prog = NFS_ACL_PROGRAM; 506 ci.cl_vers = mi->mi_vers; 507 ci.cl_flags = mi->mi_flags; 508 509 /* 510 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 511 * security flavor, the client tries to establish a security context 512 * by contacting the server. If the connection is timed out or reset, 513 * e.g. server reboot, we will try again. 514 */ 515 do { 516 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 517 518 if (error == 0) 519 break; 520 521 /* 522 * For forced unmount or zone shutdown, bail out, no retry. 523 */ 524 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 525 error = EIO; 526 break; 527 } 528 529 /* do not retry for softmount */ 530 if (!(mi->mi_flags & MI_HARD)) 531 break; 532 533 /* let the caller deal with the failover case */ 534 if (FAILOVER_MOUNT(mi)) 535 break; 536 537 } while (error == ETIMEDOUT || error == ECONNRESET); 538 539 return (error); 540 } 541 542 static int 543 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 544 struct chtab **chp, struct nfs_clnt *nfscl) 545 { 546 clinfo_t ci; 547 int error; 548 549 /* 550 * Set read buffer size to rsize 551 * and add room for RPC headers. 552 */ 553 ci.cl_readsize = mi->mi_tsize; 554 if (ci.cl_readsize != 0) 555 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 556 557 /* 558 * If soft mount and server is down just try once. 559 * meaning: do not retransmit. 560 */ 561 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 562 ci.cl_retrans = 0; 563 else 564 ci.cl_retrans = mi->mi_retrans; 565 566 ci.cl_prog = mi->mi_prog; 567 ci.cl_vers = mi->mi_vers; 568 ci.cl_flags = mi->mi_flags; 569 570 /* 571 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 572 * security flavor, the client tries to establish a security context 573 * by contacting the server. If the connection is timed out or reset, 574 * e.g. server reboot, we will try again. 575 */ 576 do { 577 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 578 579 if (error == 0) 580 break; 581 582 /* 583 * For forced unmount or zone shutdown, bail out, no retry. 584 */ 585 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 586 error = EIO; 587 break; 588 } 589 590 /* do not retry for softmount */ 591 if (!(mi->mi_flags & MI_HARD)) 592 break; 593 594 /* let the caller deal with the failover case */ 595 if (FAILOVER_MOUNT(mi)) 596 break; 597 598 } while (error == ETIMEDOUT || error == ECONNRESET); 599 600 return (error); 601 } 602 603 static void 604 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) 605 { 606 if (cl->cl_auth != NULL) { 607 sec_clnt_freeh(cl->cl_auth); 608 cl->cl_auth = NULL; 609 } 610 611 /* 612 * Timestamp this cache entry so that we know when it was last 613 * used. 614 */ 615 cp->ch_freed = gethrestime_sec(); 616 617 /* 618 * Add the free client handle to the front of the list. 619 * This way, the list will be sorted in youngest to oldest 620 * order. 
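 *
 * For example, handles freed at times 100, 90 and 50 sit on the
 * list as 100 -> 90 -> 50.  clreclaim_zone() depends on this
 * ordering: it only needs to walk past the entries that are still
 * too young to free; the first entry that is old enough, and
 * everything behind it, can be clipped off the list in one step.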
621 */ 622 mutex_enter(&nfscl->nfscl_chtable_lock); 623 cp->ch_list = cp->ch_head->ch_list; 624 cp->ch_head->ch_list = cp; 625 mutex_exit(&nfscl->nfscl_chtable_lock); 626 } 627 628 void 629 clfree(CLIENT *cl, struct chtab *cp) 630 { 631 struct nfs_clnt *nfscl; 632 633 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 634 ASSERT(nfscl != NULL); 635 636 clfree_impl(cl, cp, nfscl); 637 } 638 639 #define CL_HOLDTIME 60 /* time to hold client handles */ 640 641 static void 642 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime) 643 { 644 struct chhead *ch; 645 struct chtab *cp; /* list of objects that can be reclaimed */ 646 struct chtab *cpe; 647 struct chtab *cpl; 648 struct chtab **cpp; 649 #ifdef DEBUG 650 int n = 0; 651 #endif 652 653 /* 654 * Need to reclaim some memory, so step through the cache 655 * looking through the lists for entries which can be freed. 656 */ 657 cp = NULL; 658 659 mutex_enter(&nfscl->nfscl_chtable_lock); 660 661 /* 662 * Here we step through each non-NULL quadruple and start to 663 * construct the reclaim list pointed to by cp. Note that 664 * cp will contain all eligible chtab entries. When this traversal 665 * completes, chtab entries from the last quadruple will be at the 666 * front of cp and entries from previously inspected quadruples have 667 * been appended to the rear of cp. 668 */ 669 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 670 if (ch->ch_list == NULL) 671 continue; 672 /* 673 * Search each list for entries older then 674 * cl_holdtime seconds. The lists are maintained 675 * in youngest to oldest order so that when the 676 * first entry is found which is old enough, then 677 * all of the rest of the entries on the list will 678 * be old enough as well. 679 */ 680 cpl = ch->ch_list; 681 cpp = &ch->ch_list; 682 while (cpl != NULL && 683 cpl->ch_freed + cl_holdtime > gethrestime_sec()) { 684 cpp = &cpl->ch_list; 685 cpl = cpl->ch_list; 686 } 687 if (cpl != NULL) { 688 *cpp = NULL; 689 if (cp != NULL) { 690 cpe = cpl; 691 while (cpe->ch_list != NULL) 692 cpe = cpe->ch_list; 693 cpe->ch_list = cp; 694 } 695 cp = cpl; 696 } 697 } 698 699 mutex_exit(&nfscl->nfscl_chtable_lock); 700 701 /* 702 * If cp is empty, then there is nothing to reclaim here. 703 */ 704 if (cp == NULL) 705 return; 706 707 /* 708 * Step through the list of entries to free, destroying each client 709 * handle and kmem_free'ing the memory for each entry. 710 */ 711 while (cp != NULL) { 712 #ifdef DEBUG 713 n++; 714 #endif 715 CLNT_DESTROY(cp->ch_client); 716 cpl = cp->ch_list; 717 kmem_cache_free(chtab_cache, cp); 718 cp = cpl; 719 } 720 721 #ifdef DEBUG 722 /* 723 * Update clalloc so that nfsstat shows the current number 724 * of allocated client handles. 725 */ 726 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n); 727 #endif 728 } 729 730 /* ARGSUSED */ 731 static void 732 clreclaim(void *all) 733 { 734 struct nfs_clnt *nfscl; 735 736 #ifdef DEBUG 737 clstat_debug.clreclaim.value.ui64++; 738 #endif 739 /* 740 * The system is low on memory; go through and try to reclaim some from 741 * every zone on the system. 
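 *
 * clreclaim() is meant to run as the reclaim callback of the client
 * handle cache.  As an illustrative sketch (the real call lives in
 * the client init path and the cache name shown here is only an
 * example):
 *
 *	chtab_cache = kmem_cache_create("client_handle_cache",
 *	    sizeof (struct chtab), 0, NULL, NULL, clreclaim,
 *	    NULL, NULL, 0);
 *
 * so the kernel memory allocator calls it whenever memory runs
 * short.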
742 */ 743 mutex_enter(&nfs_clnt_list_lock); 744 nfscl = list_head(&nfs_clnt_list); 745 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) 746 clreclaim_zone(nfscl, CL_HOLDTIME); 747 mutex_exit(&nfs_clnt_list_lock); 748 } 749 750 /* 751 * Minimum time-out values indexed by call type 752 * These units are in "eights" of a second to avoid multiplies 753 */ 754 static unsigned int minimum_timeo[] = { 755 6, 7, 10 756 }; 757 758 /* 759 * Back off for retransmission timeout, MAXTIMO is in hz of a sec 760 */ 761 #define MAXTIMO (20*hz) 762 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 763 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 764 765 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ 766 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ 767 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ 768 769 /* 770 * Function called when rfscall notices that we have been 771 * re-transmitting, or when we get a response without retransmissions. 772 * Return 1 if the transfer size was adjusted down - 0 if no change. 773 */ 774 static int 775 nfs_feedback(int flag, int which, mntinfo_t *mi) 776 { 777 int kind; 778 int r = 0; 779 780 mutex_enter(&mi->mi_lock); 781 if (flag == FEEDBACK_REXMIT1) { 782 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && 783 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) 784 goto done; 785 if (mi->mi_curread > MIN_NFS_TSIZE) { 786 mi->mi_curread /= 2; 787 if (mi->mi_curread < MIN_NFS_TSIZE) 788 mi->mi_curread = MIN_NFS_TSIZE; 789 r = 1; 790 } 791 792 if (mi->mi_curwrite > MIN_NFS_TSIZE) { 793 mi->mi_curwrite /= 2; 794 if (mi->mi_curwrite < MIN_NFS_TSIZE) 795 mi->mi_curwrite = MIN_NFS_TSIZE; 796 r = 1; 797 } 798 } else if (flag == FEEDBACK_OK) { 799 kind = mi->mi_timer_type[which]; 800 if (kind == 0 || 801 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) 802 goto done; 803 if (kind == 1) { 804 if (mi->mi_curread >= mi->mi_tsize) 805 goto done; 806 mi->mi_curread += MIN_NFS_TSIZE; 807 if (mi->mi_curread > mi->mi_tsize/2) 808 mi->mi_curread = mi->mi_tsize; 809 } else if (kind == 2) { 810 if (mi->mi_curwrite >= mi->mi_stsize) 811 goto done; 812 mi->mi_curwrite += MIN_NFS_TSIZE; 813 if (mi->mi_curwrite > mi->mi_stsize/2) 814 mi->mi_curwrite = mi->mi_stsize; 815 } 816 } 817 done: 818 mutex_exit(&mi->mi_lock); 819 return (r); 820 } 821 822 #ifdef DEBUG 823 static int rfs2call_hits = 0; 824 static int rfs2call_misses = 0; 825 #endif 826 827 int 828 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 829 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 830 enum nfsstat *statusp, int flags, failinfo_t *fi) 831 { 832 int rpcerror; 833 enum clnt_stat rpc_status; 834 835 ASSERT(statusp != NULL); 836 837 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 838 cr, douprintf, &rpc_status, flags, fi); 839 if (!rpcerror) { 840 /* 841 * See crnetadjust() for comments. 
842 */ 843 if (*statusp == NFSERR_ACCES && 844 (cr = crnetadjust(cr)) != NULL) { 845 #ifdef DEBUG 846 rfs2call_hits++; 847 #endif 848 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, 849 resp, cr, douprintf, NULL, flags, fi); 850 crfree(cr); 851 #ifdef DEBUG 852 if (*statusp == NFSERR_ACCES) 853 rfs2call_misses++; 854 #endif 855 } 856 } else if (rpc_status == RPC_PROCUNAVAIL) { 857 *statusp = NFSERR_OPNOTSUPP; 858 rpcerror = 0; 859 } 860 861 return (rpcerror); 862 } 863 864 #define NFS3_JUKEBOX_DELAY 10 * hz 865 866 static clock_t nfs3_jukebox_delay = 0; 867 868 #ifdef DEBUG 869 static int rfs3call_hits = 0; 870 static int rfs3call_misses = 0; 871 #endif 872 873 int 874 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 875 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 876 nfsstat3 *statusp, int flags, failinfo_t *fi) 877 { 878 int rpcerror; 879 int user_informed; 880 881 user_informed = 0; 882 do { 883 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 884 cr, douprintf, NULL, flags, fi); 885 if (!rpcerror) { 886 cred_t *crr; 887 if (*statusp == NFS3ERR_JUKEBOX) { 888 if (ttoproc(curthread) == &p0) { 889 rpcerror = EAGAIN; 890 break; 891 } 892 if (!user_informed) { 893 user_informed = 1; 894 uprintf( 895 "file temporarily unavailable on the server, retrying...\n"); 896 } 897 delay(nfs3_jukebox_delay); 898 } 899 /* 900 * See crnetadjust() for comments. 901 */ 902 else if (*statusp == NFS3ERR_ACCES && 903 (crr = crnetadjust(cr)) != NULL) { 904 #ifdef DEBUG 905 rfs3call_hits++; 906 #endif 907 rpcerror = rfscall(mi, which, xdrargs, argsp, 908 xdrres, resp, crr, douprintf, 909 NULL, flags, fi); 910 911 crfree(crr); 912 #ifdef DEBUG 913 if (*statusp == NFS3ERR_ACCES) 914 rfs3call_misses++; 915 #endif 916 } 917 } 918 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 919 920 return (rpcerror); 921 } 922 923 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 924 #define INC_READERS(mi) { \ 925 mi->mi_readers++; \ 926 } 927 #define DEC_READERS(mi) { \ 928 mi->mi_readers--; \ 929 if (mi->mi_readers == 0) \ 930 cv_broadcast(&mi->mi_failover_cv); \ 931 } 932 933 static int 934 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 935 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 936 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 937 { 938 CLIENT *client; 939 struct chtab *ch; 940 cred_t *cr = icr; 941 enum clnt_stat status; 942 struct rpc_err rpcerr, rpcerr_tmp; 943 struct timeval wait; 944 int timeo; /* in units of hz */ 945 int my_rsize, my_wsize; 946 bool_t tryagain; 947 bool_t cred_cloned = FALSE; 948 k_sigset_t smask; 949 servinfo_t *svp; 950 struct nfs_clnt *nfscl; 951 zoneid_t zoneid = getzoneid(); 952 char *msg; 953 #ifdef DEBUG 954 char *bufp; 955 #endif 956 957 958 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 959 "rfscall_start:which %d mi %p", which, mi); 960 961 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 962 ASSERT(nfscl != NULL); 963 964 nfscl->nfscl_stat.calls.value.ui64++; 965 mi->mi_reqs[which].value.ui64++; 966 967 rpcerr.re_status = RPC_SUCCESS; 968 969 /* 970 * In case of forced unmount or zone shutdown, return EIO. 971 */ 972 973 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 974 rpcerr.re_status = RPC_FAILED; 975 rpcerr.re_errno = EIO; 976 return (rpcerr.re_errno); 977 } 978 979 /* 980 * Remember the transfer sizes in case 981 * nfs_feedback changes them underneath us. 
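 *
 * Purely as an illustration of how the dynamic sizing in
 * nfs_feedback() behaves: with mi_curread at 32768, each
 * FEEDBACK_REXMIT1 event caused by retransmissions halves it --
 * 16384, 8192, ... -- but never below MIN_NFS_TSIZE (512).  Once
 * responses come back quickly again, FEEDBACK_OK grows it back
 * MIN_NFS_TSIZE bytes at a time toward the mount's maximum.  If
 * the sizes sampled here have changed by the time the call is
 * retried, rfscall() bails out with ENFS_TRYAGAIN so the vnode
 * layer can redo the I/O with the new transfer size.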
982 */ 983 my_rsize = mi->mi_curread; 984 my_wsize = mi->mi_curwrite; 985 986 /* 987 * NFS client failover support 988 * 989 * If this rnode is not in sync with the current server (VALID_FH), 990 * we'd like to do a remap to get in sync. We can be interrupted 991 * in failover_remap(), and if so we'll bail. Otherwise, we'll 992 * use the best info we have to try the RPC. Part of that is 993 * unconditionally updating the filehandle copy kept for V3. 994 * 995 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 996 * rw_enter(); we're trying to keep the current server from being 997 * changed on us until we're done with the remapping and have a 998 * matching client handle. We don't want to sending a filehandle 999 * to the wrong host. 1000 */ 1001 failoverretry: 1002 if (FAILOVER_MOUNT(mi)) { 1003 mutex_enter(&mi->mi_lock); 1004 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1005 if (failover_wait(mi)) { 1006 mutex_exit(&mi->mi_lock); 1007 return (EINTR); 1008 } 1009 } 1010 INC_READERS(mi); 1011 mutex_exit(&mi->mi_lock); 1012 if (fi) { 1013 if (!VALID_FH(fi) && 1014 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1015 int remaperr; 1016 1017 svp = mi->mi_curr_serv; 1018 remaperr = failover_remap(fi); 1019 if (remaperr != 0) { 1020 #ifdef DEBUG 1021 if (remaperr != EINTR) 1022 nfs_cmn_err(remaperr, CE_WARN, 1023 "rfscall couldn't failover: %m"); 1024 #endif 1025 mutex_enter(&mi->mi_lock); 1026 DEC_READERS(mi); 1027 mutex_exit(&mi->mi_lock); 1028 /* 1029 * If failover_remap returns ETIMEDOUT 1030 * and the filesystem is hard mounted 1031 * we have to retry the call with a new 1032 * server. 1033 */ 1034 if ((mi->mi_flags & MI_HARD) && 1035 IS_RECOVERABLE_ERROR(remaperr)) { 1036 if (svp == mi->mi_curr_serv) 1037 failover_newserver(mi); 1038 rpcerr.re_status = RPC_SUCCESS; 1039 goto failoverretry; 1040 } 1041 rpcerr.re_errno = remaperr; 1042 return (remaperr); 1043 } 1044 } 1045 if (fi->fhp && fi->copyproc) 1046 (*fi->copyproc)(fi->fhp, fi->vp); 1047 } 1048 } 1049 1050 /* For TSOL, use a new cred which has net_mac_aware flag */ 1051 if (!cred_cloned && is_system_labeled()) { 1052 cred_cloned = TRUE; 1053 cr = crdup(icr); 1054 (void) setpflags(NET_MAC_AWARE, 1, cr); 1055 } 1056 1057 /* 1058 * clget() calls clnt_tli_kinit() which clears the xid, so we 1059 * are guaranteed to reprocess the retry as a new request. 1060 */ 1061 svp = mi->mi_curr_serv; 1062 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); 1063 1064 if (FAILOVER_MOUNT(mi)) { 1065 mutex_enter(&mi->mi_lock); 1066 DEC_READERS(mi); 1067 mutex_exit(&mi->mi_lock); 1068 1069 if ((rpcerr.re_errno == ETIMEDOUT || 1070 rpcerr.re_errno == ECONNRESET) && 1071 failover_safe(fi)) { 1072 if (svp == mi->mi_curr_serv) 1073 failover_newserver(mi); 1074 goto failoverretry; 1075 } 1076 } 1077 if (rpcerr.re_errno != 0) 1078 return (rpcerr.re_errno); 1079 1080 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1081 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1082 timeo = (mi->mi_timeo * hz) / 10; 1083 } else { 1084 mutex_enter(&mi->mi_lock); 1085 timeo = CLNT_SETTIMERS(client, 1086 &(mi->mi_timers[mi->mi_timer_type[which]]), 1087 &(mi->mi_timers[NFS_CALLTYPES]), 1088 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, 1089 (void (*)())NULL, (caddr_t)mi, 0); 1090 mutex_exit(&mi->mi_lock); 1091 } 1092 1093 /* 1094 * If hard mounted fs, retry call forever unless hard error occurs. 
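 *
 * The wait between attempts is driven by the backoff()/dobackoff()
 * macros defined earlier: each time the server fails to respond,
 * timeo doubles until it reaches MAXTIMO (20 seconds worth of
 * clock ticks).  Purely as an illustration, a call that starts
 * with a 1 second timeout waits 1, 2, 4, 8, 16, 20, 20, ...
 * seconds, and keeps going until the server answers, the zone
 * shuts down, or an unrecoverable RPC error ends the loop.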
1095 */ 1096 do { 1097 tryagain = FALSE; 1098 1099 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1100 status = RPC_FAILED; 1101 rpcerr.re_status = RPC_FAILED; 1102 rpcerr.re_errno = EIO; 1103 break; 1104 } 1105 1106 TICK_TO_TIMEVAL(timeo, &wait); 1107 1108 /* 1109 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1110 * and SIGTERM. (Preserving the existing masks). 1111 * Mask out SIGINT if mount option nointr is specified. 1112 */ 1113 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1114 if (!(mi->mi_flags & MI_INT)) 1115 client->cl_nosignal = TRUE; 1116 1117 /* 1118 * If there is a current signal, then don't bother 1119 * even trying to send out the request because we 1120 * won't be able to block waiting for the response. 1121 * Simply assume RPC_INTR and get on with it. 1122 */ 1123 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1124 status = RPC_INTR; 1125 else { 1126 status = CLNT_CALL(client, which, xdrargs, argsp, 1127 xdrres, resp, wait); 1128 } 1129 1130 if (!(mi->mi_flags & MI_INT)) 1131 client->cl_nosignal = FALSE; 1132 /* 1133 * restore original signal mask 1134 */ 1135 sigunintr(&smask); 1136 1137 switch (status) { 1138 case RPC_SUCCESS: 1139 if ((mi->mi_flags & MI_DYNAMIC) && 1140 mi->mi_timer_type[which] != 0 && 1141 (mi->mi_curread != my_rsize || 1142 mi->mi_curwrite != my_wsize)) 1143 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1144 break; 1145 1146 case RPC_INTR: 1147 /* 1148 * There is no way to recover from this error, 1149 * even if mount option nointr is specified. 1150 * SIGKILL, for example, cannot be blocked. 1151 */ 1152 rpcerr.re_status = RPC_INTR; 1153 rpcerr.re_errno = EINTR; 1154 break; 1155 1156 case RPC_UDERROR: 1157 /* 1158 * If the NFS server is local (vold) and 1159 * it goes away then we get RPC_UDERROR. 1160 * This is a retryable error, so we would 1161 * loop, so check to see if the specific 1162 * error was ECONNRESET, indicating that 1163 * target did not exist at all. If so, 1164 * return with RPC_PROGUNAVAIL and 1165 * ECONNRESET to indicate why. 1166 */ 1167 CLNT_GETERR(client, &rpcerr); 1168 if (rpcerr.re_errno == ECONNRESET) { 1169 rpcerr.re_status = RPC_PROGUNAVAIL; 1170 rpcerr.re_errno = ECONNRESET; 1171 break; 1172 } 1173 /*FALLTHROUGH*/ 1174 1175 default: /* probably RPC_TIMEDOUT */ 1176 if (IS_UNRECOVERABLE_RPC(status)) 1177 break; 1178 1179 /* 1180 * increment server not responding count 1181 */ 1182 mutex_enter(&mi->mi_lock); 1183 mi->mi_noresponse++; 1184 mutex_exit(&mi->mi_lock); 1185 #ifdef DEBUG 1186 nfscl->nfscl_stat.noresponse.value.ui64++; 1187 #endif 1188 1189 if (!(mi->mi_flags & MI_HARD)) { 1190 if (!(mi->mi_flags & MI_SEMISOFT) || 1191 (mi->mi_ss_call_type[which] == 0)) 1192 break; 1193 } 1194 1195 /* 1196 * The call is in progress (over COTS). 1197 * Try the CLNT_CALL again, but don't 1198 * print a noisy error message. 1199 */ 1200 if (status == RPC_INPROGRESS) { 1201 tryagain = TRUE; 1202 break; 1203 } 1204 1205 if (flags & RFSCALL_SOFT) 1206 break; 1207 1208 /* 1209 * On zone shutdown, just move on. 1210 */ 1211 if (zone_status_get(curproc->p_zone) >= 1212 ZONE_IS_SHUTTING_DOWN) { 1213 rpcerr.re_status = RPC_FAILED; 1214 rpcerr.re_errno = EIO; 1215 break; 1216 } 1217 1218 /* 1219 * NFS client failover support 1220 * 1221 * If the current server just failed us, we'll 1222 * start the process of finding a new server. 1223 * After that, we can just retry. 
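 *
 * The svp == mi->mi_curr_serv test below is what keeps several
 * threads that hit the same failure from each forcing a server
 * switch: only a thread still holding the old servinfo pointer
 * calls failover_newserver(); the rest find that mi_curr_serv has
 * already moved on and simply retry against the new server.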
1224 */ 1225 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1226 if (svp == mi->mi_curr_serv) 1227 failover_newserver(mi); 1228 clfree_impl(client, ch, nfscl); 1229 goto failoverretry; 1230 } 1231 1232 tryagain = TRUE; 1233 timeo = backoff(timeo); 1234 1235 CLNT_GETERR(client, &rpcerr_tmp); 1236 if ((status == RPC_CANTSEND) && 1237 (rpcerr_tmp.re_errno == ENOBUFS)) 1238 msg = SRV_QFULL_MSG; 1239 else 1240 msg = SRV_NOTRESP_MSG; 1241 1242 mutex_enter(&mi->mi_lock); 1243 if (!(mi->mi_flags & MI_PRINTED)) { 1244 mi->mi_flags |= MI_PRINTED; 1245 mutex_exit(&mi->mi_lock); 1246 #ifdef DEBUG 1247 zprintf(zoneid, msg, mi->mi_vers, 1248 svp->sv_hostname); 1249 #else 1250 zprintf(zoneid, msg, svp->sv_hostname); 1251 #endif 1252 } else 1253 mutex_exit(&mi->mi_lock); 1254 if (*douprintf && nfs_has_ctty()) { 1255 *douprintf = 0; 1256 if (!(mi->mi_flags & MI_NOPRINT)) 1257 #ifdef DEBUG 1258 uprintf(msg, mi->mi_vers, 1259 svp->sv_hostname); 1260 #else 1261 uprintf(msg, svp->sv_hostname); 1262 #endif 1263 } 1264 1265 /* 1266 * If doing dynamic adjustment of transfer 1267 * size and if it's a read or write call 1268 * and if the transfer size changed while 1269 * retransmitting or if the feedback routine 1270 * changed the transfer size, 1271 * then exit rfscall so that the transfer 1272 * size can be adjusted at the vnops level. 1273 */ 1274 if ((mi->mi_flags & MI_DYNAMIC) && 1275 mi->mi_timer_type[which] != 0 && 1276 (mi->mi_curread != my_rsize || 1277 mi->mi_curwrite != my_wsize || 1278 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1279 /* 1280 * On read or write calls, return 1281 * back to the vnode ops level if 1282 * the transfer size changed. 1283 */ 1284 clfree_impl(client, ch, nfscl); 1285 if (cred_cloned) 1286 crfree(cr); 1287 return (ENFS_TRYAGAIN); 1288 } 1289 } 1290 } while (tryagain); 1291 1292 if (status != RPC_SUCCESS) { 1293 /* 1294 * Let soft mounts use the timed out message. 1295 */ 1296 if (status == RPC_INPROGRESS) 1297 status = RPC_TIMEDOUT; 1298 nfscl->nfscl_stat.badcalls.value.ui64++; 1299 if (status != RPC_INTR) { 1300 mutex_enter(&mi->mi_lock); 1301 mi->mi_flags |= MI_DOWN; 1302 mutex_exit(&mi->mi_lock); 1303 CLNT_GETERR(client, &rpcerr); 1304 #ifdef DEBUG 1305 bufp = clnt_sperror(client, svp->sv_hostname); 1306 zprintf(zoneid, "NFS%d %s failed for %s\n", 1307 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1308 if (nfs_has_ctty()) { 1309 if (!(mi->mi_flags & MI_NOPRINT)) { 1310 uprintf("NFS%d %s failed for %s\n", 1311 mi->mi_vers, mi->mi_rfsnames[which], 1312 bufp); 1313 } 1314 } 1315 kmem_free(bufp, MAXPATHLEN); 1316 #else 1317 zprintf(zoneid, 1318 "NFS %s failed for server %s: error %d (%s)\n", 1319 mi->mi_rfsnames[which], svp->sv_hostname, 1320 status, clnt_sperrno(status)); 1321 if (nfs_has_ctty()) { 1322 if (!(mi->mi_flags & MI_NOPRINT)) { 1323 uprintf( 1324 "NFS %s failed for server %s: error %d (%s)\n", 1325 mi->mi_rfsnames[which], 1326 svp->sv_hostname, status, 1327 clnt_sperrno(status)); 1328 } 1329 } 1330 #endif 1331 /* 1332 * when CLNT_CALL() fails with RPC_AUTHERROR, 1333 * re_errno is set appropriately depending on 1334 * the authentication error 1335 */ 1336 if (status == RPC_VERSMISMATCH || 1337 status == RPC_PROGVERSMISMATCH) 1338 rpcerr.re_errno = EIO; 1339 } 1340 } else { 1341 /* 1342 * Test the value of mi_down and mi_printed without 1343 * holding the mi_lock mutex. If they are both zero, 1344 * then it is okay to skip the down and printed 1345 * processing. This saves on a mutex_enter and 1346 * mutex_exit pair for a normal, successful RPC. 
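 *
 * The unlocked peek is safe because mi_flags is only ever modified
 * with mi_lock held, so a stale read here can at worst postpone
 * clearing MI_DOWN or printing the "server ok" message until the
 * next successful RPC; it cannot corrupt the flag word.  Without
 * the test, every successful call would take and drop mi_lock just
 * to discover that both flags were already clear.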
1347 * This was just complete overhead. 1348 */ 1349 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1350 mutex_enter(&mi->mi_lock); 1351 mi->mi_flags &= ~MI_DOWN; 1352 if (mi->mi_flags & MI_PRINTED) { 1353 mi->mi_flags &= ~MI_PRINTED; 1354 mutex_exit(&mi->mi_lock); 1355 #ifdef DEBUG 1356 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1357 zprintf(zoneid, "NFS%d server %s ok\n", 1358 mi->mi_vers, svp->sv_hostname); 1359 #else 1360 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1361 zprintf(zoneid, "NFS server %s ok\n", 1362 svp->sv_hostname); 1363 #endif 1364 } else 1365 mutex_exit(&mi->mi_lock); 1366 } 1367 1368 if (*douprintf == 0) { 1369 if (!(mi->mi_flags & MI_NOPRINT)) 1370 #ifdef DEBUG 1371 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1372 uprintf("NFS%d server %s ok\n", 1373 mi->mi_vers, svp->sv_hostname); 1374 #else 1375 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1376 uprintf("NFS server %s ok\n", svp->sv_hostname); 1377 #endif 1378 *douprintf = 1; 1379 } 1380 } 1381 1382 clfree_impl(client, ch, nfscl); 1383 if (cred_cloned) 1384 crfree(cr); 1385 1386 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1387 1388 if (rpc_status != NULL) 1389 *rpc_status = rpcerr.re_status; 1390 1391 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1392 rpcerr.re_errno); 1393 1394 return (rpcerr.re_errno); 1395 } 1396 1397 #ifdef DEBUG 1398 static int acl2call_hits = 0; 1399 static int acl2call_misses = 0; 1400 #endif 1401 1402 int 1403 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1404 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1405 enum nfsstat *statusp, int flags, failinfo_t *fi) 1406 { 1407 int rpcerror; 1408 1409 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1410 cr, douprintf, flags, fi); 1411 if (!rpcerror) { 1412 /* 1413 * See comments with crnetadjust(). 1414 */ 1415 if (*statusp == NFSERR_ACCES && 1416 (cr = crnetadjust(cr)) != NULL) { 1417 #ifdef DEBUG 1418 acl2call_hits++; 1419 #endif 1420 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1421 resp, cr, douprintf, flags, fi); 1422 crfree(cr); 1423 #ifdef DEBUG 1424 if (*statusp == NFSERR_ACCES) 1425 acl2call_misses++; 1426 #endif 1427 } 1428 } 1429 1430 return (rpcerror); 1431 } 1432 1433 #ifdef DEBUG 1434 static int acl3call_hits = 0; 1435 static int acl3call_misses = 0; 1436 #endif 1437 1438 int 1439 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1440 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1441 nfsstat3 *statusp, int flags, failinfo_t *fi) 1442 { 1443 int rpcerror; 1444 int user_informed; 1445 1446 user_informed = 0; 1447 1448 do { 1449 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1450 cr, douprintf, flags, fi); 1451 if (!rpcerror) { 1452 cred_t *crr; 1453 if (*statusp == NFS3ERR_JUKEBOX) { 1454 if (!user_informed) { 1455 user_informed = 1; 1456 uprintf( 1457 "file temporarily unavailable on the server, retrying...\n"); 1458 } 1459 delay(nfs3_jukebox_delay); 1460 } 1461 /* 1462 * See crnetadjust() for comments. 
1463 */ 1464 else if (*statusp == NFS3ERR_ACCES && 1465 (crr = crnetadjust(cr)) != NULL) { 1466 #ifdef DEBUG 1467 acl3call_hits++; 1468 #endif 1469 rpcerror = aclcall(mi, which, xdrargs, argsp, 1470 xdrres, resp, crr, douprintf, flags, fi); 1471 1472 crfree(crr); 1473 #ifdef DEBUG 1474 if (*statusp == NFS3ERR_ACCES) 1475 acl3call_misses++; 1476 #endif 1477 } 1478 } 1479 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 1480 1481 return (rpcerror); 1482 } 1483 1484 static int 1485 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1486 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 1487 int flags, failinfo_t *fi) 1488 { 1489 CLIENT *client; 1490 struct chtab *ch; 1491 cred_t *cr = icr; 1492 bool_t cred_cloned = FALSE; 1493 enum clnt_stat status; 1494 struct rpc_err rpcerr; 1495 struct timeval wait; 1496 int timeo; /* in units of hz */ 1497 #if 0 /* notyet */ 1498 int my_rsize, my_wsize; 1499 #endif 1500 bool_t tryagain; 1501 k_sigset_t smask; 1502 servinfo_t *svp; 1503 struct nfs_clnt *nfscl; 1504 zoneid_t zoneid = getzoneid(); 1505 #ifdef DEBUG 1506 char *bufp; 1507 #endif 1508 1509 #if 0 /* notyet */ 1510 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 1511 "rfscall_start:which %d mi %p", which, mi); 1512 #endif 1513 1514 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 1515 ASSERT(nfscl != NULL); 1516 1517 nfscl->nfscl_stat.calls.value.ui64++; 1518 mi->mi_aclreqs[which].value.ui64++; 1519 1520 rpcerr.re_status = RPC_SUCCESS; 1521 1522 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1523 rpcerr.re_status = RPC_FAILED; 1524 rpcerr.re_errno = EIO; 1525 return (rpcerr.re_errno); 1526 } 1527 1528 #if 0 /* notyet */ 1529 /* 1530 * Remember the transfer sizes in case 1531 * nfs_feedback changes them underneath us. 1532 */ 1533 my_rsize = mi->mi_curread; 1534 my_wsize = mi->mi_curwrite; 1535 #endif 1536 1537 /* 1538 * NFS client failover support 1539 * 1540 * If this rnode is not in sync with the current server (VALID_FH), 1541 * we'd like to do a remap to get in sync. We can be interrupted 1542 * in failover_remap(), and if so we'll bail. Otherwise, we'll 1543 * use the best info we have to try the RPC. Part of that is 1544 * unconditionally updating the filehandle copy kept for V3. 1545 * 1546 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 1547 * rw_enter(); we're trying to keep the current server from being 1548 * changed on us until we're done with the remapping and have a 1549 * matching client handle. We don't want to sending a filehandle 1550 * to the wrong host. 1551 */ 1552 failoverretry: 1553 if (FAILOVER_MOUNT(mi)) { 1554 mutex_enter(&mi->mi_lock); 1555 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1556 if (failover_wait(mi)) { 1557 mutex_exit(&mi->mi_lock); 1558 return (EINTR); 1559 } 1560 } 1561 INC_READERS(mi); 1562 mutex_exit(&mi->mi_lock); 1563 if (fi) { 1564 if (!VALID_FH(fi) && 1565 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1566 int remaperr; 1567 1568 svp = mi->mi_curr_serv; 1569 remaperr = failover_remap(fi); 1570 if (remaperr != 0) { 1571 #ifdef DEBUG 1572 if (remaperr != EINTR) 1573 nfs_cmn_err(remaperr, CE_WARN, 1574 "aclcall couldn't failover: %m"); 1575 #endif 1576 mutex_enter(&mi->mi_lock); 1577 DEC_READERS(mi); 1578 mutex_exit(&mi->mi_lock); 1579 1580 /* 1581 * If failover_remap returns ETIMEDOUT 1582 * and the filesystem is hard mounted 1583 * we have to retry the call with a new 1584 * server. 
1585 */ 1586 if ((mi->mi_flags & MI_HARD) && 1587 IS_RECOVERABLE_ERROR(remaperr)) { 1588 if (svp == mi->mi_curr_serv) 1589 failover_newserver(mi); 1590 rpcerr.re_status = RPC_SUCCESS; 1591 goto failoverretry; 1592 } 1593 return (remaperr); 1594 } 1595 } 1596 if (fi->fhp && fi->copyproc) 1597 (*fi->copyproc)(fi->fhp, fi->vp); 1598 } 1599 } 1600 1601 /* For TSOL, use a new cred which has net_mac_aware flag */ 1602 if (!cred_cloned && is_system_labeled()) { 1603 cred_cloned = TRUE; 1604 cr = crdup(icr); 1605 (void) setpflags(NET_MAC_AWARE, 1, cr); 1606 } 1607 1608 /* 1609 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1610 * are guaranteed to reprocess the retry as a new request. 1611 */ 1612 svp = mi->mi_curr_serv; 1613 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1614 if (FAILOVER_MOUNT(mi)) { 1615 mutex_enter(&mi->mi_lock); 1616 DEC_READERS(mi); 1617 mutex_exit(&mi->mi_lock); 1618 1619 if ((rpcerr.re_errno == ETIMEDOUT || 1620 rpcerr.re_errno == ECONNRESET) && 1621 failover_safe(fi)) { 1622 if (svp == mi->mi_curr_serv) 1623 failover_newserver(mi); 1624 goto failoverretry; 1625 } 1626 } 1627 if (rpcerr.re_errno != 0) { 1628 if (cred_cloned) 1629 crfree(cr); 1630 return (rpcerr.re_errno); 1631 } 1632 1633 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1634 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1635 timeo = (mi->mi_timeo * hz) / 10; 1636 } else { 1637 mutex_enter(&mi->mi_lock); 1638 timeo = CLNT_SETTIMERS(client, 1639 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1640 &(mi->mi_timers[NFS_CALLTYPES]), 1641 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1642 (void (*)()) 0, (caddr_t)mi, 0); 1643 mutex_exit(&mi->mi_lock); 1644 } 1645 1646 /* 1647 * If hard mounted fs, retry call forever unless hard error occurs. 1648 */ 1649 do { 1650 tryagain = FALSE; 1651 1652 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1653 status = RPC_FAILED; 1654 rpcerr.re_status = RPC_FAILED; 1655 rpcerr.re_errno = EIO; 1656 break; 1657 } 1658 1659 TICK_TO_TIMEVAL(timeo, &wait); 1660 1661 /* 1662 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1663 * and SIGTERM. (Preserving the existing masks). 1664 * Mask out SIGINT if mount option nointr is specified. 1665 */ 1666 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1667 if (!(mi->mi_flags & MI_INT)) 1668 client->cl_nosignal = TRUE; 1669 1670 /* 1671 * If there is a current signal, then don't bother 1672 * even trying to send out the request because we 1673 * won't be able to block waiting for the response. 1674 * Simply assume RPC_INTR and get on with it. 1675 */ 1676 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1677 status = RPC_INTR; 1678 else { 1679 status = CLNT_CALL(client, which, xdrargs, argsp, 1680 xdrres, resp, wait); 1681 } 1682 1683 if (!(mi->mi_flags & MI_INT)) 1684 client->cl_nosignal = FALSE; 1685 /* 1686 * restore original signal mask 1687 */ 1688 sigunintr(&smask); 1689 1690 switch (status) { 1691 case RPC_SUCCESS: 1692 #if 0 /* notyet */ 1693 if ((mi->mi_flags & MI_DYNAMIC) && 1694 mi->mi_timer_type[which] != 0 && 1695 (mi->mi_curread != my_rsize || 1696 mi->mi_curwrite != my_wsize)) 1697 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1698 #endif 1699 break; 1700 1701 /* 1702 * Unfortunately, there are servers in the world which 1703 * are not coded correctly. They are not prepared to 1704 * handle RPC requests to the NFS port which are not 1705 * NFS requests. Thus, they may try to process the 1706 * NFS_ACL request as if it were an NFS request. This 1707 * does not work. 
Generally, an error will be generated 1708 * on the client because it will not be able to decode 1709 * the response from the server. However, it seems 1710 * possible that the server may not be able to decode 1711 * the arguments. Thus, the criteria for deciding 1712 * whether the server supports NFS_ACL or not is whether 1713 * the following RPC errors are returned from CLNT_CALL. 1714 */ 1715 case RPC_CANTDECODERES: 1716 case RPC_PROGUNAVAIL: 1717 case RPC_CANTDECODEARGS: 1718 case RPC_PROGVERSMISMATCH: 1719 mutex_enter(&mi->mi_lock); 1720 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1721 mutex_exit(&mi->mi_lock); 1722 break; 1723 1724 /* 1725 * If the server supports NFS_ACL but not the new ops 1726 * for extended attributes, make sure we don't retry. 1727 */ 1728 case RPC_PROCUNAVAIL: 1729 mutex_enter(&mi->mi_lock); 1730 mi->mi_flags &= ~MI_EXTATTR; 1731 mutex_exit(&mi->mi_lock); 1732 break; 1733 1734 case RPC_INTR: 1735 /* 1736 * There is no way to recover from this error, 1737 * even if mount option nointr is specified. 1738 * SIGKILL, for example, cannot be blocked. 1739 */ 1740 rpcerr.re_status = RPC_INTR; 1741 rpcerr.re_errno = EINTR; 1742 break; 1743 1744 case RPC_UDERROR: 1745 /* 1746 * If the NFS server is local (vold) and 1747 * it goes away then we get RPC_UDERROR. 1748 * This is a retryable error, so we would 1749 * loop, so check to see if the specific 1750 * error was ECONNRESET, indicating that 1751 * target did not exist at all. If so, 1752 * return with RPC_PROGUNAVAIL and 1753 * ECONNRESET to indicate why. 1754 */ 1755 CLNT_GETERR(client, &rpcerr); 1756 if (rpcerr.re_errno == ECONNRESET) { 1757 rpcerr.re_status = RPC_PROGUNAVAIL; 1758 rpcerr.re_errno = ECONNRESET; 1759 break; 1760 } 1761 /*FALLTHROUGH*/ 1762 1763 default: /* probably RPC_TIMEDOUT */ 1764 if (IS_UNRECOVERABLE_RPC(status)) 1765 break; 1766 1767 /* 1768 * increment server not responding count 1769 */ 1770 mutex_enter(&mi->mi_lock); 1771 mi->mi_noresponse++; 1772 mutex_exit(&mi->mi_lock); 1773 #ifdef DEBUG 1774 nfscl->nfscl_stat.noresponse.value.ui64++; 1775 #endif 1776 1777 if (!(mi->mi_flags & MI_HARD)) { 1778 if (!(mi->mi_flags & MI_SEMISOFT) || 1779 (mi->mi_acl_ss_call_type[which] == 0)) 1780 break; 1781 } 1782 1783 /* 1784 * The call is in progress (over COTS). 1785 * Try the CLNT_CALL again, but don't 1786 * print a noisy error message. 1787 */ 1788 if (status == RPC_INPROGRESS) { 1789 tryagain = TRUE; 1790 break; 1791 } 1792 1793 if (flags & RFSCALL_SOFT) 1794 break; 1795 1796 /* 1797 * On zone shutdown, just move on. 1798 */ 1799 if (zone_status_get(curproc->p_zone) >= 1800 ZONE_IS_SHUTTING_DOWN) { 1801 rpcerr.re_status = RPC_FAILED; 1802 rpcerr.re_errno = EIO; 1803 break; 1804 } 1805 1806 /* 1807 * NFS client failover support 1808 * 1809 * If the current server just failed us, we'll 1810 * start the process of finding a new server. 1811 * After that, we can just retry. 
1812 */ 1813 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1814 if (svp == mi->mi_curr_serv) 1815 failover_newserver(mi); 1816 clfree_impl(client, ch, nfscl); 1817 goto failoverretry; 1818 } 1819 1820 tryagain = TRUE; 1821 timeo = backoff(timeo); 1822 mutex_enter(&mi->mi_lock); 1823 if (!(mi->mi_flags & MI_PRINTED)) { 1824 mi->mi_flags |= MI_PRINTED; 1825 mutex_exit(&mi->mi_lock); 1826 #ifdef DEBUG 1827 zprintf(zoneid, 1828 "NFS_ACL%d server %s not responding still trying\n", 1829 mi->mi_vers, svp->sv_hostname); 1830 #else 1831 zprintf(zoneid, 1832 "NFS server %s not responding still trying\n", 1833 svp->sv_hostname); 1834 #endif 1835 } else 1836 mutex_exit(&mi->mi_lock); 1837 if (*douprintf && nfs_has_ctty()) { 1838 *douprintf = 0; 1839 if (!(mi->mi_flags & MI_NOPRINT)) 1840 #ifdef DEBUG 1841 uprintf( 1842 "NFS_ACL%d server %s not responding still trying\n", 1843 mi->mi_vers, svp->sv_hostname); 1844 #else 1845 uprintf( 1846 "NFS server %s not responding still trying\n", 1847 svp->sv_hostname); 1848 #endif 1849 } 1850 1851 #if 0 /* notyet */ 1852 /* 1853 * If doing dynamic adjustment of transfer 1854 * size and if it's a read or write call 1855 * and if the transfer size changed while 1856 * retransmitting or if the feedback routine 1857 * changed the transfer size, 1858 * then exit rfscall so that the transfer 1859 * size can be adjusted at the vnops level. 1860 */ 1861 if ((mi->mi_flags & MI_DYNAMIC) && 1862 mi->mi_acl_timer_type[which] != 0 && 1863 (mi->mi_curread != my_rsize || 1864 mi->mi_curwrite != my_wsize || 1865 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1866 /* 1867 * On read or write calls, return 1868 * back to the vnode ops level if 1869 * the transfer size changed. 1870 */ 1871 clfree_impl(client, ch, nfscl); 1872 if (cred_cloned) 1873 crfree(cr); 1874 return (ENFS_TRYAGAIN); 1875 } 1876 #endif 1877 } 1878 } while (tryagain); 1879 1880 if (status != RPC_SUCCESS) { 1881 /* 1882 * Let soft mounts use the timed out message. 
1883 */ 1884 if (status == RPC_INPROGRESS) 1885 status = RPC_TIMEDOUT; 1886 nfscl->nfscl_stat.badcalls.value.ui64++; 1887 if (status == RPC_CANTDECODERES || 1888 status == RPC_PROGUNAVAIL || 1889 status == RPC_PROCUNAVAIL || 1890 status == RPC_CANTDECODEARGS || 1891 status == RPC_PROGVERSMISMATCH) 1892 CLNT_GETERR(client, &rpcerr); 1893 else if (status != RPC_INTR) { 1894 mutex_enter(&mi->mi_lock); 1895 mi->mi_flags |= MI_DOWN; 1896 mutex_exit(&mi->mi_lock); 1897 CLNT_GETERR(client, &rpcerr); 1898 #ifdef DEBUG 1899 bufp = clnt_sperror(client, svp->sv_hostname); 1900 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1901 mi->mi_vers, mi->mi_aclnames[which], bufp); 1902 if (nfs_has_ctty()) { 1903 if (!(mi->mi_flags & MI_NOPRINT)) { 1904 uprintf("NFS_ACL%d %s failed for %s\n", 1905 mi->mi_vers, mi->mi_aclnames[which], 1906 bufp); 1907 } 1908 } 1909 kmem_free(bufp, MAXPATHLEN); 1910 #else 1911 zprintf(zoneid, 1912 "NFS %s failed for server %s: error %d (%s)\n", 1913 mi->mi_aclnames[which], svp->sv_hostname, 1914 status, clnt_sperrno(status)); 1915 if (nfs_has_ctty()) { 1916 if (!(mi->mi_flags & MI_NOPRINT)) 1917 uprintf( 1918 "NFS %s failed for server %s: error %d (%s)\n", 1919 mi->mi_aclnames[which], 1920 svp->sv_hostname, status, 1921 clnt_sperrno(status)); 1922 } 1923 #endif 1924 /* 1925 * when CLNT_CALL() fails with RPC_AUTHERROR, 1926 * re_errno is set appropriately depending on 1927 * the authentication error 1928 */ 1929 if (status == RPC_VERSMISMATCH || 1930 status == RPC_PROGVERSMISMATCH) 1931 rpcerr.re_errno = EIO; 1932 } 1933 } else { 1934 /* 1935 * Test the value of mi_down and mi_printed without 1936 * holding the mi_lock mutex. If they are both zero, 1937 * then it is okay to skip the down and printed 1938 * processing. This saves on a mutex_enter and 1939 * mutex_exit pair for a normal, successful RPC. 1940 * This was just complete overhead. 
1941 */ 1942 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1943 mutex_enter(&mi->mi_lock); 1944 mi->mi_flags &= ~MI_DOWN; 1945 if (mi->mi_flags & MI_PRINTED) { 1946 mi->mi_flags &= ~MI_PRINTED; 1947 mutex_exit(&mi->mi_lock); 1948 #ifdef DEBUG 1949 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1950 mi->mi_vers, svp->sv_hostname); 1951 #else 1952 zprintf(zoneid, "NFS server %s ok\n", 1953 svp->sv_hostname); 1954 #endif 1955 } else 1956 mutex_exit(&mi->mi_lock); 1957 } 1958 1959 if (*douprintf == 0) { 1960 if (!(mi->mi_flags & MI_NOPRINT)) 1961 #ifdef DEBUG 1962 uprintf("NFS_ACL%d server %s ok\n", 1963 mi->mi_vers, svp->sv_hostname); 1964 #else 1965 uprintf("NFS server %s ok\n", svp->sv_hostname); 1966 #endif 1967 *douprintf = 1; 1968 } 1969 } 1970 1971 clfree_impl(client, ch, nfscl); 1972 if (cred_cloned) 1973 crfree(cr); 1974 1975 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1976 1977 #if 0 /* notyet */ 1978 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1979 rpcerr.re_errno); 1980 #endif 1981 1982 return (rpcerr.re_errno); 1983 } 1984 1985 int 1986 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1987 { 1988 uint_t mask = vap->va_mask; 1989 1990 if (!(mask & AT_MODE)) 1991 sa->sa_mode = (uint32_t)-1; 1992 else 1993 sa->sa_mode = vap->va_mode; 1994 if (!(mask & AT_UID)) 1995 sa->sa_uid = (uint32_t)-1; 1996 else 1997 sa->sa_uid = (uint32_t)vap->va_uid; 1998 if (!(mask & AT_GID)) 1999 sa->sa_gid = (uint32_t)-1; 2000 else 2001 sa->sa_gid = (uint32_t)vap->va_gid; 2002 if (!(mask & AT_SIZE)) 2003 sa->sa_size = (uint32_t)-1; 2004 else 2005 sa->sa_size = (uint32_t)vap->va_size; 2006 if (!(mask & AT_ATIME)) 2007 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 2008 else { 2009 /* check time validity */ 2010 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2011 return (EOVERFLOW); 2012 } 2013 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 2014 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2015 } 2016 if (!(mask & AT_MTIME)) 2017 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 2018 else { 2019 /* check time validity */ 2020 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2021 return (EOVERFLOW); 2022 } 2023 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 2024 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2025 } 2026 return (0); 2027 } 2028 2029 int 2030 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 2031 { 2032 uint_t mask = vap->va_mask; 2033 2034 if (!(mask & AT_MODE)) 2035 sa->mode.set_it = FALSE; 2036 else { 2037 sa->mode.set_it = TRUE; 2038 sa->mode.mode = (mode3)vap->va_mode; 2039 } 2040 if (!(mask & AT_UID)) 2041 sa->uid.set_it = FALSE; 2042 else { 2043 sa->uid.set_it = TRUE; 2044 sa->uid.uid = (uid3)vap->va_uid; 2045 } 2046 if (!(mask & AT_GID)) 2047 sa->gid.set_it = FALSE; 2048 else { 2049 sa->gid.set_it = TRUE; 2050 sa->gid.gid = (gid3)vap->va_gid; 2051 } 2052 if (!(mask & AT_SIZE)) 2053 sa->size.set_it = FALSE; 2054 else { 2055 sa->size.set_it = TRUE; 2056 sa->size.size = (size3)vap->va_size; 2057 } 2058 if (!(mask & AT_ATIME)) 2059 sa->atime.set_it = DONT_CHANGE; 2060 else { 2061 /* check time validity */ 2062 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2063 return (EOVERFLOW); 2064 } 2065 sa->atime.set_it = SET_TO_CLIENT_TIME; 2066 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2067 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2068 } 2069 if (!(mask & AT_MTIME)) 2070 sa->mtime.set_it = DONT_CHANGE; 2071 else { 2072 /* check time validity */ 2073 if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2074 return (EOVERFLOW); 2075 } 2076 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2077 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2078 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2079 } 2080 return (0); 2081 } 2082 2083 void 2084 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2085 { 2086 2087 da->da_fhandle = VTOFH(dvp); 2088 da->da_name = nm; 2089 da->da_flags = 0; 2090 } 2091 2092 void 2093 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2094 { 2095 2096 da->dirp = VTOFH3(dvp); 2097 da->name = nm; 2098 } 2099 2100 int 2101 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2102 { 2103 int error; 2104 rnode_t *rp; 2105 struct vattr va; 2106 2107 va.va_mask = AT_MODE | AT_GID; 2108 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2109 if (error) 2110 return (error); 2111 2112 /* 2113 * To determine the expected group-id of the created file: 2114 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2115 * GRPID option, and the directory's set-gid bit is clear, 2116 * then use the process's gid. 2117 * 2) Otherwise, set the group-id to the gid of the parent directory. 2118 */ 2119 rp = VTOR(dvp); 2120 mutex_enter(&rp->r_statelock); 2121 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2122 *gidp = crgetgid(cr); 2123 else 2124 *gidp = va.va_gid; 2125 mutex_exit(&rp->r_statelock); 2126 return (0); 2127 } 2128 2129 int 2130 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2131 { 2132 int error; 2133 struct vattr va; 2134 2135 va.va_mask = AT_MODE; 2136 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2137 if (error) 2138 return (error); 2139 2140 /* 2141 * Modify the expected mode (om) so that the set-gid bit matches 2142 * that of the parent directory (dvp). 2143 */ 2144 if (va.va_mode & VSGID) 2145 *omp |= VSGID; 2146 else 2147 *omp &= ~VSGID; 2148 return (0); 2149 } 2150 2151 void 2152 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2153 { 2154 2155 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2156 if (!(vp->v_flag & VSWAPLIKE)) { 2157 mutex_enter(&vp->v_lock); 2158 vp->v_flag |= VSWAPLIKE; 2159 mutex_exit(&vp->v_lock); 2160 } 2161 } else { 2162 if (vp->v_flag & VSWAPLIKE) { 2163 mutex_enter(&vp->v_lock); 2164 vp->v_flag &= ~VSWAPLIKE; 2165 mutex_exit(&vp->v_lock); 2166 } 2167 } 2168 } 2169 2170 /* 2171 * Free the resources associated with an rnode. 2172 */ 2173 static void 2174 rinactive(rnode_t *rp, cred_t *cr) 2175 { 2176 vnode_t *vp; 2177 cred_t *cred; 2178 char *contents; 2179 int size; 2180 vsecattr_t *vsp; 2181 int error; 2182 nfs3_pathconf_info *info; 2183 2184 /* 2185 * Before freeing anything, wait until all asynchronous 2186 * activity is done on this rnode. This will allow all 2187 * asynchronous read ahead and write behind i/o's to 2188 * finish. 2189 */ 2190 mutex_enter(&rp->r_statelock); 2191 while (rp->r_count > 0) 2192 cv_wait(&rp->r_cv, &rp->r_statelock); 2193 mutex_exit(&rp->r_statelock); 2194 2195 /* 2196 * Flush and invalidate all pages associated with the vnode. 
2197 */ 2198 vp = RTOV(rp); 2199 if (vn_has_cached_data(vp)) { 2200 ASSERT(vp->v_type != VCHR); 2201 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2202 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 2203 if (error && (error == ENOSPC || error == EDQUOT)) { 2204 mutex_enter(&rp->r_statelock); 2205 if (!rp->r_error) 2206 rp->r_error = error; 2207 mutex_exit(&rp->r_statelock); 2208 } 2209 } 2210 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2211 } 2212 2213 /* 2214 * Free any held credentials and caches which may be associated 2215 * with this rnode. 2216 */ 2217 mutex_enter(&rp->r_statelock); 2218 cred = rp->r_cred; 2219 rp->r_cred = NULL; 2220 contents = rp->r_symlink.contents; 2221 size = rp->r_symlink.size; 2222 rp->r_symlink.contents = NULL; 2223 vsp = rp->r_secattr; 2224 rp->r_secattr = NULL; 2225 info = rp->r_pathconf; 2226 rp->r_pathconf = NULL; 2227 mutex_exit(&rp->r_statelock); 2228 2229 /* 2230 * Free the held credential. 2231 */ 2232 if (cred != NULL) 2233 crfree(cred); 2234 2235 /* 2236 * Free the access cache entries. 2237 */ 2238 (void) nfs_access_purge_rp(rp); 2239 2240 /* 2241 * Free the readdir cache entries. 2242 */ 2243 if (HAVE_RDDIR_CACHE(rp)) 2244 nfs_purge_rddir_cache(vp); 2245 2246 /* 2247 * Free the symbolic link cache. 2248 */ 2249 if (contents != NULL) { 2250 2251 kmem_free((void *)contents, size); 2252 } 2253 2254 /* 2255 * Free any cached ACL. 2256 */ 2257 if (vsp != NULL) 2258 nfs_acl_free(vsp); 2259 2260 /* 2261 * Free any cached pathconf information. 2262 */ 2263 if (info != NULL) 2264 kmem_free(info, sizeof (*info)); 2265 } 2266 2267 /* 2268 * Return a vnode for the given NFS Version 2 file handle. 2269 * If no rnode exists for this fhandle, create one and put it 2270 * into the hash queues. If the rnode for this fhandle 2271 * already exists, return it. 2272 * 2273 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2274 */ 2275 vnode_t * 2276 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2277 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2278 { 2279 int newnode; 2280 int index; 2281 vnode_t *vp; 2282 nfs_fhandle nfh; 2283 vattr_t va; 2284 2285 nfh.fh_len = NFS_FHSIZE; 2286 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2287 2288 index = rtablehash(&nfh); 2289 rw_enter(&rtable[index].r_lock, RW_READER); 2290 2291 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2292 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2293 2294 if (attr != NULL) { 2295 if (!newnode) { 2296 rw_exit(&rtable[index].r_lock); 2297 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2298 } else { 2299 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2300 vp->v_type = VBAD; 2301 else 2302 vp->v_type = n2v_type(attr); 2303 /* 2304 * A translation here seems to be necessary 2305 * because this function can be called 2306 * with `attr' that has come from the wire, 2307 * and been operated on by vattr_to_nattr(). 2308 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2309 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2310 * ->makenfsnode(). 2311 */ 2312 if ((attr->na_rdev & 0xffff0000) == 0) 2313 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2314 else 2315 vp->v_rdev = expldev(n2v_rdev(attr)); 2316 nfs_attrcache(vp, attr, t); 2317 rw_exit(&rtable[index].r_lock); 2318 } 2319 } else { 2320 if (newnode) { 2321 PURGE_ATTRCACHE(vp); 2322 } 2323 rw_exit(&rtable[index].r_lock); 2324 } 2325 2326 return (vp); 2327 } 2328 2329 /* 2330 * Return a vnode for the given NFS Version 3 file handle. 
2331 * If no rnode exists for this fhandle, create one and put it 2332 * into the hash queues. If the rnode for this fhandle 2333 * already exists, return it. 2334 * 2335 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2336 */ 2337 vnode_t * 2338 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2339 cred_t *cr, char *dnm, char *nm) 2340 { 2341 int newnode; 2342 int index; 2343 vnode_t *vp; 2344 2345 index = rtablehash((nfs_fhandle *)fh); 2346 rw_enter(&rtable[index].r_lock, RW_READER); 2347 2348 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2349 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2350 dnm, nm); 2351 2352 if (vap == NULL) { 2353 if (newnode) { 2354 PURGE_ATTRCACHE(vp); 2355 } 2356 rw_exit(&rtable[index].r_lock); 2357 return (vp); 2358 } 2359 2360 if (!newnode) { 2361 rw_exit(&rtable[index].r_lock); 2362 nfs_attr_cache(vp, vap, t, cr); 2363 } else { 2364 rnode_t *rp = VTOR(vp); 2365 2366 vp->v_type = vap->va_type; 2367 vp->v_rdev = vap->va_rdev; 2368 2369 mutex_enter(&rp->r_statelock); 2370 if (rp->r_mtime <= t) 2371 nfs_attrcache_va(vp, vap); 2372 mutex_exit(&rp->r_statelock); 2373 rw_exit(&rtable[index].r_lock); 2374 } 2375 2376 return (vp); 2377 } 2378 2379 vnode_t * 2380 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2381 cred_t *cr, char *dnm, char *nm) 2382 { 2383 int newnode; 2384 int index; 2385 vnode_t *vp; 2386 vattr_t va; 2387 2388 index = rtablehash((nfs_fhandle *)fh); 2389 rw_enter(&rtable[index].r_lock, RW_READER); 2390 2391 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2392 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2393 dnm, nm); 2394 2395 if (attr == NULL) { 2396 if (newnode) { 2397 PURGE_ATTRCACHE(vp); 2398 } 2399 rw_exit(&rtable[index].r_lock); 2400 return (vp); 2401 } 2402 2403 if (!newnode) { 2404 rw_exit(&rtable[index].r_lock); 2405 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2406 } else { 2407 if (attr->type < NF3REG || attr->type > NF3FIFO) 2408 vp->v_type = VBAD; 2409 else 2410 vp->v_type = nf3_to_vt[attr->type]; 2411 vp->v_rdev = makedevice(attr->rdev.specdata1, 2412 attr->rdev.specdata2); 2413 nfs3_attrcache(vp, attr, t); 2414 rw_exit(&rtable[index].r_lock); 2415 } 2416 2417 return (vp); 2418 } 2419 2420 /* 2421 * Read this comment before making changes to rtablehash()! 2422 * This is a hash function in which seemingly obvious and harmless 2423 * changes can cause escalations costing million dollars! 2424 * Know what you are doing. 2425 * 2426 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2427 * algorithm is currently detailed here: 2428 * 2429 * http://burtleburtle.net/bob/hash/doobs.html 2430 * 2431 * Of course, the above link may not be valid by the time you are reading 2432 * this, but suffice it to say that the one-at-a-time algorithm works well in 2433 * almost all cases. If you are changing the algorithm be sure to verify that 2434 * the hash algorithm still provides even distribution in all cases and with 2435 * any server returning filehandles in whatever order (sequential or random). 
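 *
 * For reference, a stand-alone sketch of this same one-at-a-time
 * mixing (illustrative only -- the file uses the rtablehash()
 * routine below, not this function) would look like:
 *
 *	uint32_t
 *	oaat_hash(const char *key, size_t len)
 *	{
 *		uint32_t hash = 0;
 *		size_t i;
 *
 *		for (i = 0; i < len; i++) {
 *			hash += key[i];
 *			hash += (hash << 10);
 *			hash ^= (hash >> 6);
 *		}
 *		hash += (hash << 3);
 *		hash ^= (hash >> 11);
 *		hash += (hash << 15);
 *		return (hash);
 *	}
 *
 * rtablehash() below applies this mixing to the filehandle buffer
 * and then masks the result with rtablemask to pick a hash bucket.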
2436 */ 2437 static int 2438 rtablehash(nfs_fhandle *fh) 2439 { 2440 ulong_t hash, len, i; 2441 char *key; 2442 2443 key = fh->fh_buf; 2444 len = (ulong_t)fh->fh_len; 2445 for (hash = 0, i = 0; i < len; i++) { 2446 hash += key[i]; 2447 hash += (hash << 10); 2448 hash ^= (hash >> 6); 2449 } 2450 hash += (hash << 3); 2451 hash ^= (hash >> 11); 2452 hash += (hash << 15); 2453 return (hash & rtablemask); 2454 } 2455 2456 static vnode_t * 2457 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2458 struct vnodeops *vops, 2459 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2460 int (*compar)(const void *, const void *), 2461 int *newnode, cred_t *cr, char *dnm, char *nm) 2462 { 2463 rnode_t *rp; 2464 rnode_t *trp; 2465 vnode_t *vp; 2466 mntinfo_t *mi; 2467 2468 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2469 2470 mi = VFTOMI(vfsp); 2471 start: 2472 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2473 vp = RTOV(rp); 2474 nfs_set_vroot(vp); 2475 *newnode = 0; 2476 return (vp); 2477 } 2478 rw_exit(&rhtp->r_lock); 2479 2480 mutex_enter(&rpfreelist_lock); 2481 if (rpfreelist != NULL && rnew >= nrnode) { 2482 rp = rpfreelist; 2483 rp_rmfree(rp); 2484 mutex_exit(&rpfreelist_lock); 2485 2486 vp = RTOV(rp); 2487 2488 if (rp->r_flags & RHASHED) { 2489 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2490 mutex_enter(&vp->v_lock); 2491 if (vp->v_count > 1) { 2492 vp->v_count--; 2493 mutex_exit(&vp->v_lock); 2494 rw_exit(&rp->r_hashq->r_lock); 2495 rw_enter(&rhtp->r_lock, RW_READER); 2496 goto start; 2497 } 2498 mutex_exit(&vp->v_lock); 2499 rp_rmhash_locked(rp); 2500 rw_exit(&rp->r_hashq->r_lock); 2501 } 2502 2503 rinactive(rp, cr); 2504 2505 mutex_enter(&vp->v_lock); 2506 if (vp->v_count > 1) { 2507 vp->v_count--; 2508 mutex_exit(&vp->v_lock); 2509 rw_enter(&rhtp->r_lock, RW_READER); 2510 goto start; 2511 } 2512 mutex_exit(&vp->v_lock); 2513 vn_invalid(vp); 2514 /* 2515 * destroy old locks before bzero'ing and 2516 * recreating the locks below. 2517 */ 2518 nfs_rw_destroy(&rp->r_rwlock); 2519 nfs_rw_destroy(&rp->r_lkserlock); 2520 mutex_destroy(&rp->r_statelock); 2521 cv_destroy(&rp->r_cv); 2522 cv_destroy(&rp->r_commit.c_cv); 2523 nfs_free_r_path(rp); 2524 avl_destroy(&rp->r_dir); 2525 /* 2526 * Make sure that if rnode is recycled then 2527 * VFS count is decremented properly before 2528 * reuse. 
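		 *
		 * The VFS_RELE() below drops the hold that VFS_HOLD() took
		 * on the old vfs when this rnode was last initialized here;
		 * a fresh hold on the new vfs is taken further down, before
		 * the rnode is inserted into its new hash queue.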
2529 */ 2530 VFS_RELE(vp->v_vfsp); 2531 vn_reinit(vp); 2532 } else { 2533 vnode_t *new_vp; 2534 2535 mutex_exit(&rpfreelist_lock); 2536 2537 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2538 new_vp = vn_alloc(KM_SLEEP); 2539 2540 atomic_inc_ulong((ulong_t *)&rnew); 2541 #ifdef DEBUG 2542 clstat_debug.nrnode.value.ui64++; 2543 #endif 2544 vp = new_vp; 2545 } 2546 2547 bzero(rp, sizeof (*rp)); 2548 rp->r_vnode = vp; 2549 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2550 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2551 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2552 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2553 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2554 rp->r_fh.fh_len = fh->fh_len; 2555 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2556 rp->r_server = mi->mi_curr_serv; 2557 if (FAILOVER_MOUNT(mi)) { 2558 /* 2559 * If replicated servers, stash pathnames 2560 */ 2561 if (dnm != NULL && nm != NULL) { 2562 char *s, *p; 2563 uint_t len; 2564 2565 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2566 rp->r_path = kmem_alloc(len, KM_SLEEP); 2567 #ifdef DEBUG 2568 clstat_debug.rpath.value.ui64 += len; 2569 #endif 2570 s = rp->r_path; 2571 for (p = dnm; *p; p++) 2572 *s++ = *p; 2573 *s++ = '/'; 2574 for (p = nm; *p; p++) 2575 *s++ = *p; 2576 *s = '\0'; 2577 } else { 2578 /* special case for root */ 2579 rp->r_path = kmem_alloc(2, KM_SLEEP); 2580 #ifdef DEBUG 2581 clstat_debug.rpath.value.ui64 += 2; 2582 #endif 2583 *rp->r_path = '.'; 2584 *(rp->r_path + 1) = '\0'; 2585 } 2586 } 2587 VFS_HOLD(vfsp); 2588 rp->r_putapage = putapage; 2589 rp->r_hashq = rhtp; 2590 rp->r_flags = RREADDIRPLUS; 2591 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2592 offsetof(rddir_cache, tree)); 2593 vn_setops(vp, vops); 2594 vp->v_data = (caddr_t)rp; 2595 vp->v_vfsp = vfsp; 2596 vp->v_type = VNON; 2597 vp->v_flag |= VMODSORT; 2598 nfs_set_vroot(vp); 2599 2600 /* 2601 * There is a race condition if someone else 2602 * alloc's the rnode while no locks are held, so we 2603 * check again and recover if found. 2604 */ 2605 rw_enter(&rhtp->r_lock, RW_WRITER); 2606 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2607 vp = RTOV(trp); 2608 nfs_set_vroot(vp); 2609 *newnode = 0; 2610 rw_exit(&rhtp->r_lock); 2611 rp_addfree(rp, cr); 2612 rw_enter(&rhtp->r_lock, RW_READER); 2613 return (vp); 2614 } 2615 rp_addhash(rp); 2616 *newnode = 1; 2617 return (vp); 2618 } 2619 2620 /* 2621 * Callback function to check if the page should be marked as 2622 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT. 2623 */ 2624 int 2625 nfs_setmod_check(page_t *pp) 2626 { 2627 if (pp->p_fsdata != C_NOCOMMIT) { 2628 pp->p_fsdata = C_NOCOMMIT; 2629 return (1); 2630 } 2631 return (0); 2632 } 2633 2634 static void 2635 nfs_set_vroot(vnode_t *vp) 2636 { 2637 rnode_t *rp; 2638 nfs_fhandle *rootfh; 2639 2640 rp = VTOR(vp); 2641 rootfh = &rp->r_server->sv_fhandle; 2642 if (rootfh->fh_len == rp->r_fh.fh_len && 2643 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2644 if (!(vp->v_flag & VROOT)) { 2645 mutex_enter(&vp->v_lock); 2646 vp->v_flag |= VROOT; 2647 mutex_exit(&vp->v_lock); 2648 } 2649 } 2650 } 2651 2652 static void 2653 nfs_free_r_path(rnode_t *rp) 2654 { 2655 char *path; 2656 size_t len; 2657 2658 path = rp->r_path; 2659 if (path) { 2660 rp->r_path = NULL; 2661 len = strlen(path) + 1; 2662 kmem_free(path, len); 2663 #ifdef DEBUG 2664 clstat_debug.rpath.value.ui64 -= len; 2665 #endif 2666 } 2667 } 2668 2669 /* 2670 * Put an rnode on the free list. 
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count. We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock. The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE, either because the rnode was
		 * marked with RDIRTY or for a modified page. This
		 * reference may have been acquired before our call
		 * to rinactive. The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet. In any case, the rnode cannot be destroyed
		 * until the other references to this vnode have been
		 * released. The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist. If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference,
	 * which would indicate that the rnode should not be placed on
	 * the freelist. If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
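	 *
	 * The freelist is a circular, doubly linked list with rpfreelist
	 * pointing at its head, and reclaiming always starts at the head.
	 * The code below always links the rnode in just in front of the
	 * current head (i.e. at the tail) and then, when nothing is
	 * cached, simply points rpfreelist at the new entry, which is
	 * what "front of the freelist" means here.  Roughly (sketch only):
	 *
	 *	rp->r_freef = rpfreelist;		forward to the head
	 *	rp->r_freeb = rpfreelist->r_freeb;	back to the old tail
	 *	rpfreelist->r_freeb->r_freef = rp;	old tail -> rp
	 *	rpfreelist->r_freeb = rp;		head back-links to rp
	 *	if (nothing is cached for this rnode)
	 *		rpfreelist = rp;		rp becomes the head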
2767 */ 2768 mutex_enter(&rpfreelist_lock); 2769 if (rpfreelist == NULL) { 2770 rp->r_freef = rp; 2771 rp->r_freeb = rp; 2772 rpfreelist = rp; 2773 } else { 2774 rp->r_freef = rpfreelist; 2775 rp->r_freeb = rpfreelist->r_freeb; 2776 rpfreelist->r_freeb->r_freef = rp; 2777 rpfreelist->r_freeb = rp; 2778 if (!vn_has_cached_data(vp) && 2779 !HAVE_RDDIR_CACHE(rp) && 2780 rp->r_symlink.contents == NULL && 2781 rp->r_secattr == NULL && 2782 rp->r_pathconf == NULL) 2783 rpfreelist = rp; 2784 } 2785 mutex_exit(&rpfreelist_lock); 2786 2787 rw_exit(&rp->r_hashq->r_lock); 2788 } 2789 2790 /* 2791 * Remove an rnode from the free list. 2792 * 2793 * The caller must be holding rpfreelist_lock and the rnode 2794 * must be on the freelist. 2795 */ 2796 static void 2797 rp_rmfree(rnode_t *rp) 2798 { 2799 2800 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2801 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2802 2803 if (rp == rpfreelist) { 2804 rpfreelist = rp->r_freef; 2805 if (rp == rpfreelist) 2806 rpfreelist = NULL; 2807 } 2808 2809 rp->r_freeb->r_freef = rp->r_freef; 2810 rp->r_freef->r_freeb = rp->r_freeb; 2811 2812 rp->r_freef = rp->r_freeb = NULL; 2813 } 2814 2815 /* 2816 * Put a rnode in the hash table. 2817 * 2818 * The caller must be holding the exclusive hash queue lock. 2819 */ 2820 static void 2821 rp_addhash(rnode_t *rp) 2822 { 2823 2824 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2825 ASSERT(!(rp->r_flags & RHASHED)); 2826 2827 rp->r_hashf = rp->r_hashq->r_hashf; 2828 rp->r_hashq->r_hashf = rp; 2829 rp->r_hashb = (rnode_t *)rp->r_hashq; 2830 rp->r_hashf->r_hashb = rp; 2831 2832 mutex_enter(&rp->r_statelock); 2833 rp->r_flags |= RHASHED; 2834 mutex_exit(&rp->r_statelock); 2835 } 2836 2837 /* 2838 * Remove a rnode from the hash table. 2839 * 2840 * The caller must be holding the hash queue lock. 2841 */ 2842 static void 2843 rp_rmhash_locked(rnode_t *rp) 2844 { 2845 2846 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2847 ASSERT(rp->r_flags & RHASHED); 2848 2849 rp->r_hashb->r_hashf = rp->r_hashf; 2850 rp->r_hashf->r_hashb = rp->r_hashb; 2851 2852 mutex_enter(&rp->r_statelock); 2853 rp->r_flags &= ~RHASHED; 2854 mutex_exit(&rp->r_statelock); 2855 } 2856 2857 /* 2858 * Remove a rnode from the hash table. 2859 * 2860 * The caller must not be holding the hash queue lock. 2861 */ 2862 void 2863 rp_rmhash(rnode_t *rp) 2864 { 2865 2866 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2867 rp_rmhash_locked(rp); 2868 rw_exit(&rp->r_hashq->r_lock); 2869 } 2870 2871 /* 2872 * Lookup a rnode by fhandle. 2873 * 2874 * The caller must be holding the hash queue lock, either shared or exclusive. 2875 */ 2876 static rnode_t * 2877 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2878 { 2879 rnode_t *rp; 2880 vnode_t *vp; 2881 2882 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2883 2884 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2885 vp = RTOV(rp); 2886 if (vp->v_vfsp == vfsp && 2887 rp->r_fh.fh_len == fh->fh_len && 2888 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2889 /* 2890 * remove rnode from free list, if necessary. 2891 */ 2892 if (rp->r_freef != NULL) { 2893 mutex_enter(&rpfreelist_lock); 2894 /* 2895 * If the rnode is on the freelist, 2896 * then remove it and use that reference 2897 * as the new reference. Otherwise, 2898 * need to increment the reference count. 
2899 */ 2900 if (rp->r_freef != NULL) { 2901 rp_rmfree(rp); 2902 mutex_exit(&rpfreelist_lock); 2903 } else { 2904 mutex_exit(&rpfreelist_lock); 2905 VN_HOLD(vp); 2906 } 2907 } else 2908 VN_HOLD(vp); 2909 return (rp); 2910 } 2911 } 2912 return (NULL); 2913 } 2914 2915 /* 2916 * Return 1 if there is a active vnode belonging to this vfs in the 2917 * rtable cache. 2918 * 2919 * Several of these checks are done without holding the usual 2920 * locks. This is safe because destroy_rtable(), rp_addfree(), 2921 * etc. will redo the necessary checks before actually destroying 2922 * any rnodes. 2923 */ 2924 int 2925 check_rtable(struct vfs *vfsp) 2926 { 2927 int index; 2928 rnode_t *rp; 2929 vnode_t *vp; 2930 2931 for (index = 0; index < rtablesize; index++) { 2932 rw_enter(&rtable[index].r_lock, RW_READER); 2933 for (rp = rtable[index].r_hashf; 2934 rp != (rnode_t *)(&rtable[index]); 2935 rp = rp->r_hashf) { 2936 vp = RTOV(rp); 2937 if (vp->v_vfsp == vfsp) { 2938 if (rp->r_freef == NULL || 2939 (vn_has_cached_data(vp) && 2940 (rp->r_flags & RDIRTY)) || 2941 rp->r_count > 0) { 2942 rw_exit(&rtable[index].r_lock); 2943 return (1); 2944 } 2945 } 2946 } 2947 rw_exit(&rtable[index].r_lock); 2948 } 2949 return (0); 2950 } 2951 2952 /* 2953 * Destroy inactive vnodes from the hash queues which belong to this 2954 * vfs. It is essential that we destroy all inactive vnodes during a 2955 * forced unmount as well as during a normal unmount. 2956 */ 2957 void 2958 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2959 { 2960 int index; 2961 rnode_t *rp; 2962 rnode_t *rlist; 2963 rnode_t *r_hashf; 2964 vnode_t *vp; 2965 2966 rlist = NULL; 2967 2968 for (index = 0; index < rtablesize; index++) { 2969 rw_enter(&rtable[index].r_lock, RW_WRITER); 2970 for (rp = rtable[index].r_hashf; 2971 rp != (rnode_t *)(&rtable[index]); 2972 rp = r_hashf) { 2973 /* save the hash pointer before destroying */ 2974 r_hashf = rp->r_hashf; 2975 vp = RTOV(rp); 2976 if (vp->v_vfsp == vfsp) { 2977 mutex_enter(&rpfreelist_lock); 2978 if (rp->r_freef != NULL) { 2979 rp_rmfree(rp); 2980 mutex_exit(&rpfreelist_lock); 2981 rp_rmhash_locked(rp); 2982 rp->r_hashf = rlist; 2983 rlist = rp; 2984 } else 2985 mutex_exit(&rpfreelist_lock); 2986 } 2987 } 2988 rw_exit(&rtable[index].r_lock); 2989 } 2990 2991 for (rp = rlist; rp != NULL; rp = rlist) { 2992 rlist = rp->r_hashf; 2993 /* 2994 * This call to rp_addfree will end up destroying the 2995 * rnode, but in a safe way with the appropriate set 2996 * of checks done. 2997 */ 2998 rp_addfree(rp, cr); 2999 } 3000 3001 } 3002 3003 /* 3004 * This routine destroys all the resources associated with the rnode 3005 * and then the rnode itself. 
3006 */ 3007 static void 3008 destroy_rnode(rnode_t *rp) 3009 { 3010 vnode_t *vp; 3011 vfs_t *vfsp; 3012 3013 vp = RTOV(rp); 3014 vfsp = vp->v_vfsp; 3015 3016 ASSERT(vp->v_count == 1); 3017 ASSERT(rp->r_count == 0); 3018 ASSERT(rp->r_lmpl == NULL); 3019 ASSERT(rp->r_mapcnt == 0); 3020 ASSERT(!(rp->r_flags & RHASHED)); 3021 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 3022 atomic_dec_ulong((ulong_t *)&rnew); 3023 #ifdef DEBUG 3024 clstat_debug.nrnode.value.ui64--; 3025 #endif 3026 nfs_rw_destroy(&rp->r_rwlock); 3027 nfs_rw_destroy(&rp->r_lkserlock); 3028 mutex_destroy(&rp->r_statelock); 3029 cv_destroy(&rp->r_cv); 3030 cv_destroy(&rp->r_commit.c_cv); 3031 if (rp->r_flags & RDELMAPLIST) 3032 list_destroy(&rp->r_indelmap); 3033 nfs_free_r_path(rp); 3034 avl_destroy(&rp->r_dir); 3035 vn_invalid(vp); 3036 vn_free(vp); 3037 kmem_cache_free(rnode_cache, rp); 3038 VFS_RELE(vfsp); 3039 } 3040 3041 /* 3042 * Flush all vnodes in this (or every) vfs. 3043 * Used by nfs_sync and by nfs_unmount. 3044 */ 3045 void 3046 rflush(struct vfs *vfsp, cred_t *cr) 3047 { 3048 int index; 3049 rnode_t *rp; 3050 vnode_t *vp, **vplist; 3051 long num, cnt; 3052 3053 /* 3054 * Check to see whether there is anything to do. 3055 */ 3056 num = rnew; 3057 if (num == 0) 3058 return; 3059 3060 /* 3061 * Allocate a slot for all currently active rnodes on the 3062 * supposition that they all may need flushing. 3063 */ 3064 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 3065 cnt = 0; 3066 3067 /* 3068 * Walk the hash queues looking for rnodes with page 3069 * lists associated with them. Make a list of these 3070 * files. 3071 */ 3072 for (index = 0; index < rtablesize; index++) { 3073 rw_enter(&rtable[index].r_lock, RW_READER); 3074 for (rp = rtable[index].r_hashf; 3075 rp != (rnode_t *)(&rtable[index]); 3076 rp = rp->r_hashf) { 3077 vp = RTOV(rp); 3078 /* 3079 * Don't bother sync'ing a vp if it 3080 * is part of virtual swap device or 3081 * if VFS is read-only 3082 */ 3083 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 3084 continue; 3085 /* 3086 * If flushing all mounted file systems or 3087 * the vnode belongs to this vfs, has pages 3088 * and is marked as either dirty or mmap'd, 3089 * hold and add this vnode to the list of 3090 * vnodes to flush. 3091 */ 3092 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 3093 vn_has_cached_data(vp) && 3094 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3095 VN_HOLD(vp); 3096 vplist[cnt++] = vp; 3097 if (cnt == num) { 3098 rw_exit(&rtable[index].r_lock); 3099 goto toomany; 3100 } 3101 } 3102 } 3103 rw_exit(&rtable[index].r_lock); 3104 } 3105 toomany: 3106 3107 /* 3108 * Flush and release all of the files on the list. 3109 */ 3110 while (cnt-- > 0) { 3111 vp = vplist[cnt]; 3112 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 3113 VN_RELE(vp); 3114 } 3115 3116 /* 3117 * Free the space allocated to hold the list. 3118 */ 3119 kmem_free(vplist, num * sizeof (*vplist)); 3120 } 3121 3122 /* 3123 * This probably needs to be larger than or equal to 3124 * log2(sizeof (struct rnode)) due to the way that rnodes are 3125 * allocated. 
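 *
 * acachehash() below hashes on the rnode's address, and rnodes are
 * carved out of a kmem cache, so the low-order address bits vary
 * little (if at all) from one rnode to the next.  Shifting those bits
 * out first keeps the hash from being dominated by bits that carry no
 * information.  For instance, if sizeof (struct rnode) were exactly
 * 512 bytes (2^9) and rnodes were laid out back to back, every rnode
 * address would share the same low 9 bits, and a shift of at least 9
 * is what discards them before the uid is added and the result is
 * masked with acachemask.  (The 512 figure is purely illustrative.)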
3126 */ 3127 #define ACACHE_SHIFT_BITS 9 3128 3129 static int 3130 acachehash(rnode_t *rp, cred_t *cr) 3131 { 3132 3133 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3134 acachemask); 3135 } 3136 3137 #ifdef DEBUG 3138 static long nfs_access_cache_hits = 0; 3139 static long nfs_access_cache_misses = 0; 3140 #endif 3141 3142 nfs_access_type_t 3143 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3144 { 3145 vnode_t *vp; 3146 acache_t *ap; 3147 acache_hash_t *hp; 3148 nfs_access_type_t all; 3149 3150 vp = RTOV(rp); 3151 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3152 return (NFS_ACCESS_UNKNOWN); 3153 3154 if (rp->r_acache != NULL) { 3155 hp = &acache[acachehash(rp, cr)]; 3156 rw_enter(&hp->lock, RW_READER); 3157 ap = hp->next; 3158 while (ap != (acache_t *)hp) { 3159 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3160 if ((ap->known & acc) == acc) { 3161 #ifdef DEBUG 3162 nfs_access_cache_hits++; 3163 #endif 3164 if ((ap->allowed & acc) == acc) 3165 all = NFS_ACCESS_ALLOWED; 3166 else 3167 all = NFS_ACCESS_DENIED; 3168 } else { 3169 #ifdef DEBUG 3170 nfs_access_cache_misses++; 3171 #endif 3172 all = NFS_ACCESS_UNKNOWN; 3173 } 3174 rw_exit(&hp->lock); 3175 return (all); 3176 } 3177 ap = ap->next; 3178 } 3179 rw_exit(&hp->lock); 3180 } 3181 3182 #ifdef DEBUG 3183 nfs_access_cache_misses++; 3184 #endif 3185 return (NFS_ACCESS_UNKNOWN); 3186 } 3187 3188 void 3189 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3190 { 3191 acache_t *ap; 3192 acache_t *nap; 3193 acache_hash_t *hp; 3194 3195 hp = &acache[acachehash(rp, cr)]; 3196 3197 /* 3198 * Allocate now assuming that mostly an allocation will be 3199 * required. This allows the allocation to happen without 3200 * holding the hash bucket locked. 3201 */ 3202 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3203 if (nap != NULL) { 3204 nap->known = acc; 3205 nap->allowed = resacc; 3206 nap->rnode = rp; 3207 crhold(cr); 3208 nap->cred = cr; 3209 nap->hashq = hp; 3210 } 3211 3212 rw_enter(&hp->lock, RW_WRITER); 3213 3214 if (rp->r_acache != NULL) { 3215 ap = hp->next; 3216 while (ap != (acache_t *)hp) { 3217 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3218 ap->known |= acc; 3219 ap->allowed &= ~acc; 3220 ap->allowed |= resacc; 3221 rw_exit(&hp->lock); 3222 if (nap != NULL) { 3223 crfree(nap->cred); 3224 kmem_cache_free(acache_cache, nap); 3225 } 3226 return; 3227 } 3228 ap = ap->next; 3229 } 3230 } 3231 3232 if (nap != NULL) { 3233 #ifdef DEBUG 3234 clstat_debug.access.value.ui64++; 3235 #endif 3236 nap->next = hp->next; 3237 hp->next = nap; 3238 nap->next->prev = nap; 3239 nap->prev = (acache_t *)hp; 3240 3241 mutex_enter(&rp->r_statelock); 3242 nap->list = rp->r_acache; 3243 rp->r_acache = nap; 3244 mutex_exit(&rp->r_statelock); 3245 } 3246 3247 rw_exit(&hp->lock); 3248 } 3249 3250 int 3251 nfs_access_purge_rp(rnode_t *rp) 3252 { 3253 acache_t *ap; 3254 acache_t *tmpap; 3255 acache_t *rplist; 3256 3257 /* 3258 * If there aren't any cached entries, then there is nothing 3259 * to free. 3260 */ 3261 if (rp->r_acache == NULL) 3262 return (0); 3263 3264 mutex_enter(&rp->r_statelock); 3265 rplist = rp->r_acache; 3266 rp->r_acache = NULL; 3267 mutex_exit(&rp->r_statelock); 3268 3269 /* 3270 * Loop through each entry in the list pointed to in the 3271 * rnode. Remove each of these entries from the hash 3272 * queue that it is on and remove it from the list in 3273 * the rnode. 
3274 */ 3275 for (ap = rplist; ap != NULL; ap = tmpap) { 3276 rw_enter(&ap->hashq->lock, RW_WRITER); 3277 ap->prev->next = ap->next; 3278 ap->next->prev = ap->prev; 3279 rw_exit(&ap->hashq->lock); 3280 3281 tmpap = ap->list; 3282 crfree(ap->cred); 3283 kmem_cache_free(acache_cache, ap); 3284 #ifdef DEBUG 3285 clstat_debug.access.value.ui64--; 3286 #endif 3287 } 3288 3289 return (1); 3290 } 3291 3292 static const char prefix[] = ".nfs"; 3293 3294 static kmutex_t newnum_lock; 3295 3296 int 3297 newnum(void) 3298 { 3299 static uint_t newnum = 0; 3300 uint_t id; 3301 3302 mutex_enter(&newnum_lock); 3303 if (newnum == 0) 3304 newnum = gethrestime_sec() & 0xffff; 3305 id = newnum++; 3306 mutex_exit(&newnum_lock); 3307 return (id); 3308 } 3309 3310 char * 3311 newname(void) 3312 { 3313 char *news; 3314 char *s; 3315 const char *p; 3316 uint_t id; 3317 3318 id = newnum(); 3319 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3320 s = news; 3321 p = prefix; 3322 while (*p != '\0') 3323 *s++ = *p++; 3324 while (id != 0) { 3325 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3326 id >>= 4; 3327 } 3328 *s = '\0'; 3329 return (news); 3330 } 3331 3332 /* 3333 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3334 * framework. 3335 */ 3336 static int 3337 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3338 { 3339 ksp->ks_snaptime = gethrtime(); 3340 if (rw == KSTAT_WRITE) { 3341 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3342 #ifdef DEBUG 3343 /* 3344 * Currently only the global zone can write to kstats, but we 3345 * add the check just for paranoia. 3346 */ 3347 if (INGLOBALZONE(curproc)) 3348 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3349 sizeof (clstat_debug)); 3350 #endif 3351 } else { 3352 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3353 #ifdef DEBUG 3354 /* 3355 * If we're displaying the "global" debug kstat values, we 3356 * display them as-is to all zones since in fact they apply to 3357 * the system as a whole. 
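		 *
		 * (From user space these counters are visible through the
		 * normal kstat interfaces, e.g. "kstat nfs:0:nfs_client";
		 * the debug counters only exist on DEBUG kernels.)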
3358 */ 3359 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3360 sizeof (clstat_debug)); 3361 #endif 3362 } 3363 return (0); 3364 } 3365 3366 static void * 3367 clinit_zone(zoneid_t zoneid) 3368 { 3369 kstat_t *nfs_client_kstat; 3370 struct nfs_clnt *nfscl; 3371 uint_t ndata; 3372 3373 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3374 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3375 nfscl->nfscl_chtable = NULL; 3376 nfscl->nfscl_zoneid = zoneid; 3377 3378 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3379 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3380 #ifdef DEBUG 3381 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3382 #endif 3383 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3384 "misc", KSTAT_TYPE_NAMED, ndata, 3385 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3386 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3387 nfs_client_kstat->ks_snapshot = cl_snapshot; 3388 kstat_install(nfs_client_kstat); 3389 } 3390 mutex_enter(&nfs_clnt_list_lock); 3391 list_insert_head(&nfs_clnt_list, nfscl); 3392 mutex_exit(&nfs_clnt_list_lock); 3393 return (nfscl); 3394 } 3395 3396 /*ARGSUSED*/ 3397 static void 3398 clfini_zone(zoneid_t zoneid, void *arg) 3399 { 3400 struct nfs_clnt *nfscl = arg; 3401 chhead_t *chp, *next; 3402 3403 if (nfscl == NULL) 3404 return; 3405 mutex_enter(&nfs_clnt_list_lock); 3406 list_remove(&nfs_clnt_list, nfscl); 3407 mutex_exit(&nfs_clnt_list_lock); 3408 clreclaim_zone(nfscl, 0); 3409 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 3410 ASSERT(chp->ch_list == NULL); 3411 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3412 next = chp->ch_next; 3413 kmem_free(chp, sizeof (*chp)); 3414 } 3415 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3416 mutex_destroy(&nfscl->nfscl_chtable_lock); 3417 kmem_free(nfscl, sizeof (*nfscl)); 3418 } 3419 3420 /* 3421 * Called by endpnt_destructor to make sure the client handles are 3422 * cleaned up before the RPC endpoints. This becomes a no-op if 3423 * clfini_zone (above) is called first. This function is needed 3424 * (rather than relying on clfini_zone to clean up) because the ZSD 3425 * callbacks have no ordering mechanism, so we have no way to ensure 3426 * that clfini_zone is called before endpnt_destructor. 
3427 */ 3428 void 3429 clcleanup_zone(zoneid_t zoneid) 3430 { 3431 struct nfs_clnt *nfscl; 3432 3433 mutex_enter(&nfs_clnt_list_lock); 3434 nfscl = list_head(&nfs_clnt_list); 3435 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3436 if (nfscl->nfscl_zoneid == zoneid) { 3437 clreclaim_zone(nfscl, 0); 3438 break; 3439 } 3440 } 3441 mutex_exit(&nfs_clnt_list_lock); 3442 } 3443 3444 int 3445 nfs_subrinit(void) 3446 { 3447 int i; 3448 ulong_t nrnode_max; 3449 3450 /* 3451 * Allocate and initialize the rnode hash queues 3452 */ 3453 if (nrnode <= 0) 3454 nrnode = ncsize; 3455 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3456 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3457 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3458 "!setting nrnode to max value of %ld", nrnode_max); 3459 nrnode = nrnode_max; 3460 } 3461 3462 rtablesize = 1 << highbit(nrnode / hashlen); 3463 rtablemask = rtablesize - 1; 3464 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3465 for (i = 0; i < rtablesize; i++) { 3466 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3467 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3468 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3469 } 3470 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 3471 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3472 3473 /* 3474 * Allocate and initialize the access cache 3475 */ 3476 3477 /* 3478 * Initial guess is one access cache entry per rnode unless 3479 * nacache is set to a non-zero value and then it is used to 3480 * indicate a guess at the number of access cache entries. 3481 */ 3482 if (nacache > 0) 3483 acachesize = 1 << highbit(nacache / hashlen); 3484 else 3485 acachesize = rtablesize; 3486 acachemask = acachesize - 1; 3487 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3488 for (i = 0; i < acachesize; i++) { 3489 acache[i].next = (acache_t *)&acache[i]; 3490 acache[i].prev = (acache_t *)&acache[i]; 3491 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3492 } 3493 acache_cache = kmem_cache_create("nfs_access_cache", 3494 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3495 /* 3496 * Allocate and initialize the client handle cache 3497 */ 3498 chtab_cache = kmem_cache_create("client_handle_cache", 3499 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0); 3500 /* 3501 * Initialize the list of per-zone client handles (and associated data). 3502 * This needs to be done before we call zone_key_create(). 3503 */ 3504 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3505 offsetof(struct nfs_clnt, nfscl_node)); 3506 /* 3507 * Initialize the zone_key for per-zone client handle lists. 
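	 *
	 * The zone_key_create() arguments are the per-zone create,
	 * shutdown and destroy callbacks, in that order; passing NULL
	 * for the shutdown callback means clinit_zone() runs as each
	 * zone comes up and clfini_zone() runs when it is torn down,
	 * with nothing to do at zone shutdown.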
3508 */ 3509 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3510 /* 3511 * Initialize the various mutexes and reader/writer locks 3512 */ 3513 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3514 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3515 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3516 3517 /* 3518 * Assign unique major number for all nfs mounts 3519 */ 3520 if ((nfs_major = getudev()) == -1) { 3521 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3522 "nfs: init: can't get unique device number"); 3523 nfs_major = 0; 3524 } 3525 nfs_minor = 0; 3526 3527 if (nfs3_jukebox_delay == 0) 3528 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3529 3530 return (0); 3531 } 3532 3533 void 3534 nfs_subrfini(void) 3535 { 3536 int i; 3537 3538 /* 3539 * Deallocate the rnode hash queues 3540 */ 3541 kmem_cache_destroy(rnode_cache); 3542 3543 for (i = 0; i < rtablesize; i++) 3544 rw_destroy(&rtable[i].r_lock); 3545 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3546 3547 /* 3548 * Deallocated the access cache 3549 */ 3550 kmem_cache_destroy(acache_cache); 3551 3552 for (i = 0; i < acachesize; i++) 3553 rw_destroy(&acache[i].lock); 3554 kmem_free(acache, acachesize * sizeof (*acache)); 3555 3556 /* 3557 * Deallocate the client handle cache 3558 */ 3559 kmem_cache_destroy(chtab_cache); 3560 3561 /* 3562 * Destroy the various mutexes and reader/writer locks 3563 */ 3564 mutex_destroy(&rpfreelist_lock); 3565 mutex_destroy(&newnum_lock); 3566 mutex_destroy(&nfs_minor_lock); 3567 (void) zone_key_delete(nfsclnt_zone_key); 3568 } 3569 3570 enum nfsstat 3571 puterrno(int error) 3572 { 3573 3574 switch (error) { 3575 case EOPNOTSUPP: 3576 return (NFSERR_OPNOTSUPP); 3577 case ENAMETOOLONG: 3578 return (NFSERR_NAMETOOLONG); 3579 case ENOTEMPTY: 3580 return (NFSERR_NOTEMPTY); 3581 case EDQUOT: 3582 return (NFSERR_DQUOT); 3583 case ESTALE: 3584 return (NFSERR_STALE); 3585 case EREMOTE: 3586 return (NFSERR_REMOTE); 3587 case ENOSYS: 3588 return (NFSERR_OPNOTSUPP); 3589 case EOVERFLOW: 3590 return (NFSERR_INVAL); 3591 default: 3592 return ((enum nfsstat)error); 3593 } 3594 /* NOTREACHED */ 3595 } 3596 3597 int 3598 geterrno(enum nfsstat status) 3599 { 3600 3601 switch (status) { 3602 case NFSERR_OPNOTSUPP: 3603 return (EOPNOTSUPP); 3604 case NFSERR_NAMETOOLONG: 3605 return (ENAMETOOLONG); 3606 case NFSERR_NOTEMPTY: 3607 return (ENOTEMPTY); 3608 case NFSERR_DQUOT: 3609 return (EDQUOT); 3610 case NFSERR_STALE: 3611 return (ESTALE); 3612 case NFSERR_REMOTE: 3613 return (EREMOTE); 3614 case NFSERR_WFLUSH: 3615 return (EIO); 3616 default: 3617 return ((int)status); 3618 } 3619 /* NOTREACHED */ 3620 } 3621 3622 enum nfsstat3 3623 puterrno3(int error) 3624 { 3625 3626 #ifdef DEBUG 3627 switch (error) { 3628 case 0: 3629 return (NFS3_OK); 3630 case EPERM: 3631 return (NFS3ERR_PERM); 3632 case ENOENT: 3633 return (NFS3ERR_NOENT); 3634 case EIO: 3635 return (NFS3ERR_IO); 3636 case ENXIO: 3637 return (NFS3ERR_NXIO); 3638 case EACCES: 3639 return (NFS3ERR_ACCES); 3640 case EEXIST: 3641 return (NFS3ERR_EXIST); 3642 case EXDEV: 3643 return (NFS3ERR_XDEV); 3644 case ENODEV: 3645 return (NFS3ERR_NODEV); 3646 case ENOTDIR: 3647 return (NFS3ERR_NOTDIR); 3648 case EISDIR: 3649 return (NFS3ERR_ISDIR); 3650 case EINVAL: 3651 return (NFS3ERR_INVAL); 3652 case EFBIG: 3653 return (NFS3ERR_FBIG); 3654 case ENOSPC: 3655 return (NFS3ERR_NOSPC); 3656 case EROFS: 3657 return (NFS3ERR_ROFS); 3658 case EMLINK: 3659 return (NFS3ERR_MLINK); 3660 case ENAMETOOLONG: 3661 return (NFS3ERR_NAMETOOLONG); 3662 case 
ENOTEMPTY: 3663 return (NFS3ERR_NOTEMPTY); 3664 case EDQUOT: 3665 return (NFS3ERR_DQUOT); 3666 case ESTALE: 3667 return (NFS3ERR_STALE); 3668 case EREMOTE: 3669 return (NFS3ERR_REMOTE); 3670 case ENOSYS: 3671 case EOPNOTSUPP: 3672 return (NFS3ERR_NOTSUPP); 3673 case EOVERFLOW: 3674 return (NFS3ERR_INVAL); 3675 default: 3676 zcmn_err(getzoneid(), CE_WARN, 3677 "puterrno3: got error %d", error); 3678 return ((enum nfsstat3)error); 3679 } 3680 #else 3681 switch (error) { 3682 case ENAMETOOLONG: 3683 return (NFS3ERR_NAMETOOLONG); 3684 case ENOTEMPTY: 3685 return (NFS3ERR_NOTEMPTY); 3686 case EDQUOT: 3687 return (NFS3ERR_DQUOT); 3688 case ESTALE: 3689 return (NFS3ERR_STALE); 3690 case ENOSYS: 3691 case EOPNOTSUPP: 3692 return (NFS3ERR_NOTSUPP); 3693 case EREMOTE: 3694 return (NFS3ERR_REMOTE); 3695 case EOVERFLOW: 3696 return (NFS3ERR_INVAL); 3697 default: 3698 return ((enum nfsstat3)error); 3699 } 3700 #endif 3701 } 3702 3703 int 3704 geterrno3(enum nfsstat3 status) 3705 { 3706 3707 #ifdef DEBUG 3708 switch (status) { 3709 case NFS3_OK: 3710 return (0); 3711 case NFS3ERR_PERM: 3712 return (EPERM); 3713 case NFS3ERR_NOENT: 3714 return (ENOENT); 3715 case NFS3ERR_IO: 3716 return (EIO); 3717 case NFS3ERR_NXIO: 3718 return (ENXIO); 3719 case NFS3ERR_ACCES: 3720 return (EACCES); 3721 case NFS3ERR_EXIST: 3722 return (EEXIST); 3723 case NFS3ERR_XDEV: 3724 return (EXDEV); 3725 case NFS3ERR_NODEV: 3726 return (ENODEV); 3727 case NFS3ERR_NOTDIR: 3728 return (ENOTDIR); 3729 case NFS3ERR_ISDIR: 3730 return (EISDIR); 3731 case NFS3ERR_INVAL: 3732 return (EINVAL); 3733 case NFS3ERR_FBIG: 3734 return (EFBIG); 3735 case NFS3ERR_NOSPC: 3736 return (ENOSPC); 3737 case NFS3ERR_ROFS: 3738 return (EROFS); 3739 case NFS3ERR_MLINK: 3740 return (EMLINK); 3741 case NFS3ERR_NAMETOOLONG: 3742 return (ENAMETOOLONG); 3743 case NFS3ERR_NOTEMPTY: 3744 return (ENOTEMPTY); 3745 case NFS3ERR_DQUOT: 3746 return (EDQUOT); 3747 case NFS3ERR_STALE: 3748 return (ESTALE); 3749 case NFS3ERR_REMOTE: 3750 return (EREMOTE); 3751 case NFS3ERR_BADHANDLE: 3752 return (ESTALE); 3753 case NFS3ERR_NOT_SYNC: 3754 return (EINVAL); 3755 case NFS3ERR_BAD_COOKIE: 3756 return (ENOENT); 3757 case NFS3ERR_NOTSUPP: 3758 return (EOPNOTSUPP); 3759 case NFS3ERR_TOOSMALL: 3760 return (EINVAL); 3761 case NFS3ERR_SERVERFAULT: 3762 return (EIO); 3763 case NFS3ERR_BADTYPE: 3764 return (EINVAL); 3765 case NFS3ERR_JUKEBOX: 3766 return (ENXIO); 3767 default: 3768 zcmn_err(getzoneid(), CE_WARN, 3769 "geterrno3: got status %d", status); 3770 return ((int)status); 3771 } 3772 #else 3773 switch (status) { 3774 case NFS3ERR_NAMETOOLONG: 3775 return (ENAMETOOLONG); 3776 case NFS3ERR_NOTEMPTY: 3777 return (ENOTEMPTY); 3778 case NFS3ERR_DQUOT: 3779 return (EDQUOT); 3780 case NFS3ERR_STALE: 3781 case NFS3ERR_BADHANDLE: 3782 return (ESTALE); 3783 case NFS3ERR_NOTSUPP: 3784 return (EOPNOTSUPP); 3785 case NFS3ERR_REMOTE: 3786 return (EREMOTE); 3787 case NFS3ERR_NOT_SYNC: 3788 case NFS3ERR_TOOSMALL: 3789 case NFS3ERR_BADTYPE: 3790 return (EINVAL); 3791 case NFS3ERR_BAD_COOKIE: 3792 return (ENOENT); 3793 case NFS3ERR_SERVERFAULT: 3794 return (EIO); 3795 case NFS3ERR_JUKEBOX: 3796 return (ENXIO); 3797 default: 3798 return ((int)status); 3799 } 3800 #endif 3801 } 3802 3803 rddir_cache * 3804 rddir_cache_alloc(int flags) 3805 { 3806 rddir_cache *rc; 3807 3808 rc = kmem_alloc(sizeof (*rc), flags); 3809 if (rc != NULL) { 3810 rc->entries = NULL; 3811 rc->flags = RDDIR; 3812 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3813 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3814 
rc->count = 1; 3815 #ifdef DEBUG 3816 atomic_inc_64(&clstat_debug.dirent.value.ui64); 3817 #endif 3818 } 3819 return (rc); 3820 } 3821 3822 static void 3823 rddir_cache_free(rddir_cache *rc) 3824 { 3825 3826 #ifdef DEBUG 3827 atomic_dec_64(&clstat_debug.dirent.value.ui64); 3828 #endif 3829 if (rc->entries != NULL) { 3830 #ifdef DEBUG 3831 rddir_cache_buf_free(rc->entries, rc->buflen); 3832 #else 3833 kmem_free(rc->entries, rc->buflen); 3834 #endif 3835 } 3836 cv_destroy(&rc->cv); 3837 mutex_destroy(&rc->lock); 3838 kmem_free(rc, sizeof (*rc)); 3839 } 3840 3841 void 3842 rddir_cache_hold(rddir_cache *rc) 3843 { 3844 3845 mutex_enter(&rc->lock); 3846 rc->count++; 3847 mutex_exit(&rc->lock); 3848 } 3849 3850 void 3851 rddir_cache_rele(rddir_cache *rc) 3852 { 3853 3854 mutex_enter(&rc->lock); 3855 ASSERT(rc->count > 0); 3856 if (--rc->count == 0) { 3857 mutex_exit(&rc->lock); 3858 rddir_cache_free(rc); 3859 } else 3860 mutex_exit(&rc->lock); 3861 } 3862 3863 #ifdef DEBUG 3864 char * 3865 rddir_cache_buf_alloc(size_t size, int flags) 3866 { 3867 char *rc; 3868 3869 rc = kmem_alloc(size, flags); 3870 if (rc != NULL) 3871 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3872 return (rc); 3873 } 3874 3875 void 3876 rddir_cache_buf_free(void *addr, size_t size) 3877 { 3878 3879 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3880 kmem_free(addr, size); 3881 } 3882 #endif 3883 3884 static int 3885 nfs_free_data_reclaim(rnode_t *rp) 3886 { 3887 char *contents; 3888 int size; 3889 vsecattr_t *vsp; 3890 nfs3_pathconf_info *info; 3891 int freed; 3892 cred_t *cred; 3893 3894 /* 3895 * Free any held credentials and caches which 3896 * may be associated with this rnode. 3897 */ 3898 mutex_enter(&rp->r_statelock); 3899 cred = rp->r_cred; 3900 rp->r_cred = NULL; 3901 contents = rp->r_symlink.contents; 3902 size = rp->r_symlink.size; 3903 rp->r_symlink.contents = NULL; 3904 vsp = rp->r_secattr; 3905 rp->r_secattr = NULL; 3906 info = rp->r_pathconf; 3907 rp->r_pathconf = NULL; 3908 mutex_exit(&rp->r_statelock); 3909 3910 if (cred != NULL) 3911 crfree(cred); 3912 3913 /* 3914 * Free the access cache entries. 3915 */ 3916 freed = nfs_access_purge_rp(rp); 3917 3918 if (!HAVE_RDDIR_CACHE(rp) && 3919 contents == NULL && 3920 vsp == NULL && 3921 info == NULL) 3922 return (freed); 3923 3924 /* 3925 * Free the readdir cache entries 3926 */ 3927 if (HAVE_RDDIR_CACHE(rp)) 3928 nfs_purge_rddir_cache(RTOV(rp)); 3929 3930 /* 3931 * Free the symbolic link cache. 3932 */ 3933 if (contents != NULL) { 3934 3935 kmem_free((void *)contents, size); 3936 } 3937 3938 /* 3939 * Free any cached ACL. 3940 */ 3941 if (vsp != NULL) 3942 nfs_acl_free(vsp); 3943 3944 /* 3945 * Free any cached pathconf information. 3946 */ 3947 if (info != NULL) 3948 kmem_free(info, sizeof (*info)); 3949 3950 return (1); 3951 } 3952 3953 static int 3954 nfs_active_data_reclaim(rnode_t *rp) 3955 { 3956 char *contents; 3957 int size; 3958 vsecattr_t *vsp; 3959 nfs3_pathconf_info *info; 3960 int freed; 3961 3962 /* 3963 * Free any held credentials and caches which 3964 * may be associated with this rnode. 3965 */ 3966 if (!mutex_tryenter(&rp->r_statelock)) 3967 return (0); 3968 contents = rp->r_symlink.contents; 3969 size = rp->r_symlink.size; 3970 rp->r_symlink.contents = NULL; 3971 vsp = rp->r_secattr; 3972 rp->r_secattr = NULL; 3973 info = rp->r_pathconf; 3974 rp->r_pathconf = NULL; 3975 mutex_exit(&rp->r_statelock); 3976 3977 /* 3978 * Free the access cache entries. 
3979 */ 3980 freed = nfs_access_purge_rp(rp); 3981 3982 if (!HAVE_RDDIR_CACHE(rp) && 3983 contents == NULL && 3984 vsp == NULL && 3985 info == NULL) 3986 return (freed); 3987 3988 /* 3989 * Free the readdir cache entries 3990 */ 3991 if (HAVE_RDDIR_CACHE(rp)) 3992 nfs_purge_rddir_cache(RTOV(rp)); 3993 3994 /* 3995 * Free the symbolic link cache. 3996 */ 3997 if (contents != NULL) { 3998 3999 kmem_free((void *)contents, size); 4000 } 4001 4002 /* 4003 * Free any cached ACL. 4004 */ 4005 if (vsp != NULL) 4006 nfs_acl_free(vsp); 4007 4008 /* 4009 * Free any cached pathconf information. 4010 */ 4011 if (info != NULL) 4012 kmem_free(info, sizeof (*info)); 4013 4014 return (1); 4015 } 4016 4017 static int 4018 nfs_free_reclaim(void) 4019 { 4020 int freed; 4021 rnode_t *rp; 4022 4023 #ifdef DEBUG 4024 clstat_debug.f_reclaim.value.ui64++; 4025 #endif 4026 freed = 0; 4027 mutex_enter(&rpfreelist_lock); 4028 rp = rpfreelist; 4029 if (rp != NULL) { 4030 do { 4031 if (nfs_free_data_reclaim(rp)) 4032 freed = 1; 4033 } while ((rp = rp->r_freef) != rpfreelist); 4034 } 4035 mutex_exit(&rpfreelist_lock); 4036 return (freed); 4037 } 4038 4039 static int 4040 nfs_active_reclaim(void) 4041 { 4042 int freed; 4043 int index; 4044 rnode_t *rp; 4045 4046 #ifdef DEBUG 4047 clstat_debug.a_reclaim.value.ui64++; 4048 #endif 4049 freed = 0; 4050 for (index = 0; index < rtablesize; index++) { 4051 rw_enter(&rtable[index].r_lock, RW_READER); 4052 for (rp = rtable[index].r_hashf; 4053 rp != (rnode_t *)(&rtable[index]); 4054 rp = rp->r_hashf) { 4055 if (nfs_active_data_reclaim(rp)) 4056 freed = 1; 4057 } 4058 rw_exit(&rtable[index].r_lock); 4059 } 4060 return (freed); 4061 } 4062 4063 static int 4064 nfs_rnode_reclaim(void) 4065 { 4066 int freed; 4067 rnode_t *rp; 4068 vnode_t *vp; 4069 4070 #ifdef DEBUG 4071 clstat_debug.r_reclaim.value.ui64++; 4072 #endif 4073 freed = 0; 4074 mutex_enter(&rpfreelist_lock); 4075 while ((rp = rpfreelist) != NULL) { 4076 rp_rmfree(rp); 4077 mutex_exit(&rpfreelist_lock); 4078 if (rp->r_flags & RHASHED) { 4079 vp = RTOV(rp); 4080 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4081 mutex_enter(&vp->v_lock); 4082 if (vp->v_count > 1) { 4083 vp->v_count--; 4084 mutex_exit(&vp->v_lock); 4085 rw_exit(&rp->r_hashq->r_lock); 4086 mutex_enter(&rpfreelist_lock); 4087 continue; 4088 } 4089 mutex_exit(&vp->v_lock); 4090 rp_rmhash_locked(rp); 4091 rw_exit(&rp->r_hashq->r_lock); 4092 } 4093 /* 4094 * This call to rp_addfree will end up destroying the 4095 * rnode, but in a safe way with the appropriate set 4096 * of checks done. 
4097 */ 4098 rp_addfree(rp, CRED()); 4099 mutex_enter(&rpfreelist_lock); 4100 } 4101 mutex_exit(&rpfreelist_lock); 4102 return (freed); 4103 } 4104 4105 /*ARGSUSED*/ 4106 static void 4107 nfs_reclaim(void *cdrarg) 4108 { 4109 4110 #ifdef DEBUG 4111 clstat_debug.reclaim.value.ui64++; 4112 #endif 4113 if (nfs_free_reclaim()) 4114 return; 4115 4116 if (nfs_active_reclaim()) 4117 return; 4118 4119 (void) nfs_rnode_reclaim(); 4120 } 4121 4122 /* 4123 * NFS client failover support 4124 * 4125 * Routines to copy filehandles 4126 */ 4127 void 4128 nfscopyfh(caddr_t fhp, vnode_t *vp) 4129 { 4130 fhandle_t *dest = (fhandle_t *)fhp; 4131 4132 if (dest != NULL) 4133 *dest = *VTOFH(vp); 4134 } 4135 4136 void 4137 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4138 { 4139 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4140 4141 if (dest != NULL) 4142 *dest = *VTOFH3(vp); 4143 } 4144 4145 /* 4146 * NFS client failover support 4147 * 4148 * failover_safe() will test various conditions to ensure that 4149 * failover is permitted for this vnode. It will be denied 4150 * if: 4151 * 1) the operation in progress does not support failover (NULL fi) 4152 * 2) there are no available replicas (NULL mi_servers->sv_next) 4153 * 3) any locks are outstanding on this file 4154 */ 4155 static int 4156 failover_safe(failinfo_t *fi) 4157 { 4158 4159 /* 4160 * Does this op permit failover? 4161 */ 4162 if (fi == NULL || fi->vp == NULL) 4163 return (0); 4164 4165 /* 4166 * Are there any alternates to failover to? 4167 */ 4168 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4169 return (0); 4170 4171 /* 4172 * Disable check; we've forced local locking 4173 * 4174 * if (flk_has_remote_locks(fi->vp)) 4175 * return (0); 4176 */ 4177 4178 /* 4179 * If we have no partial path, we can't do anything 4180 */ 4181 if (VTOR(fi->vp)->r_path == NULL) 4182 return (0); 4183 4184 return (1); 4185 } 4186 4187 #include <sys/thread.h> 4188 4189 /* 4190 * NFS client failover support 4191 * 4192 * failover_newserver() will start a search for a new server, 4193 * preferably by starting an async thread to do the work. If 4194 * someone is already doing this (recognizable by MI_BINDINPROG 4195 * being set), it will simply return and the calling thread 4196 * will queue on the mi_failover_cv condition variable. 4197 */ 4198 static void 4199 failover_newserver(mntinfo_t *mi) 4200 { 4201 /* 4202 * Check if someone else is doing this already 4203 */ 4204 mutex_enter(&mi->mi_lock); 4205 if (mi->mi_flags & MI_BINDINPROG) { 4206 mutex_exit(&mi->mi_lock); 4207 return; 4208 } 4209 mi->mi_flags |= MI_BINDINPROG; 4210 4211 /* 4212 * Need to hold the vfs struct so that it can't be released 4213 * while the failover thread is selecting a new server. 4214 */ 4215 VFS_HOLD(mi->mi_vfsp); 4216 4217 /* 4218 * Start a thread to do the real searching. 4219 */ 4220 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4221 4222 mutex_exit(&mi->mi_lock); 4223 } 4224 4225 /* 4226 * NFS client failover support 4227 * 4228 * failover_thread() will find a new server to replace the one 4229 * currently in use, wake up other threads waiting on this mount 4230 * point, and die. It will start at the head of the server list 4231 * and poll servers until it finds one with an NFS server which is 4232 * registered and responds to a NULL procedure ping. 4233 * 4234 * XXX failover_thread is unsafe within the scope of the 4235 * present model defined for cpr to suspend the system. 4236 * Specifically, over-the-wire calls made by the thread 4237 * are unsafe. 
The thread needs to be reevaluated in case of 4238 * future updates to the cpr suspend model. 4239 */ 4240 static void 4241 failover_thread(mntinfo_t *mi) 4242 { 4243 servinfo_t *svp = NULL; 4244 CLIENT *cl; 4245 enum clnt_stat status; 4246 struct timeval tv; 4247 int error; 4248 int oncethru = 0; 4249 callb_cpr_t cprinfo; 4250 rnode_t *rp; 4251 int index; 4252 char *srvnames; 4253 size_t srvnames_len; 4254 struct nfs_clnt *nfscl = NULL; 4255 zoneid_t zoneid = getzoneid(); 4256 4257 #ifdef DEBUG 4258 /* 4259 * This is currently only needed to access counters which exist on 4260 * DEBUG kernels, hence we don't want to pay the penalty of the lookup 4261 * on non-DEBUG kernels. 4262 */ 4263 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4264 ASSERT(nfscl != NULL); 4265 #endif 4266 4267 /* 4268 * Its safe to piggyback on the mi_lock since failover_newserver() 4269 * code guarantees that there will be only one failover thread 4270 * per mountinfo at any instance. 4271 */ 4272 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr, 4273 "failover_thread"); 4274 4275 mutex_enter(&mi->mi_lock); 4276 while (mi->mi_readers) { 4277 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4278 cv_wait(&mi->mi_failover_cv, &mi->mi_lock); 4279 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4280 } 4281 mutex_exit(&mi->mi_lock); 4282 4283 tv.tv_sec = 2; 4284 tv.tv_usec = 0; 4285 4286 /* 4287 * Ping the null NFS procedure of every server in 4288 * the list until one responds. We always start 4289 * at the head of the list and always skip the one 4290 * that is current, since it's caused us a problem. 4291 */ 4292 while (svp == NULL) { 4293 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 4294 if (!oncethru && svp == mi->mi_curr_serv) 4295 continue; 4296 4297 /* 4298 * If the file system was forcibly umounted 4299 * while trying to do a failover, then just 4300 * give up on the failover. It won't matter 4301 * what the server is. 
4302 */ 4303 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 4304 svp = NULL; 4305 goto done; 4306 } 4307 4308 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 4309 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl); 4310 if (error) 4311 continue; 4312 4313 if (!(mi->mi_flags & MI_INT)) 4314 cl->cl_nosignal = TRUE; 4315 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 4316 xdr_void, NULL, tv); 4317 if (!(mi->mi_flags & MI_INT)) 4318 cl->cl_nosignal = FALSE; 4319 AUTH_DESTROY(cl->cl_auth); 4320 CLNT_DESTROY(cl); 4321 if (status == RPC_SUCCESS) { 4322 if (svp == mi->mi_curr_serv) { 4323 #ifdef DEBUG 4324 zcmn_err(zoneid, CE_NOTE, 4325 "NFS%d: failing over: selecting original server %s", 4326 mi->mi_vers, svp->sv_hostname); 4327 #else 4328 zcmn_err(zoneid, CE_NOTE, 4329 "NFS: failing over: selecting original server %s", 4330 svp->sv_hostname); 4331 #endif 4332 } else { 4333 #ifdef DEBUG 4334 zcmn_err(zoneid, CE_NOTE, 4335 "NFS%d: failing over from %s to %s", 4336 mi->mi_vers, 4337 mi->mi_curr_serv->sv_hostname, 4338 svp->sv_hostname); 4339 #else 4340 zcmn_err(zoneid, CE_NOTE, 4341 "NFS: failing over from %s to %s", 4342 mi->mi_curr_serv->sv_hostname, 4343 svp->sv_hostname); 4344 #endif 4345 } 4346 break; 4347 } 4348 } 4349 4350 if (svp == NULL) { 4351 if (!oncethru) { 4352 srvnames = nfs_getsrvnames(mi, &srvnames_len); 4353 #ifdef DEBUG 4354 zprintf(zoneid, 4355 "NFS%d servers %s not responding " 4356 "still trying\n", mi->mi_vers, srvnames); 4357 #else 4358 zprintf(zoneid, "NFS servers %s not responding " 4359 "still trying\n", srvnames); 4360 #endif 4361 oncethru = 1; 4362 } 4363 mutex_enter(&mi->mi_lock); 4364 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4365 mutex_exit(&mi->mi_lock); 4366 delay(hz); 4367 mutex_enter(&mi->mi_lock); 4368 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4369 mutex_exit(&mi->mi_lock); 4370 } 4371 } 4372 4373 if (oncethru) { 4374 #ifdef DEBUG 4375 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames); 4376 #else 4377 zprintf(zoneid, "NFS servers %s ok\n", srvnames); 4378 #endif 4379 } 4380 4381 if (svp != mi->mi_curr_serv) { 4382 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 4383 index = rtablehash(&mi->mi_curr_serv->sv_fhandle); 4384 rw_enter(&rtable[index].r_lock, RW_WRITER); 4385 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle, 4386 mi->mi_vfsp); 4387 if (rp != NULL) { 4388 if (rp->r_flags & RHASHED) 4389 rp_rmhash_locked(rp); 4390 rw_exit(&rtable[index].r_lock); 4391 rp->r_server = svp; 4392 rp->r_fh = svp->sv_fhandle; 4393 (void) nfs_free_data_reclaim(rp); 4394 index = rtablehash(&rp->r_fh); 4395 rp->r_hashq = &rtable[index]; 4396 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4397 vn_exists(RTOV(rp)); 4398 rp_addhash(rp); 4399 rw_exit(&rp->r_hashq->r_lock); 4400 VN_RELE(RTOV(rp)); 4401 } else 4402 rw_exit(&rtable[index].r_lock); 4403 } 4404 4405 done: 4406 if (oncethru) 4407 kmem_free(srvnames, srvnames_len); 4408 mutex_enter(&mi->mi_lock); 4409 mi->mi_flags &= ~MI_BINDINPROG; 4410 if (svp != NULL) { 4411 mi->mi_curr_serv = svp; 4412 mi->mi_failover++; 4413 #ifdef DEBUG 4414 nfscl->nfscl_stat.failover.value.ui64++; 4415 #endif 4416 } 4417 cv_broadcast(&mi->mi_failover_cv); 4418 CALLB_CPR_EXIT(&cprinfo); 4419 VFS_RELE(mi->mi_vfsp); 4420 zthread_exit(); 4421 /* NOTREACHED */ 4422 } 4423 4424 /* 4425 * NFS client failover support 4426 * 4427 * failover_wait() will put the thread to sleep until MI_BINDINPROG 4428 * is cleared, meaning that failover is complete. Called with 4429 * mi_lock mutex held. 
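 *
 * A minimal sketch of how a caller might coordinate with the failover
 * machinery (illustrative only, not copied from an actual caller):
 *
 *	failover_newserver(mi);		kicks off a search, or is a no-op
 *					if MI_BINDINPROG is already set
 *	mutex_enter(&mi->mi_lock);
 *	error = failover_wait(mi);	sleeps until MI_BINDINPROG clears
 *	mutex_exit(&mi->mi_lock);
 *	if (error == EINTR)
 *		the wait was interrupted by a signal; give up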
4430 */ 4431 static int 4432 failover_wait(mntinfo_t *mi) 4433 { 4434 k_sigset_t smask; 4435 4436 /* 4437 * If someone else is hunting for a living server, 4438 * sleep until it's done. After our sleep, we may 4439 * be bound to the right server and get off cheaply. 4440 */ 4441 while (mi->mi_flags & MI_BINDINPROG) { 4442 /* 4443 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 4444 * and SIGTERM, preserving the existing masks. 4445 * Mask out SIGINT if the mount option nointr is specified. 4446 */ 4447 sigintr(&smask, (int)mi->mi_flags & MI_INT); 4448 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) { 4449 /* 4450 * restore original signal mask 4451 */ 4452 sigunintr(&smask); 4453 return (EINTR); 4454 } 4455 /* 4456 * restore original signal mask 4457 */ 4458 sigunintr(&smask); 4459 } 4460 return (0); 4461 } 4462 4463 /* 4464 * NFS client failover support 4465 * 4466 * failover_remap() will do a partial pathname lookup and find the 4467 * desired vnode on the current server. The interim vnode will be 4468 * discarded after we pilfer the new filehandle. 4469 * 4470 * Side effects: 4471 * - This routine will also update the filehandle in the args structure 4472 * pointed to by the fi->fhp pointer if it is non-NULL. 4473 */ 4474 4475 static int 4476 failover_remap(failinfo_t *fi) 4477 { 4478 vnode_t *vp, *nvp, *rootvp; 4479 rnode_t *rp, *nrp; 4480 mntinfo_t *mi; 4481 int error; 4482 #ifdef DEBUG 4483 struct nfs_clnt *nfscl; 4484 4485 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4486 ASSERT(nfscl != NULL); 4487 #endif 4488 /* 4489 * Sanity check 4490 */ 4491 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL) 4492 return (EINVAL); 4493 vp = fi->vp; 4494 rp = VTOR(vp); 4495 mi = VTOMI(vp); 4496 4497 if (!(vp->v_flag & VROOT)) { 4498 /* 4499 * Given the root fh, use the path stored in 4500 * the rnode to find the fh for the new server. 4501 */ 4502 error = VFS_ROOT(mi->mi_vfsp, &rootvp); 4503 if (error) 4504 return (error); 4505 4506 error = failover_lookup(rp->r_path, rootvp, 4507 fi->lookupproc, fi->xattrdirproc, &nvp); 4508 4509 VN_RELE(rootvp); 4510 4511 if (error) 4512 return (error); 4513 4514 /* 4515 * If we found the same rnode, we're done now 4516 */ 4517 if (nvp == vp) { 4518 /* 4519 * We failed over, but the new server may physically be 4520 * the same or may share the same disk subsystem. In that 4521 * case the file handle for a particular file path is not 4522 * going to change, so a lookup with the same filehandle 4523 * will always locate the same rnode as the existing one. 4524 * All we might need to do is to update the r_server 4525 * with the current servinfo. 4526 */ 4527 if (!VALID_FH(fi)) { 4528 rp->r_server = mi->mi_curr_serv; 4529 } 4530 VN_RELE(nvp); 4531 return (0); 4532 } 4533 4534 /* 4535 * Try to make it so that no one else will find this 4536 * vnode because it is just a temporary to hold the 4537 * new file handle until that file handle can be 4538 * copied to the original vnode/rnode. 4539 */ 4540 nrp = VTOR(nvp); 4541 mutex_enter(&mi->mi_remap_lock); 4542 /* 4543 * Some other thread could have raced in here and could 4544 * have done the remap for this particular rnode before 4545 * this thread got here. Compare rp->r_server with 4546 * mi->mi_curr_serv and return if they are the same.
4547 */ 4548 if (VALID_FH(fi)) { 4549 mutex_exit(&mi->mi_remap_lock); 4550 VN_RELE(nvp); 4551 return (0); 4552 } 4553 4554 if (nrp->r_flags & RHASHED) 4555 rp_rmhash(nrp); 4556 4557 /* 4558 * As a heuristic check on the validity of the new 4559 * file, check that the size and type match against 4560 * what we remember from the old version. 4561 */ 4562 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) { 4563 mutex_exit(&mi->mi_remap_lock); 4564 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 4565 "NFS replicas %s and %s: file %s not same.", 4566 rp->r_server->sv_hostname, 4567 nrp->r_server->sv_hostname, rp->r_path); 4568 VN_RELE(nvp); 4569 return (EINVAL); 4570 } 4571 4572 /* 4573 * Snarf the filehandle from the new rnode, 4574 * then release it, while updating the 4575 * hash queues for the original rnode. 4576 */ 4577 if (rp->r_flags & RHASHED) 4578 rp_rmhash(rp); 4579 rp->r_server = mi->mi_curr_serv; 4580 rp->r_fh = nrp->r_fh; 4581 rp->r_hashq = nrp->r_hashq; 4582 /* 4583 * Copy the attributes from the new rnode to the old 4584 * rnode. This will help to reduce unnecessary page 4585 * cache flushes. 4586 */ 4587 rp->r_attr = nrp->r_attr; 4588 rp->r_attrtime = nrp->r_attrtime; 4589 rp->r_mtime = nrp->r_mtime; 4590 (void) nfs_free_data_reclaim(rp); 4591 nfs_setswaplike(vp, &rp->r_attr); 4592 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4593 rp_addhash(rp); 4594 rw_exit(&rp->r_hashq->r_lock); 4595 mutex_exit(&mi->mi_remap_lock); 4596 VN_RELE(nvp); 4597 } 4598 4599 /* 4600 * Update successful failover remap count 4601 */ 4602 mutex_enter(&mi->mi_lock); 4603 mi->mi_remap++; 4604 mutex_exit(&mi->mi_lock); 4605 #ifdef DEBUG 4606 nfscl->nfscl_stat.remap.value.ui64++; 4607 #endif 4608 4609 /* 4610 * If we have a copied filehandle to update, do it now. 4611 */ 4612 if (fi->fhp != NULL && fi->copyproc != NULL) 4613 (*fi->copyproc)(fi->fhp, vp); 4614 4615 return (0); 4616 } 4617 4618 /* 4619 * NFS client failover support 4620 * 4621 * We want a simple pathname lookup routine to parse the pieces 4622 * of path in rp->r_path. We know that the path was created 4623 * as rnodes were made, so we know we have only to deal with 4624 * paths that look like: 4625 * dir1/dir2/dir3/file 4626 * Any evidence of anything like "..", symlinks, or ENOTDIR 4627 * is a hard error, because it means something in this filesystem 4628 * is different from the one we came from, or has changed under 4629 * us in some way. If that is true, we want the failure. 4630 * 4631 * Extended attributes: if the filesystem is mounted with extended 4632 * attributes enabled (-o xattr), the attribute directory will be 4633 * represented in the r_path as the magic name XATTR_RPATH. So if 4634 * we see that name in the pathname, it must be because this node 4635 * is an extended attribute. Therefore, look it up that way.
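 *
 * For example (illustrative paths only): an r_path of "a/b/c" is resolved
 * with three successive calls to *lookupproc, each one starting from the
 * vnode returned by the previous call; a path such as "a/<XATTR_RPATH>/attr"
 * would instead use *xattrdirproc for the XATTR_RPATH component and then
 * continue with *lookupproc for the rest.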
4636 */ 4637 static int 4638 failover_lookup(char *path, vnode_t *root, 4639 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int, 4640 vnode_t *, cred_t *, int), 4641 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 4642 vnode_t **new) 4643 { 4644 vnode_t *dvp, *nvp; 4645 int error = EINVAL; 4646 char *s, *p, *tmppath; 4647 size_t len; 4648 mntinfo_t *mi; 4649 bool_t xattr; 4650 4651 /* Make local copy of path */ 4652 len = strlen(path) + 1; 4653 tmppath = kmem_alloc(len, KM_SLEEP); 4654 (void) strcpy(tmppath, path); 4655 s = tmppath; 4656 4657 dvp = root; 4658 VN_HOLD(dvp); 4659 mi = VTOMI(root); 4660 xattr = mi->mi_flags & MI_EXTATTR; 4661 4662 do { 4663 p = strchr(s, '/'); 4664 if (p != NULL) 4665 *p = '\0'; 4666 if (xattr && strcmp(s, XATTR_RPATH) == 0) { 4667 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(), 4668 RFSCALL_SOFT); 4669 } else { 4670 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL, 4671 CRED(), RFSCALL_SOFT); 4672 } 4673 if (p != NULL) 4674 *p++ = '/'; 4675 if (error) { 4676 VN_RELE(dvp); 4677 kmem_free(tmppath, len); 4678 return (error); 4679 } 4680 s = p; 4681 VN_RELE(dvp); 4682 dvp = nvp; 4683 } while (p != NULL); 4684 4685 if (nvp != NULL && new != NULL) 4686 *new = nvp; 4687 kmem_free(tmppath, len); 4688 return (0); 4689 } 4690 4691 /* 4692 * NFS client failover support 4693 * 4694 * sv_free() frees the malloc'd portion of a "servinfo_t". 4695 */ 4696 void 4697 sv_free(servinfo_t *svp) 4698 { 4699 servinfo_t *next; 4700 struct knetconfig *knconf; 4701 4702 while (svp != NULL) { 4703 next = svp->sv_next; 4704 if (svp->sv_secdata) 4705 sec_clnt_freeinfo(svp->sv_secdata); 4706 if (svp->sv_hostname && svp->sv_hostnamelen > 0) 4707 kmem_free(svp->sv_hostname, svp->sv_hostnamelen); 4708 knconf = svp->sv_knconf; 4709 if (knconf != NULL) { 4710 if (knconf->knc_protofmly != NULL) 4711 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4712 if (knconf->knc_proto != NULL) 4713 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4714 kmem_free(knconf, sizeof (*knconf)); 4715 } 4716 knconf = svp->sv_origknconf; 4717 if (knconf != NULL) { 4718 if (knconf->knc_protofmly != NULL) 4719 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4720 if (knconf->knc_proto != NULL) 4721 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4722 kmem_free(knconf, sizeof (*knconf)); 4723 } 4724 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0) 4725 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen); 4726 mutex_destroy(&svp->sv_lock); 4727 kmem_free(svp, sizeof (*svp)); 4728 svp = next; 4729 } 4730 } 4731 4732 /* 4733 * Can only return non-zero if intr != 0. 4734 */ 4735 int 4736 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr) 4737 { 4738 4739 mutex_enter(&l->lock); 4740 4741 /* 4742 * If this is a nested enter, then allow it. There 4743 * must be as many exits as there were enters. 4744 */ 4745 if (l->owner == curthread) { 4746 /* lock is held for writing by current thread */ 4747 ASSERT(rw == RW_READER || rw == RW_WRITER); 4748 l->count--; 4749 } else if (rw == RW_READER) { 4750 /* 4751 * While there is a writer active or writers waiting, 4752 * wait for them to finish up and move on. Then, 4753 * increment the count to indicate that a reader is 4754 * active.
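 *
 * (For reference, the count field encodes the lock state: count > 0
 * means that many readers hold the lock, count < 0 means a writer
 * holds it, possibly nested, and count == 0 means the lock is free;
 * owner records the writing thread and waiters counts threads blocked
 * waiting for a write lock.)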
4755 */ 4756 while (l->count < 0 || l->waiters > 0) { 4757 if (intr) { 4758 klwp_t *lwp = ttolwp(curthread); 4759 4760 if (lwp != NULL) 4761 lwp->lwp_nostop++; 4762 if (cv_wait_sig(&l->cv, &l->lock) == 0) { 4763 if (lwp != NULL) 4764 lwp->lwp_nostop--; 4765 mutex_exit(&l->lock); 4766 return (EINTR); 4767 } 4768 if (lwp != NULL) 4769 lwp->lwp_nostop--; 4770 } else 4771 cv_wait(&l->cv, &l->lock); 4772 4773 /* 4774 * If there are no readers active and no writer active, 4775 * we need to wake up the next waiter. If there is a 4776 * writer waiting, we will wait again, so we need to wake 4777 * up the next waiter (possibly a writer). If there is 4778 * no writer waiting, we need to wake up the next 4779 * waiting reader (if any) so it is invited to the 4780 * party. 4781 */ 4782 if (l->count == 0) 4783 cv_signal(&l->cv); 4784 4785 /* 4786 * If there are readers active and no writers waiting 4787 * then wake up the next waiting reader (if any). 4788 */ 4789 if (l->count > 0 && l->waiters == 0) 4790 cv_signal(&l->cv); 4791 } 4792 ASSERT(l->count < INT_MAX); 4793 #ifdef DEBUG 4794 if ((l->count % 10000) == 9999) 4795 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on " 4796 "rwlock @ %p\n", l->count, (void *)l); 4797 #endif 4798 l->count++; 4799 } else { 4800 ASSERT(rw == RW_WRITER); 4801 /* 4802 * While there are readers active or a writer 4803 * active, wait for all of the readers 4804 * to finish or for the writer to finish. 4805 * Then, set the owner field to curthread and 4806 * decrement count to indicate that a writer 4807 * is active. 4808 */ 4809 while (l->count != 0) { 4810 l->waiters++; 4811 if (intr) { 4812 klwp_t *lwp = ttolwp(curthread); 4813 4814 if (lwp != NULL) 4815 lwp->lwp_nostop++; 4816 if (cv_wait_sig(&l->cv, &l->lock) == 0) { 4817 if (lwp != NULL) 4818 lwp->lwp_nostop--; 4819 l->waiters--; 4820 /* 4821 * If there are readers active and no 4822 * writers waiting then wake up the 4823 * next waiting reader (if any). 4824 */ 4825 if (l->count > 0 && l->waiters == 0) 4826 cv_signal(&l->cv); 4827 mutex_exit(&l->lock); 4828 return (EINTR); 4829 } 4830 if (lwp != NULL) 4831 lwp->lwp_nostop--; 4832 } else 4833 cv_wait(&l->cv, &l->lock); 4834 l->waiters--; 4835 } 4836 ASSERT(l->owner == NULL); 4837 l->owner = curthread; 4838 l->count--; 4839 } 4840 4841 mutex_exit(&l->lock); 4842 4843 return (0); 4844 } 4845 4846 /* 4847 * If the lock is available, obtain it and return non-zero. If there is 4848 * already a conflicting lock, return 0 immediately. 4849 */ 4850 4851 int 4852 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw) 4853 { 4854 mutex_enter(&l->lock); 4855 4856 /* 4857 * If this is a nested enter, then allow it. There 4858 * must be as many exits as there were enters. 4859 */ 4860 if (l->owner == curthread) { 4861 /* lock is held for writing by current thread */ 4862 ASSERT(rw == RW_READER || rw == RW_WRITER); 4863 l->count--; 4864 } else if (rw == RW_READER) { 4865 /* 4866 * If there is a writer active or writers waiting, deny the 4867 * lock. Otherwise, bump the count of readers. 4868 */ 4869 if (l->count < 0 || l->waiters > 0) { 4870 mutex_exit(&l->lock); 4871 return (0); 4872 } 4873 l->count++; 4874 } else { 4875 ASSERT(rw == RW_WRITER); 4876 /* 4877 * If there are readers active or a writer active, deny the 4878 * lock. Otherwise, set the owner field to curthread and 4879 * decrement count to indicate that a writer is active.
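 *
 * When nfs_rw_tryenter() denies the lock, a caller typically falls back
 * to the blocking entry point. A sketch of such a caller (hypothetical;
 * "lk" is assumed to have been set up elsewhere with nfs_rw_init()):
 *
 *	nfs_rwlock_t lk;
 *
 *	if (nfs_rw_tryenter(&lk, RW_WRITER) == 0) {
 *		if (nfs_rw_enter_sig(&lk, RW_WRITER, 1) == EINTR)
 *			return (EINTR);	interrupted by a signal
 *	}
 *	... code protected by the write lock ...
 *	nfs_rw_exit(&lk);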
4880 */ 4881 if (l->count != 0) { 4882 mutex_exit(&l->lock); 4883 return (0); 4884 } 4885 ASSERT(l->owner == NULL); 4886 l->owner = curthread; 4887 l->count--; 4888 } 4889 4890 mutex_exit(&l->lock); 4891 4892 return (1); 4893 } 4894 4895 void 4896 nfs_rw_exit(nfs_rwlock_t *l) 4897 { 4898 4899 mutex_enter(&l->lock); 4900 /* 4901 * If this is releasing a writer lock, then increment count to 4902 * indicate that there is one less writer active. If this was 4903 * the last of possibly nested writer locks, then clear the owner 4904 * field as well to indicate that there is no writer active 4905 * and wakeup the first waiting writer or reader. 4906 * 4907 * If releasing a reader lock, then just decrement count to 4908 * indicate that there is one less reader active. If this was 4909 * the last active reader and there are writer(s) waiting, 4910 * then wake up the first. 4911 */ 4912 if (l->owner != NULL) { 4913 ASSERT(l->owner == curthread); 4914 l->count++; 4915 if (l->count == 0) { 4916 l->owner = NULL; 4917 cv_signal(&l->cv); 4918 } 4919 } else { 4920 ASSERT(l->count > 0); 4921 l->count--; 4922 if (l->count == 0 && l->waiters > 0) 4923 cv_signal(&l->cv); 4924 } 4925 mutex_exit(&l->lock); 4926 } 4927 4928 int 4929 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw) 4930 { 4931 4932 if (rw == RW_READER) 4933 return (l->count > 0); 4934 ASSERT(rw == RW_WRITER); 4935 return (l->count < 0); 4936 } 4937 4938 /* ARGSUSED */ 4939 void 4940 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg) 4941 { 4942 4943 l->count = 0; 4944 l->waiters = 0; 4945 l->owner = NULL; 4946 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL); 4947 cv_init(&l->cv, NULL, CV_DEFAULT, NULL); 4948 } 4949 4950 void 4951 nfs_rw_destroy(nfs_rwlock_t *l) 4952 { 4953 4954 mutex_destroy(&l->lock); 4955 cv_destroy(&l->cv); 4956 } 4957 4958 int 4959 nfs3_rddir_compar(const void *x, const void *y) 4960 { 4961 rddir_cache *a = (rddir_cache *)x; 4962 rddir_cache *b = (rddir_cache *)y; 4963 4964 if (a->nfs3_cookie == b->nfs3_cookie) { 4965 if (a->buflen == b->buflen) 4966 return (0); 4967 if (a->buflen < b->buflen) 4968 return (-1); 4969 return (1); 4970 } 4971 4972 if (a->nfs3_cookie < b->nfs3_cookie) 4973 return (-1); 4974 4975 return (1); 4976 } 4977 4978 int 4979 nfs_rddir_compar(const void *x, const void *y) 4980 { 4981 rddir_cache *a = (rddir_cache *)x; 4982 rddir_cache *b = (rddir_cache *)y; 4983 4984 if (a->nfs_cookie == b->nfs_cookie) { 4985 if (a->buflen == b->buflen) 4986 return (0); 4987 if (a->buflen < b->buflen) 4988 return (-1); 4989 return (1); 4990 } 4991 4992 if (a->nfs_cookie < b->nfs_cookie) 4993 return (-1); 4994 4995 return (1); 4996 } 4997 4998 static char * 4999 nfs_getsrvnames(mntinfo_t *mi, size_t *len) 5000 { 5001 servinfo_t *s; 5002 char *srvnames; 5003 char *namep; 5004 size_t length; 5005 5006 /* 5007 * Calculate the length of the string required to hold all 5008 * of the server names plus either a comma or a null 5009 * character following each individual one. 5010 */ 5011 length = 0; 5012 for (s = mi->mi_servers; s != NULL; s = s->sv_next) 5013 length += s->sv_hostnamelen; 5014 5015 srvnames = kmem_alloc(length, KM_SLEEP); 5016 5017 namep = srvnames; 5018 for (s = mi->mi_servers; s != NULL; s = s->sv_next) { 5019 (void) strcpy(namep, s->sv_hostname); 5020 namep += s->sv_hostnamelen - 1; 5021 *namep++ = ','; 5022 } 5023 *--namep = '\0'; 5024 5025 *len = length; 5026 5027 return (srvnames); 5028 } 5029 5030 /* 5031 * These two functions are temporary and designed for the upgrade-workaround 5032 * only. 
They cannot be used for general zone-crossing NFS client support, and 5033 * will be removed shortly. 5034 * 5035 * When the workaround is enabled, all NFS traffic is forced into the global 5036 * zone. These functions are called when the code needs to refer to the state 5037 * of the underlying network connection. They're not called when the function 5038 * needs to refer to the state of the process that invoked the system call. 5039 * (E.g., when checking whether the zone is shutting down during the mount() 5040 * call.) 5041 */ 5042 5043 struct zone * 5044 nfs_zone(void) 5045 { 5046 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone); 5047 } 5048 5049 zoneid_t 5050 nfs_zoneid(void) 5051 { 5052 return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid()); 5053 } 5054 5055 /* 5056 * nfs_mount_label_policy: 5057 * Determine whether the mount is allowed according to the MAC check, 5058 * by comparing (where appropriate) the label of the remote server 5059 * against the label of the zone being mounted into. 5060 * 5061 * Returns: 5062 * 0 : access allowed 5063 * -1 : read-only access allowed (i.e., read-down) 5064 * >0 : error code, such as EACCES 5065 */ 5066 int 5067 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr, 5068 struct knetconfig *knconf, cred_t *cr) 5069 { 5070 int addr_type; 5071 void *ipaddr; 5072 bslabel_t *server_sl, *mntlabel; 5073 zone_t *mntzone = NULL; 5074 ts_label_t *zlabel; 5075 tsol_tpc_t *tp; 5076 ts_label_t *tsl = NULL; 5077 int retv; 5078 5079 /* 5080 * Get the zone's label. Each zone on a labeled system has a label. 5081 */ 5082 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 5083 zlabel = mntzone->zone_slabel; 5084 ASSERT(zlabel != NULL); 5085 label_hold(zlabel); 5086 5087 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) { 5088 addr_type = IPV4_VERSION; 5089 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr; 5090 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) { 5091 addr_type = IPV6_VERSION; 5092 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr; 5093 } else { 5094 retv = 0; 5095 goto out; 5096 } 5097 5098 retv = EACCES; /* assume the worst */ 5099 5100 /* 5101 * Next, get the assigned label of the remote server. 5102 */ 5103 tp = find_tpc(ipaddr, addr_type, B_FALSE); 5104 if (tp == NULL) 5105 goto out; /* error getting host entry */ 5106 5107 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi) 5108 goto rel_tpc; /* invalid domain */ 5109 if ((tp->tpc_tp.host_type != SUN_CIPSO) && 5110 (tp->tpc_tp.host_type != UNLABELED)) 5111 goto rel_tpc; /* invalid host type */ 5112 5113 if (tp->tpc_tp.host_type == SUN_CIPSO) { 5114 tsl = getflabel_cipso(vfsp); 5115 if (tsl == NULL) 5116 goto rel_tpc; /* error getting server lbl */ 5117 5118 server_sl = label2bslabel(tsl); 5119 } else { /* UNLABELED */ 5120 server_sl = &tp->tpc_tp.tp_def_label; 5121 } 5122 5123 mntlabel = label2bslabel(zlabel); 5124 5125 /* 5126 * Now compare labels to complete the MAC check. If the labels 5127 * are equal or if the requestor is in the global zone and has 5128 * NET_MAC_AWARE, then allow read-write access. (Except for 5129 * mounts into the global zone itself; restrict these to 5130 * read-only.) 5131 * 5132 * If the requestor is in some other zone, but its label 5133 * dominates the server's label, then allow read-down. 5134 * 5135 * Otherwise, access is denied.
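 *
 * For example (illustrative labels): if the zone and the server are both
 * labeled CONFIDENTIAL, the mount gets read-write access (0), unless it
 * is a mount into the global zone itself, which is limited to read-only
 * (-1); if the zone is SECRET and the server CONFIDENTIAL, the zone's
 * label dominates and the mount is read-only (-1); if the zone is
 * CONFIDENTIAL and the server SECRET, the mount is denied with EACCES.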
5136 */ 5137 if (blequal(mntlabel, server_sl) || 5138 (crgetzoneid(cr) == GLOBAL_ZONEID && 5139 getpflags(NET_MAC_AWARE, cr) != 0)) { 5140 if ((mntzone == global_zone) || 5141 !blequal(mntlabel, server_sl)) 5142 retv = -1; /* read-only */ 5143 else 5144 retv = 0; /* access OK */ 5145 } else if (bldominates(mntlabel, server_sl)) { 5146 retv = -1; /* read-only */ 5147 } else { 5148 retv = EACCES; 5149 } 5150 5151 if (tsl != NULL) 5152 label_rele(tsl); 5153 5154 rel_tpc: 5155 TPC_RELE(tp); 5156 out: 5157 if (mntzone) 5158 zone_rele(mntzone); 5159 label_rele(zlabel); 5160 return (retv); 5161 } 5162 5163 boolean_t 5164 nfs_has_ctty(void) 5165 { 5166 boolean_t rv; 5167 mutex_enter(&curproc->p_splock); 5168 rv = (curproc->p_sessp->s_vp != NULL); 5169 mutex_exit(&curproc->p_splock); 5170 return (rv); 5171 } 5172 5173 /* 5174 * Look in the xattr directory to see if it has any generic user attributes 5175 */ 5176 int 5177 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr) 5178 { 5179 struct uio uio; 5180 struct iovec iov; 5181 char *dbuf; 5182 struct dirent64 *dp; 5183 size_t dlen = 8 * 1024; 5184 size_t dbuflen; 5185 int eof = 0; 5186 int error; 5187 5188 *valp = 0; 5189 dbuf = kmem_alloc(dlen, KM_SLEEP); 5190 uio.uio_iov = &iov; 5191 uio.uio_iovcnt = 1; 5192 uio.uio_segflg = UIO_SYSSPACE; 5193 uio.uio_fmode = 0; 5194 uio.uio_extflg = UIO_COPY_CACHED; 5195 uio.uio_loffset = 0; 5196 uio.uio_resid = dlen; 5197 iov.iov_base = dbuf; 5198 iov.iov_len = dlen; 5199 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); 5200 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0); 5201 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); 5202 5203 dbuflen = dlen - uio.uio_resid; 5204 5205 if (error || dbuflen == 0) { 5206 kmem_free(dbuf, dlen); 5207 return (error); 5208 } 5209 5210 dp = (dirent64_t *)dbuf; 5211 5212 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) { 5213 if (strcmp(dp->d_name, ".") == 0 || 5214 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name, 5215 VIEW_READWRITE) == 0 || strcmp(dp->d_name, 5216 VIEW_READONLY) == 0) { 5217 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen); 5218 continue; 5219 } 5220 5221 *valp = 1; 5222 break; 5223 } 5224 kmem_free(dbuf, dlen); 5225 return (0); 5226 } 5227
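/*
 * A sketch of how do_xattr_exists_check() might be used by a caller that
 * needs to answer an "are there any real extended attributes?" question
 * (hypothetical caller; the vnode passed in must already be the extended
 * attribute directory):
 *
 *	ulong_t val;
 *
 *	if (do_xattr_exists_check(xattr_dvp, &val, cr) == 0 && val != 0)
 *		the file has at least one user-visible extended attribute
 */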