1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 28 */ 29 30 #include <sys/param.h> 31 #include <sys/types.h> 32 #include <sys/systm.h> 33 #include <sys/cred.h> 34 #include <sys/proc.h> 35 #include <sys/user.h> 36 #include <sys/time.h> 37 #include <sys/buf.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/socket.h> 41 #include <sys/uio.h> 42 #include <sys/tiuser.h> 43 #include <sys/swap.h> 44 #include <sys/errno.h> 45 #include <sys/debug.h> 46 #include <sys/kmem.h> 47 #include <sys/kstat.h> 48 #include <sys/cmn_err.h> 49 #include <sys/vtrace.h> 50 #include <sys/session.h> 51 #include <sys/dnlc.h> 52 #include <sys/bitmap.h> 53 #include <sys/acl.h> 54 #include <sys/ddi.h> 55 #include <sys/pathname.h> 56 #include <sys/flock.h> 57 #include <sys/dirent.h> 58 #include <sys/flock.h> 59 #include <sys/callb.h> 60 #include <sys/atomic.h> 61 #include <sys/list.h> 62 #include <sys/tsol/tnet.h> 63 #include <sys/priv.h> 64 #include <sys/sdt.h> 65 #include <sys/attr.h> 66 67 #include <inet/ip6.h> 68 69 #include <rpc/types.h> 70 #include <rpc/xdr.h> 71 #include <rpc/auth.h> 72 #include <rpc/clnt.h> 73 74 #include <nfs/nfs.h> 75 #include <nfs/nfs4.h> 76 #include <nfs/nfs_clnt.h> 77 #include <nfs/rnode.h> 78 #include <nfs/nfs_acl.h> 79 80 #include <sys/tsol/label.h> 81 82 /* 83 * The hash queues for the access to active and cached rnodes 84 * are organized as doubly linked lists. A reader/writer lock 85 * for each hash bucket is used to control access and to synchronize 86 * lookups, additions, and deletions from the hash queue. 87 * 88 * The rnode freelist is organized as a doubly linked list with 89 * a head pointer. Additions and deletions are synchronized via 90 * a single mutex. 91 * 92 * In order to add an rnode to the free list, it must be hashed into 93 * a hash queue and the exclusive lock to the hash queue be held. 94 * If an rnode is not hashed into a hash queue, then it is destroyed 95 * because it represents no valuable information that can be reused 96 * about the file. The exclusive lock to the hash queue must be 97 * held in order to prevent a lookup in the hash queue from finding 98 * the rnode and using it and assuming that the rnode is not on the 99 * freelist. The lookup in the hash queue will have the hash queue 100 * locked, either exclusive or shared. 101 * 102 * The vnode reference count for each rnode is not allowed to drop 103 * below 1. 
This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * client side statistics
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system as a
 * whole and do not correspond to any one particular zone.
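 *
 * A small illustration of the split (these are the counter-bumping
 * patterns used later in this file, not new interfaces): the per-zone
 * counters are reached through the zone's nfs_clnt structure, while the
 * DEBUG-only counters declared just below are updated directly on the
 * global struct:
 *
 *	nfscl->nfscl_stat.clgets.value.ui64++;		(see clget_impl())
 *	clstat_debug.clreclaim.value.ui64++;		(see clreclaim())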
187 */ 188 #ifdef DEBUG 189 static struct clstat_debug { 190 kstat_named_t nrnode; /* number of allocated rnodes */ 191 kstat_named_t access; /* size of access cache */ 192 kstat_named_t dirent; /* size of readdir cache */ 193 kstat_named_t dirents; /* size of readdir buf cache */ 194 kstat_named_t reclaim; /* number of reclaims */ 195 kstat_named_t clreclaim; /* number of cl reclaims */ 196 kstat_named_t f_reclaim; /* number of free reclaims */ 197 kstat_named_t a_reclaim; /* number of active reclaims */ 198 kstat_named_t r_reclaim; /* number of rnode reclaims */ 199 kstat_named_t rpath; /* bytes used to store rpaths */ 200 } clstat_debug = { 201 { "nrnode", KSTAT_DATA_UINT64 }, 202 { "access", KSTAT_DATA_UINT64 }, 203 { "dirent", KSTAT_DATA_UINT64 }, 204 { "dirents", KSTAT_DATA_UINT64 }, 205 { "reclaim", KSTAT_DATA_UINT64 }, 206 { "clreclaim", KSTAT_DATA_UINT64 }, 207 { "f_reclaim", KSTAT_DATA_UINT64 }, 208 { "a_reclaim", KSTAT_DATA_UINT64 }, 209 { "r_reclaim", KSTAT_DATA_UINT64 }, 210 { "r_path", KSTAT_DATA_UINT64 }, 211 }; 212 #endif /* DEBUG */ 213 214 /* 215 * We keep a global list of per-zone client data, so we can clean up all zones 216 * if we get low on memory. 217 */ 218 static list_t nfs_clnt_list; 219 static kmutex_t nfs_clnt_list_lock; 220 static zone_key_t nfsclnt_zone_key; 221 222 static struct kmem_cache *chtab_cache; 223 224 /* 225 * Some servers do not properly update the attributes of the 226 * directory when changes are made. To allow interoperability 227 * with these broken servers, the nfs_disable_rddir_cache 228 * parameter must be set in /etc/system 229 */ 230 int nfs_disable_rddir_cache = 0; 231 232 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **, 233 struct chtab **); 234 void clfree(CLIENT *, struct chtab *); 235 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 236 struct chtab **, struct nfs_clnt *); 237 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 238 struct chtab **, struct nfs_clnt *); 239 static void clreclaim(void *); 240 static int nfs_feedback(int, int, mntinfo_t *); 241 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 242 caddr_t, cred_t *, int *, enum clnt_stat *, int, 243 failinfo_t *); 244 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 245 caddr_t, cred_t *, int *, int, failinfo_t *); 246 static void rinactive(rnode_t *, cred_t *); 247 static int rtablehash(nfs_fhandle *); 248 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *, 249 struct vnodeops *, 250 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 251 cred_t *), 252 int (*)(const void *, const void *), int *, cred_t *, 253 char *, char *); 254 static void rp_rmfree(rnode_t *); 255 static void rp_addhash(rnode_t *); 256 static void rp_rmhash_locked(rnode_t *); 257 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *); 258 static void destroy_rnode(rnode_t *); 259 static void rddir_cache_free(rddir_cache *); 260 static int nfs_free_data_reclaim(rnode_t *); 261 static int nfs_active_data_reclaim(rnode_t *); 262 static int nfs_free_reclaim(void); 263 static int nfs_active_reclaim(void); 264 static int nfs_rnode_reclaim(void); 265 static void nfs_reclaim(void *); 266 static int failover_safe(failinfo_t *); 267 static void failover_newserver(mntinfo_t *mi); 268 static void failover_thread(mntinfo_t *mi); 269 static int failover_wait(mntinfo_t *); 270 static int failover_remap(failinfo_t *); 271 static int failover_lookup(char *, vnode_t *, 272 int (*)(vnode_t *, 
char *, vnode_t **, 273 struct pathname *, int, vnode_t *, cred_t *, int), 274 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 275 vnode_t **); 276 static void nfs_free_r_path(rnode_t *); 277 static void nfs_set_vroot(vnode_t *); 278 static char *nfs_getsrvnames(mntinfo_t *, size_t *); 279 280 /* 281 * from rpcsec module (common/rpcsec) 282 */ 283 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); 284 extern void sec_clnt_freeh(AUTH *); 285 extern void sec_clnt_freeinfo(struct sec_data *); 286 287 /* 288 * used in mount policy 289 */ 290 extern ts_label_t *getflabel_cipso(vfs_t *); 291 292 /* 293 * EIO or EINTR are not recoverable errors. 294 */ 295 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) 296 297 #ifdef DEBUG 298 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n" 299 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n" 300 #else 301 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n" 302 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n" 303 #endif 304 /* 305 * Common handle get program for NFS, NFS ACL, and NFS AUTH client. 306 */ 307 static int 308 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 309 struct chtab **chp, struct nfs_clnt *nfscl) 310 { 311 struct chhead *ch, *newch; 312 struct chhead **plistp; 313 struct chtab *cp; 314 int error; 315 k_sigset_t smask; 316 317 if (newcl == NULL || chp == NULL || ci == NULL) 318 return (EINVAL); 319 320 *newcl = NULL; 321 *chp = NULL; 322 323 /* 324 * Find an unused handle or create one 325 */ 326 newch = NULL; 327 nfscl->nfscl_stat.clgets.value.ui64++; 328 top: 329 /* 330 * Find the correct entry in the cache to check for free 331 * client handles. The search is based on the RPC program 332 * number, program version number, dev_t for the transport 333 * device, and the protocol family. 334 */ 335 mutex_enter(&nfscl->nfscl_chtable_lock); 336 plistp = &nfscl->nfscl_chtable; 337 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 338 if (ch->ch_prog == ci->cl_prog && 339 ch->ch_vers == ci->cl_vers && 340 ch->ch_dev == svp->sv_knconf->knc_rdev && 341 (strcmp(ch->ch_protofmly, 342 svp->sv_knconf->knc_protofmly) == 0)) 343 break; 344 plistp = &ch->ch_next; 345 } 346 347 /* 348 * If we didn't find a cache entry for this quadruple, then 349 * create one. If we don't have one already preallocated, 350 * then drop the cache lock, create one, and then start over. 351 * If we did have a preallocated entry, then just add it to 352 * the front of the list. 353 */ 354 if (ch == NULL) { 355 if (newch == NULL) { 356 mutex_exit(&nfscl->nfscl_chtable_lock); 357 newch = kmem_alloc(sizeof (*newch), KM_SLEEP); 358 newch->ch_timesused = 0; 359 newch->ch_prog = ci->cl_prog; 360 newch->ch_vers = ci->cl_vers; 361 newch->ch_dev = svp->sv_knconf->knc_rdev; 362 newch->ch_protofmly = kmem_alloc( 363 strlen(svp->sv_knconf->knc_protofmly) + 1, 364 KM_SLEEP); 365 (void) strcpy(newch->ch_protofmly, 366 svp->sv_knconf->knc_protofmly); 367 newch->ch_list = NULL; 368 goto top; 369 } 370 ch = newch; 371 newch = NULL; 372 ch->ch_next = nfscl->nfscl_chtable; 373 nfscl->nfscl_chtable = ch; 374 /* 375 * We found a cache entry, but if it isn't on the front of the 376 * list, then move it to the front of the list to try to take 377 * advantage of locality of operations. 
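 *
 * As a concrete illustration (values are hypothetical, for one NFS
 * Version 3 mount over TCP): the quadruple being matched above would
 * look roughly like
 *
 *	ch_prog      == NFS_PROGRAM
 *	ch_vers      == 3
 *	ch_dev       == the rdev of the "tcp" transport device
 *	ch_protofmly == "inet"
 *
 * so repeated clget calls from that mount find their entry at (or are
 * moved to) the head of the list.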
378 */ 379 } else if (ch != nfscl->nfscl_chtable) { 380 *plistp = ch->ch_next; 381 ch->ch_next = nfscl->nfscl_chtable; 382 nfscl->nfscl_chtable = ch; 383 } 384 385 /* 386 * If there was a free client handle cached, then remove it 387 * from the list, init it, and use it. 388 */ 389 if (ch->ch_list != NULL) { 390 cp = ch->ch_list; 391 ch->ch_list = cp->ch_list; 392 mutex_exit(&nfscl->nfscl_chtable_lock); 393 if (newch != NULL) { 394 kmem_free(newch->ch_protofmly, 395 strlen(newch->ch_protofmly) + 1); 396 kmem_free(newch, sizeof (*newch)); 397 } 398 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, 399 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); 400 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 401 &cp->ch_client->cl_auth); 402 if (error || cp->ch_client->cl_auth == NULL) { 403 CLNT_DESTROY(cp->ch_client); 404 kmem_cache_free(chtab_cache, cp); 405 return ((error != 0) ? error : EINTR); 406 } 407 ch->ch_timesused++; 408 *newcl = cp->ch_client; 409 *chp = cp; 410 return (0); 411 } 412 413 /* 414 * There weren't any free client handles which fit, so allocate 415 * a new one and use that. 416 */ 417 #ifdef DEBUG 418 atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64); 419 #endif 420 mutex_exit(&nfscl->nfscl_chtable_lock); 421 422 nfscl->nfscl_stat.cltoomany.value.ui64++; 423 if (newch != NULL) { 424 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); 425 kmem_free(newch, sizeof (*newch)); 426 } 427 428 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); 429 cp->ch_head = ch; 430 431 sigintr(&smask, (int)ci->cl_flags & MI_INT); 432 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, 433 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); 434 sigunintr(&smask); 435 436 if (error != 0) { 437 kmem_cache_free(chtab_cache, cp); 438 #ifdef DEBUG 439 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64); 440 #endif 441 /* 442 * Warning is unnecessary if error is EINTR. 443 */ 444 if (error != EINTR) { 445 nfs_cmn_err(error, CE_WARN, 446 "clget: couldn't create handle: %m\n"); 447 } 448 return (error); 449 } 450 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); 451 auth_destroy(cp->ch_client->cl_auth); 452 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 453 &cp->ch_client->cl_auth); 454 if (error || cp->ch_client->cl_auth == NULL) { 455 CLNT_DESTROY(cp->ch_client); 456 kmem_cache_free(chtab_cache, cp); 457 #ifdef DEBUG 458 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64); 459 #endif 460 return ((error != 0) ? error : EINTR); 461 } 462 ch->ch_timesused++; 463 *newcl = cp->ch_client; 464 ASSERT(cp->ch_client->cl_nosignal == FALSE); 465 *chp = cp; 466 return (0); 467 } 468 469 int 470 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 471 struct chtab **chp) 472 { 473 struct nfs_clnt *nfscl; 474 475 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 476 ASSERT(nfscl != NULL); 477 478 return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); 479 } 480 481 static int 482 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 483 struct chtab **chp, struct nfs_clnt *nfscl) 484 { 485 clinfo_t ci; 486 int error; 487 488 /* 489 * Set read buffer size to rsize 490 * and add room for RPC headers. 491 */ 492 ci.cl_readsize = mi->mi_tsize; 493 if (ci.cl_readsize != 0) 494 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 495 496 /* 497 * If soft mount and server is down just try once. 498 * meaning: do not retransmit. 
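 *
 * For example (hypothetical mount options): on a filesystem mounted
 * with -o soft,retrans=5 whose server is already marked MI_DOWN,
 * cl_retrans is forced to 0 below so the request is sent once and the
 * failure is reported quickly; a hard mount keeps using mi_retrans.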
499 */ 500 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 501 ci.cl_retrans = 0; 502 else 503 ci.cl_retrans = mi->mi_retrans; 504 505 ci.cl_prog = NFS_ACL_PROGRAM; 506 ci.cl_vers = mi->mi_vers; 507 ci.cl_flags = mi->mi_flags; 508 509 /* 510 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 511 * security flavor, the client tries to establish a security context 512 * by contacting the server. If the connection is timed out or reset, 513 * e.g. server reboot, we will try again. 514 */ 515 do { 516 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 517 518 if (error == 0) 519 break; 520 521 /* 522 * For forced unmount or zone shutdown, bail out, no retry. 523 */ 524 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 525 error = EIO; 526 break; 527 } 528 529 /* do not retry for softmount */ 530 if (!(mi->mi_flags & MI_HARD)) 531 break; 532 533 /* let the caller deal with the failover case */ 534 if (FAILOVER_MOUNT(mi)) 535 break; 536 537 } while (error == ETIMEDOUT || error == ECONNRESET); 538 539 return (error); 540 } 541 542 static int 543 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 544 struct chtab **chp, struct nfs_clnt *nfscl) 545 { 546 clinfo_t ci; 547 int error; 548 549 /* 550 * Set read buffer size to rsize 551 * and add room for RPC headers. 552 */ 553 ci.cl_readsize = mi->mi_tsize; 554 if (ci.cl_readsize != 0) 555 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 556 557 /* 558 * If soft mount and server is down just try once. 559 * meaning: do not retransmit. 560 */ 561 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 562 ci.cl_retrans = 0; 563 else 564 ci.cl_retrans = mi->mi_retrans; 565 566 ci.cl_prog = mi->mi_prog; 567 ci.cl_vers = mi->mi_vers; 568 ci.cl_flags = mi->mi_flags; 569 570 /* 571 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 572 * security flavor, the client tries to establish a security context 573 * by contacting the server. If the connection is timed out or reset, 574 * e.g. server reboot, we will try again. 575 */ 576 do { 577 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 578 579 if (error == 0) 580 break; 581 582 /* 583 * For forced unmount or zone shutdown, bail out, no retry. 584 */ 585 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 586 error = EIO; 587 break; 588 } 589 590 /* do not retry for softmount */ 591 if (!(mi->mi_flags & MI_HARD)) 592 break; 593 594 /* let the caller deal with the failover case */ 595 if (FAILOVER_MOUNT(mi)) 596 break; 597 598 } while (error == ETIMEDOUT || error == ECONNRESET); 599 600 return (error); 601 } 602 603 static void 604 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) 605 { 606 if (cl->cl_auth != NULL) { 607 sec_clnt_freeh(cl->cl_auth); 608 cl->cl_auth = NULL; 609 } 610 611 /* 612 * Timestamp this cache entry so that we know when it was last 613 * used. 614 */ 615 cp->ch_freed = gethrestime_sec(); 616 617 /* 618 * Add the free client handle to the front of the list. 619 * This way, the list will be sorted in youngest to oldest 620 * order. 
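 *
 * A typical caller pairs the two interfaces like this (sketch only;
 * rfscall() below is the real, fully error-checked version):
 *
 *	error = clget(&ci, svp, cr, &client, &ch);
 *	if (error == 0) {
 *		status = CLNT_CALL(client, which, xdrargs, argsp,
 *		    xdrres, resp, wait);
 *		clfree(client, ch);
 *	}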
621 */ 622 mutex_enter(&nfscl->nfscl_chtable_lock); 623 cp->ch_list = cp->ch_head->ch_list; 624 cp->ch_head->ch_list = cp; 625 mutex_exit(&nfscl->nfscl_chtable_lock); 626 } 627 628 void 629 clfree(CLIENT *cl, struct chtab *cp) 630 { 631 struct nfs_clnt *nfscl; 632 633 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 634 ASSERT(nfscl != NULL); 635 636 clfree_impl(cl, cp, nfscl); 637 } 638 639 #define CL_HOLDTIME 60 /* time to hold client handles */ 640 641 static void 642 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime) 643 { 644 struct chhead *ch; 645 struct chtab *cp; /* list of objects that can be reclaimed */ 646 struct chtab *cpe; 647 struct chtab *cpl; 648 struct chtab **cpp; 649 #ifdef DEBUG 650 int n = 0; 651 #endif 652 653 /* 654 * Need to reclaim some memory, so step through the cache 655 * looking through the lists for entries which can be freed. 656 */ 657 cp = NULL; 658 659 mutex_enter(&nfscl->nfscl_chtable_lock); 660 661 /* 662 * Here we step through each non-NULL quadruple and start to 663 * construct the reclaim list pointed to by cp. Note that 664 * cp will contain all eligible chtab entries. When this traversal 665 * completes, chtab entries from the last quadruple will be at the 666 * front of cp and entries from previously inspected quadruples have 667 * been appended to the rear of cp. 668 */ 669 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 670 if (ch->ch_list == NULL) 671 continue; 672 /* 673 * Search each list for entries older then 674 * cl_holdtime seconds. The lists are maintained 675 * in youngest to oldest order so that when the 676 * first entry is found which is old enough, then 677 * all of the rest of the entries on the list will 678 * be old enough as well. 679 */ 680 cpl = ch->ch_list; 681 cpp = &ch->ch_list; 682 while (cpl != NULL && 683 cpl->ch_freed + cl_holdtime > gethrestime_sec()) { 684 cpp = &cpl->ch_list; 685 cpl = cpl->ch_list; 686 } 687 if (cpl != NULL) { 688 *cpp = NULL; 689 if (cp != NULL) { 690 cpe = cpl; 691 while (cpe->ch_list != NULL) 692 cpe = cpe->ch_list; 693 cpe->ch_list = cp; 694 } 695 cp = cpl; 696 } 697 } 698 699 mutex_exit(&nfscl->nfscl_chtable_lock); 700 701 /* 702 * If cp is empty, then there is nothing to reclaim here. 703 */ 704 if (cp == NULL) 705 return; 706 707 /* 708 * Step through the list of entries to free, destroying each client 709 * handle and kmem_free'ing the memory for each entry. 710 */ 711 while (cp != NULL) { 712 #ifdef DEBUG 713 n++; 714 #endif 715 CLNT_DESTROY(cp->ch_client); 716 cpl = cp->ch_list; 717 kmem_cache_free(chtab_cache, cp); 718 cp = cpl; 719 } 720 721 #ifdef DEBUG 722 /* 723 * Update clalloc so that nfsstat shows the current number 724 * of allocated client handles. 725 */ 726 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n); 727 #endif 728 } 729 730 /* ARGSUSED */ 731 static void 732 clreclaim(void *all) 733 { 734 struct nfs_clnt *nfscl; 735 736 #ifdef DEBUG 737 clstat_debug.clreclaim.value.ui64++; 738 #endif 739 /* 740 * The system is low on memory; go through and try to reclaim some from 741 * every zone on the system. 
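 *
 * clreclaim() is written to be used as a kmem cache reclaim callback.
 * A minimal sketch of how it would be registered (the real call lives
 * in this file's initialization path and its other arguments may
 * differ):
 *
 *	chtab_cache = kmem_cache_create("client_handle_cache",
 *	    sizeof (struct chtab), 0, NULL, NULL, clreclaim,
 *	    NULL, NULL, 0);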
742 */ 743 mutex_enter(&nfs_clnt_list_lock); 744 nfscl = list_head(&nfs_clnt_list); 745 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) 746 clreclaim_zone(nfscl, CL_HOLDTIME); 747 mutex_exit(&nfs_clnt_list_lock); 748 } 749 750 /* 751 * Minimum time-out values indexed by call type 752 * These units are in "eights" of a second to avoid multiplies 753 */ 754 static unsigned int minimum_timeo[] = { 755 6, 7, 10 756 }; 757 758 /* 759 * Back off for retransmission timeout, MAXTIMO is in hz of a sec 760 */ 761 #define MAXTIMO (20*hz) 762 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 763 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 764 765 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ 766 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ 767 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ 768 769 /* 770 * Function called when rfscall notices that we have been 771 * re-transmitting, or when we get a response without retransmissions. 772 * Return 1 if the transfer size was adjusted down - 0 if no change. 773 */ 774 static int 775 nfs_feedback(int flag, int which, mntinfo_t *mi) 776 { 777 int kind; 778 int r = 0; 779 780 mutex_enter(&mi->mi_lock); 781 if (flag == FEEDBACK_REXMIT1) { 782 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && 783 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) 784 goto done; 785 if (mi->mi_curread > MIN_NFS_TSIZE) { 786 mi->mi_curread /= 2; 787 if (mi->mi_curread < MIN_NFS_TSIZE) 788 mi->mi_curread = MIN_NFS_TSIZE; 789 r = 1; 790 } 791 792 if (mi->mi_curwrite > MIN_NFS_TSIZE) { 793 mi->mi_curwrite /= 2; 794 if (mi->mi_curwrite < MIN_NFS_TSIZE) 795 mi->mi_curwrite = MIN_NFS_TSIZE; 796 r = 1; 797 } 798 } else if (flag == FEEDBACK_OK) { 799 kind = mi->mi_timer_type[which]; 800 if (kind == 0 || 801 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) 802 goto done; 803 if (kind == 1) { 804 if (mi->mi_curread >= mi->mi_tsize) 805 goto done; 806 mi->mi_curread += MIN_NFS_TSIZE; 807 if (mi->mi_curread > mi->mi_tsize/2) 808 mi->mi_curread = mi->mi_tsize; 809 } else if (kind == 2) { 810 if (mi->mi_curwrite >= mi->mi_stsize) 811 goto done; 812 mi->mi_curwrite += MIN_NFS_TSIZE; 813 if (mi->mi_curwrite > mi->mi_stsize/2) 814 mi->mi_curwrite = mi->mi_stsize; 815 } 816 } 817 done: 818 mutex_exit(&mi->mi_lock); 819 return (r); 820 } 821 822 #ifdef DEBUG 823 static int rfs2call_hits = 0; 824 static int rfs2call_misses = 0; 825 #endif 826 827 int 828 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 829 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 830 enum nfsstat *statusp, int flags, failinfo_t *fi) 831 { 832 int rpcerror; 833 enum clnt_stat rpc_status; 834 835 ASSERT(statusp != NULL); 836 837 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 838 cr, douprintf, &rpc_status, flags, fi); 839 if (!rpcerror) { 840 /* 841 * See crnetadjust() for comments. 
842 */ 843 if (*statusp == NFSERR_ACCES && 844 (cr = crnetadjust(cr)) != NULL) { 845 #ifdef DEBUG 846 rfs2call_hits++; 847 #endif 848 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, 849 resp, cr, douprintf, NULL, flags, fi); 850 crfree(cr); 851 #ifdef DEBUG 852 if (*statusp == NFSERR_ACCES) 853 rfs2call_misses++; 854 #endif 855 } 856 } else if (rpc_status == RPC_PROCUNAVAIL) { 857 *statusp = NFSERR_OPNOTSUPP; 858 rpcerror = 0; 859 } 860 861 return (rpcerror); 862 } 863 864 #define NFS3_JUKEBOX_DELAY 10 * hz 865 866 static clock_t nfs3_jukebox_delay = 0; 867 868 #ifdef DEBUG 869 static int rfs3call_hits = 0; 870 static int rfs3call_misses = 0; 871 #endif 872 873 int 874 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 875 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 876 nfsstat3 *statusp, int flags, failinfo_t *fi) 877 { 878 int rpcerror; 879 int user_informed; 880 881 user_informed = 0; 882 do { 883 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 884 cr, douprintf, NULL, flags, fi); 885 if (!rpcerror) { 886 cred_t *crr; 887 if (*statusp == NFS3ERR_JUKEBOX) { 888 if (ttoproc(curthread) == &p0) { 889 rpcerror = EAGAIN; 890 break; 891 } 892 if (!user_informed) { 893 user_informed = 1; 894 uprintf( 895 "file temporarily unavailable on the server, retrying...\n"); 896 } 897 delay(nfs3_jukebox_delay); 898 } 899 /* 900 * See crnetadjust() for comments. 901 */ 902 else if (*statusp == NFS3ERR_ACCES && 903 (crr = crnetadjust(cr)) != NULL) { 904 #ifdef DEBUG 905 rfs3call_hits++; 906 #endif 907 rpcerror = rfscall(mi, which, xdrargs, argsp, 908 xdrres, resp, crr, douprintf, 909 NULL, flags, fi); 910 911 crfree(crr); 912 #ifdef DEBUG 913 if (*statusp == NFS3ERR_ACCES) 914 rfs3call_misses++; 915 #endif 916 } 917 } 918 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 919 920 return (rpcerror); 921 } 922 923 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 924 #define INC_READERS(mi) { \ 925 mi->mi_readers++; \ 926 } 927 #define DEC_READERS(mi) { \ 928 mi->mi_readers--; \ 929 if (mi->mi_readers == 0) \ 930 cv_broadcast(&mi->mi_failover_cv); \ 931 } 932 933 static int 934 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 935 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 936 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 937 { 938 CLIENT *client; 939 struct chtab *ch; 940 cred_t *cr = icr; 941 enum clnt_stat status; 942 struct rpc_err rpcerr, rpcerr_tmp; 943 struct timeval wait; 944 int timeo; /* in units of hz */ 945 int my_rsize, my_wsize; 946 bool_t tryagain; 947 bool_t cred_cloned = FALSE; 948 k_sigset_t smask; 949 servinfo_t *svp; 950 struct nfs_clnt *nfscl; 951 zoneid_t zoneid = getzoneid(); 952 char *msg; 953 #ifdef DEBUG 954 char *bufp; 955 #endif 956 957 958 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 959 "rfscall_start:which %d mi %p", which, mi); 960 961 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 962 ASSERT(nfscl != NULL); 963 964 nfscl->nfscl_stat.calls.value.ui64++; 965 mi->mi_reqs[which].value.ui64++; 966 967 rpcerr.re_status = RPC_SUCCESS; 968 969 /* 970 * In case of forced unmount or zone shutdown, return EIO. 971 */ 972 973 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 974 rpcerr.re_status = RPC_FAILED; 975 rpcerr.re_errno = EIO; 976 return (rpcerr.re_errno); 977 } 978 979 /* 980 * Remember the transfer sizes in case 981 * nfs_feedback changes them underneath us. 
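 *
 * A worked example of how nfs_feedback() can move these values
 * (assuming mi_tsize == 32768 and the rt_rtxcur check does not bail
 * out early): a FEEDBACK_REXMIT1 event halves mi_curread from 32768
 * to 16384; a later FEEDBACK_OK adds MIN_NFS_TSIZE (512) to get 16896,
 * and since that is above mi_tsize / 2 the read size snaps back to
 * the full 32768.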
982 */ 983 my_rsize = mi->mi_curread; 984 my_wsize = mi->mi_curwrite; 985 986 /* 987 * NFS client failover support 988 * 989 * If this rnode is not in sync with the current server (VALID_FH), 990 * we'd like to do a remap to get in sync. We can be interrupted 991 * in failover_remap(), and if so we'll bail. Otherwise, we'll 992 * use the best info we have to try the RPC. Part of that is 993 * unconditionally updating the filehandle copy kept for V3. 994 * 995 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 996 * rw_enter(); we're trying to keep the current server from being 997 * changed on us until we're done with the remapping and have a 998 * matching client handle. We don't want to sending a filehandle 999 * to the wrong host. 1000 */ 1001 failoverretry: 1002 if (FAILOVER_MOUNT(mi)) { 1003 mutex_enter(&mi->mi_lock); 1004 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1005 if (failover_wait(mi)) { 1006 mutex_exit(&mi->mi_lock); 1007 return (EINTR); 1008 } 1009 } 1010 INC_READERS(mi); 1011 mutex_exit(&mi->mi_lock); 1012 if (fi) { 1013 if (!VALID_FH(fi) && 1014 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1015 int remaperr; 1016 1017 svp = mi->mi_curr_serv; 1018 remaperr = failover_remap(fi); 1019 if (remaperr != 0) { 1020 #ifdef DEBUG 1021 if (remaperr != EINTR) 1022 nfs_cmn_err(remaperr, CE_WARN, 1023 "rfscall couldn't failover: %m"); 1024 #endif 1025 mutex_enter(&mi->mi_lock); 1026 DEC_READERS(mi); 1027 mutex_exit(&mi->mi_lock); 1028 /* 1029 * If failover_remap returns ETIMEDOUT 1030 * and the filesystem is hard mounted 1031 * we have to retry the call with a new 1032 * server. 1033 */ 1034 if ((mi->mi_flags & MI_HARD) && 1035 IS_RECOVERABLE_ERROR(remaperr)) { 1036 if (svp == mi->mi_curr_serv) 1037 failover_newserver(mi); 1038 rpcerr.re_status = RPC_SUCCESS; 1039 goto failoverretry; 1040 } 1041 rpcerr.re_errno = remaperr; 1042 return (remaperr); 1043 } 1044 } 1045 if (fi->fhp && fi->copyproc) 1046 (*fi->copyproc)(fi->fhp, fi->vp); 1047 } 1048 } 1049 1050 /* For TSOL, use a new cred which has net_mac_aware flag */ 1051 if (!cred_cloned && is_system_labeled()) { 1052 cred_cloned = TRUE; 1053 cr = crdup(icr); 1054 (void) setpflags(NET_MAC_AWARE, 1, cr); 1055 } 1056 1057 /* 1058 * clget() calls clnt_tli_kinit() which clears the xid, so we 1059 * are guaranteed to reprocess the retry as a new request. 1060 */ 1061 svp = mi->mi_curr_serv; 1062 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); 1063 1064 if (FAILOVER_MOUNT(mi)) { 1065 mutex_enter(&mi->mi_lock); 1066 DEC_READERS(mi); 1067 mutex_exit(&mi->mi_lock); 1068 1069 if ((rpcerr.re_errno == ETIMEDOUT || 1070 rpcerr.re_errno == ECONNRESET) && 1071 failover_safe(fi)) { 1072 if (svp == mi->mi_curr_serv) 1073 failover_newserver(mi); 1074 goto failoverretry; 1075 } 1076 } 1077 if (rpcerr.re_errno != 0) 1078 return (rpcerr.re_errno); 1079 1080 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1081 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1082 timeo = (mi->mi_timeo * hz) / 10; 1083 } else { 1084 mutex_enter(&mi->mi_lock); 1085 timeo = CLNT_SETTIMERS(client, 1086 &(mi->mi_timers[mi->mi_timer_type[which]]), 1087 &(mi->mi_timers[NFS_CALLTYPES]), 1088 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, 1089 (void (*)())NULL, (caddr_t)mi, 0); 1090 mutex_exit(&mi->mi_lock); 1091 } 1092 1093 /* 1094 * If hard mounted fs, retry call forever unless hard error occurs. 
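 *
 * On each soft timeout the loop below backs the timeout off with
 * backoff(); for example, with hz == 100 and an initial timeo of 110
 * ticks, successive passes wait 110, 220, 440, 880 and 1760 ticks and
 * are then clamped at MAXTIMO (20 * hz == 2000 ticks, i.e. 20 seconds).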
1095 */ 1096 do { 1097 tryagain = FALSE; 1098 1099 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1100 status = RPC_FAILED; 1101 rpcerr.re_status = RPC_FAILED; 1102 rpcerr.re_errno = EIO; 1103 break; 1104 } 1105 1106 TICK_TO_TIMEVAL(timeo, &wait); 1107 1108 /* 1109 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1110 * and SIGTERM. (Preserving the existing masks). 1111 * Mask out SIGINT if mount option nointr is specified. 1112 */ 1113 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1114 if (!(mi->mi_flags & MI_INT)) 1115 client->cl_nosignal = TRUE; 1116 1117 /* 1118 * If there is a current signal, then don't bother 1119 * even trying to send out the request because we 1120 * won't be able to block waiting for the response. 1121 * Simply assume RPC_INTR and get on with it. 1122 */ 1123 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1124 status = RPC_INTR; 1125 else { 1126 status = CLNT_CALL(client, which, xdrargs, argsp, 1127 xdrres, resp, wait); 1128 } 1129 1130 if (!(mi->mi_flags & MI_INT)) 1131 client->cl_nosignal = FALSE; 1132 /* 1133 * restore original signal mask 1134 */ 1135 sigunintr(&smask); 1136 1137 switch (status) { 1138 case RPC_SUCCESS: 1139 if ((mi->mi_flags & MI_DYNAMIC) && 1140 mi->mi_timer_type[which] != 0 && 1141 (mi->mi_curread != my_rsize || 1142 mi->mi_curwrite != my_wsize)) 1143 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1144 break; 1145 1146 case RPC_INTR: 1147 /* 1148 * There is no way to recover from this error, 1149 * even if mount option nointr is specified. 1150 * SIGKILL, for example, cannot be blocked. 1151 */ 1152 rpcerr.re_status = RPC_INTR; 1153 rpcerr.re_errno = EINTR; 1154 break; 1155 1156 case RPC_UDERROR: 1157 /* 1158 * If the NFS server is local (vold) and 1159 * it goes away then we get RPC_UDERROR. 1160 * This is a retryable error, so we would 1161 * loop, so check to see if the specific 1162 * error was ECONNRESET, indicating that 1163 * target did not exist at all. If so, 1164 * return with RPC_PROGUNAVAIL and 1165 * ECONNRESET to indicate why. 1166 */ 1167 CLNT_GETERR(client, &rpcerr); 1168 if (rpcerr.re_errno == ECONNRESET) { 1169 rpcerr.re_status = RPC_PROGUNAVAIL; 1170 rpcerr.re_errno = ECONNRESET; 1171 break; 1172 } 1173 /*FALLTHROUGH*/ 1174 1175 default: /* probably RPC_TIMEDOUT */ 1176 if (IS_UNRECOVERABLE_RPC(status)) 1177 break; 1178 1179 /* 1180 * increment server not responding count 1181 */ 1182 mutex_enter(&mi->mi_lock); 1183 mi->mi_noresponse++; 1184 mutex_exit(&mi->mi_lock); 1185 #ifdef DEBUG 1186 nfscl->nfscl_stat.noresponse.value.ui64++; 1187 #endif 1188 1189 if (!(mi->mi_flags & MI_HARD)) { 1190 if (!(mi->mi_flags & MI_SEMISOFT) || 1191 (mi->mi_ss_call_type[which] == 0)) 1192 break; 1193 } 1194 1195 /* 1196 * The call is in progress (over COTS). 1197 * Try the CLNT_CALL again, but don't 1198 * print a noisy error message. 1199 */ 1200 if (status == RPC_INPROGRESS) { 1201 tryagain = TRUE; 1202 break; 1203 } 1204 1205 if (flags & RFSCALL_SOFT) 1206 break; 1207 1208 /* 1209 * On zone shutdown, just move on. 1210 */ 1211 if (zone_status_get(curproc->p_zone) >= 1212 ZONE_IS_SHUTTING_DOWN) { 1213 rpcerr.re_status = RPC_FAILED; 1214 rpcerr.re_errno = EIO; 1215 break; 1216 } 1217 1218 /* 1219 * NFS client failover support 1220 * 1221 * If the current server just failed us, we'll 1222 * start the process of finding a new server. 1223 * After that, we can just retry. 
1224 */ 1225 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1226 if (svp == mi->mi_curr_serv) 1227 failover_newserver(mi); 1228 clfree_impl(client, ch, nfscl); 1229 goto failoverretry; 1230 } 1231 1232 tryagain = TRUE; 1233 timeo = backoff(timeo); 1234 1235 CLNT_GETERR(client, &rpcerr_tmp); 1236 if ((status == RPC_CANTSEND) && 1237 (rpcerr_tmp.re_errno == ENOBUFS)) 1238 msg = SRV_QFULL_MSG; 1239 else 1240 msg = SRV_NOTRESP_MSG; 1241 1242 mutex_enter(&mi->mi_lock); 1243 if (!(mi->mi_flags & MI_PRINTED)) { 1244 mi->mi_flags |= MI_PRINTED; 1245 mutex_exit(&mi->mi_lock); 1246 #ifdef DEBUG 1247 zprintf(zoneid, msg, mi->mi_vers, 1248 svp->sv_hostname); 1249 #else 1250 zprintf(zoneid, msg, svp->sv_hostname); 1251 #endif 1252 } else 1253 mutex_exit(&mi->mi_lock); 1254 if (*douprintf && nfs_has_ctty()) { 1255 *douprintf = 0; 1256 if (!(mi->mi_flags & MI_NOPRINT)) 1257 #ifdef DEBUG 1258 uprintf(msg, mi->mi_vers, 1259 svp->sv_hostname); 1260 #else 1261 uprintf(msg, svp->sv_hostname); 1262 #endif 1263 } 1264 1265 /* 1266 * If doing dynamic adjustment of transfer 1267 * size and if it's a read or write call 1268 * and if the transfer size changed while 1269 * retransmitting or if the feedback routine 1270 * changed the transfer size, 1271 * then exit rfscall so that the transfer 1272 * size can be adjusted at the vnops level. 1273 */ 1274 if ((mi->mi_flags & MI_DYNAMIC) && 1275 mi->mi_timer_type[which] != 0 && 1276 (mi->mi_curread != my_rsize || 1277 mi->mi_curwrite != my_wsize || 1278 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1279 /* 1280 * On read or write calls, return 1281 * back to the vnode ops level if 1282 * the transfer size changed. 1283 */ 1284 clfree_impl(client, ch, nfscl); 1285 if (cred_cloned) 1286 crfree(cr); 1287 return (ENFS_TRYAGAIN); 1288 } 1289 } 1290 } while (tryagain); 1291 1292 if (status != RPC_SUCCESS) { 1293 /* 1294 * Let soft mounts use the timed out message. 1295 */ 1296 if (status == RPC_INPROGRESS) 1297 status = RPC_TIMEDOUT; 1298 nfscl->nfscl_stat.badcalls.value.ui64++; 1299 if (status != RPC_INTR) { 1300 mutex_enter(&mi->mi_lock); 1301 mi->mi_flags |= MI_DOWN; 1302 mutex_exit(&mi->mi_lock); 1303 CLNT_GETERR(client, &rpcerr); 1304 #ifdef DEBUG 1305 bufp = clnt_sperror(client, svp->sv_hostname); 1306 zprintf(zoneid, "NFS%d %s failed for %s\n", 1307 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1308 if (nfs_has_ctty()) { 1309 if (!(mi->mi_flags & MI_NOPRINT)) { 1310 uprintf("NFS%d %s failed for %s\n", 1311 mi->mi_vers, mi->mi_rfsnames[which], 1312 bufp); 1313 } 1314 } 1315 kmem_free(bufp, MAXPATHLEN); 1316 #else 1317 zprintf(zoneid, 1318 "NFS %s failed for server %s: error %d (%s)\n", 1319 mi->mi_rfsnames[which], svp->sv_hostname, 1320 status, clnt_sperrno(status)); 1321 if (nfs_has_ctty()) { 1322 if (!(mi->mi_flags & MI_NOPRINT)) { 1323 uprintf( 1324 "NFS %s failed for server %s: error %d (%s)\n", 1325 mi->mi_rfsnames[which], 1326 svp->sv_hostname, status, 1327 clnt_sperrno(status)); 1328 } 1329 } 1330 #endif 1331 /* 1332 * when CLNT_CALL() fails with RPC_AUTHERROR, 1333 * re_errno is set appropriately depending on 1334 * the authentication error 1335 */ 1336 if (status == RPC_VERSMISMATCH || 1337 status == RPC_PROGVERSMISMATCH) 1338 rpcerr.re_errno = EIO; 1339 } 1340 } else { 1341 /* 1342 * Test the value of mi_down and mi_printed without 1343 * holding the mi_lock mutex. If they are both zero, 1344 * then it is okay to skip the down and printed 1345 * processing. This saves on a mutex_enter and 1346 * mutex_exit pair for a normal, successful RPC. 
1347 * This was just complete overhead. 1348 */ 1349 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1350 mutex_enter(&mi->mi_lock); 1351 mi->mi_flags &= ~MI_DOWN; 1352 if (mi->mi_flags & MI_PRINTED) { 1353 mi->mi_flags &= ~MI_PRINTED; 1354 mutex_exit(&mi->mi_lock); 1355 #ifdef DEBUG 1356 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1357 zprintf(zoneid, "NFS%d server %s ok\n", 1358 mi->mi_vers, svp->sv_hostname); 1359 #else 1360 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1361 zprintf(zoneid, "NFS server %s ok\n", 1362 svp->sv_hostname); 1363 #endif 1364 } else 1365 mutex_exit(&mi->mi_lock); 1366 } 1367 1368 if (*douprintf == 0) { 1369 if (!(mi->mi_flags & MI_NOPRINT)) 1370 #ifdef DEBUG 1371 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1372 uprintf("NFS%d server %s ok\n", 1373 mi->mi_vers, svp->sv_hostname); 1374 #else 1375 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1376 uprintf("NFS server %s ok\n", svp->sv_hostname); 1377 #endif 1378 *douprintf = 1; 1379 } 1380 } 1381 1382 clfree_impl(client, ch, nfscl); 1383 if (cred_cloned) 1384 crfree(cr); 1385 1386 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1387 1388 if (rpc_status != NULL) 1389 *rpc_status = rpcerr.re_status; 1390 1391 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1392 rpcerr.re_errno); 1393 1394 return (rpcerr.re_errno); 1395 } 1396 1397 #ifdef DEBUG 1398 static int acl2call_hits = 0; 1399 static int acl2call_misses = 0; 1400 #endif 1401 1402 int 1403 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1404 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1405 enum nfsstat *statusp, int flags, failinfo_t *fi) 1406 { 1407 int rpcerror; 1408 1409 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1410 cr, douprintf, flags, fi); 1411 if (!rpcerror) { 1412 /* 1413 * See comments with crnetadjust(). 1414 */ 1415 if (*statusp == NFSERR_ACCES && 1416 (cr = crnetadjust(cr)) != NULL) { 1417 #ifdef DEBUG 1418 acl2call_hits++; 1419 #endif 1420 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1421 resp, cr, douprintf, flags, fi); 1422 crfree(cr); 1423 #ifdef DEBUG 1424 if (*statusp == NFSERR_ACCES) 1425 acl2call_misses++; 1426 #endif 1427 } 1428 } 1429 1430 return (rpcerror); 1431 } 1432 1433 #ifdef DEBUG 1434 static int acl3call_hits = 0; 1435 static int acl3call_misses = 0; 1436 #endif 1437 1438 int 1439 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1440 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1441 nfsstat3 *statusp, int flags, failinfo_t *fi) 1442 { 1443 int rpcerror; 1444 int user_informed; 1445 1446 user_informed = 0; 1447 1448 do { 1449 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1450 cr, douprintf, flags, fi); 1451 if (!rpcerror) { 1452 cred_t *crr; 1453 if (*statusp == NFS3ERR_JUKEBOX) { 1454 if (!user_informed) { 1455 user_informed = 1; 1456 uprintf( 1457 "file temporarily unavailable on the server, retrying...\n"); 1458 } 1459 delay(nfs3_jukebox_delay); 1460 } 1461 /* 1462 * See crnetadjust() for comments. 
1463 */ 1464 else if (*statusp == NFS3ERR_ACCES && 1465 (crr = crnetadjust(cr)) != NULL) { 1466 #ifdef DEBUG 1467 acl3call_hits++; 1468 #endif 1469 rpcerror = aclcall(mi, which, xdrargs, argsp, 1470 xdrres, resp, crr, douprintf, flags, fi); 1471 1472 crfree(crr); 1473 #ifdef DEBUG 1474 if (*statusp == NFS3ERR_ACCES) 1475 acl3call_misses++; 1476 #endif 1477 } 1478 } 1479 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 1480 1481 return (rpcerror); 1482 } 1483 1484 static int 1485 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1486 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 1487 int flags, failinfo_t *fi) 1488 { 1489 CLIENT *client; 1490 struct chtab *ch; 1491 cred_t *cr = icr; 1492 bool_t cred_cloned = FALSE; 1493 enum clnt_stat status; 1494 struct rpc_err rpcerr; 1495 struct timeval wait; 1496 int timeo; /* in units of hz */ 1497 #if 0 /* notyet */ 1498 int my_rsize, my_wsize; 1499 #endif 1500 bool_t tryagain; 1501 k_sigset_t smask; 1502 servinfo_t *svp; 1503 struct nfs_clnt *nfscl; 1504 zoneid_t zoneid = getzoneid(); 1505 #ifdef DEBUG 1506 char *bufp; 1507 #endif 1508 1509 #if 0 /* notyet */ 1510 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 1511 "rfscall_start:which %d mi %p", which, mi); 1512 #endif 1513 1514 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 1515 ASSERT(nfscl != NULL); 1516 1517 nfscl->nfscl_stat.calls.value.ui64++; 1518 mi->mi_aclreqs[which].value.ui64++; 1519 1520 rpcerr.re_status = RPC_SUCCESS; 1521 1522 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1523 rpcerr.re_status = RPC_FAILED; 1524 rpcerr.re_errno = EIO; 1525 return (rpcerr.re_errno); 1526 } 1527 1528 #if 0 /* notyet */ 1529 /* 1530 * Remember the transfer sizes in case 1531 * nfs_feedback changes them underneath us. 1532 */ 1533 my_rsize = mi->mi_curread; 1534 my_wsize = mi->mi_curwrite; 1535 #endif 1536 1537 /* 1538 * NFS client failover support 1539 * 1540 * If this rnode is not in sync with the current server (VALID_FH), 1541 * we'd like to do a remap to get in sync. We can be interrupted 1542 * in failover_remap(), and if so we'll bail. Otherwise, we'll 1543 * use the best info we have to try the RPC. Part of that is 1544 * unconditionally updating the filehandle copy kept for V3. 1545 * 1546 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 1547 * rw_enter(); we're trying to keep the current server from being 1548 * changed on us until we're done with the remapping and have a 1549 * matching client handle. We don't want to sending a filehandle 1550 * to the wrong host. 1551 */ 1552 failoverretry: 1553 if (FAILOVER_MOUNT(mi)) { 1554 mutex_enter(&mi->mi_lock); 1555 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1556 if (failover_wait(mi)) { 1557 mutex_exit(&mi->mi_lock); 1558 return (EINTR); 1559 } 1560 } 1561 INC_READERS(mi); 1562 mutex_exit(&mi->mi_lock); 1563 if (fi) { 1564 if (!VALID_FH(fi) && 1565 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1566 int remaperr; 1567 1568 svp = mi->mi_curr_serv; 1569 remaperr = failover_remap(fi); 1570 if (remaperr != 0) { 1571 #ifdef DEBUG 1572 if (remaperr != EINTR) 1573 nfs_cmn_err(remaperr, CE_WARN, 1574 "aclcall couldn't failover: %m"); 1575 #endif 1576 mutex_enter(&mi->mi_lock); 1577 DEC_READERS(mi); 1578 mutex_exit(&mi->mi_lock); 1579 1580 /* 1581 * If failover_remap returns ETIMEDOUT 1582 * and the filesystem is hard mounted 1583 * we have to retry the call with a new 1584 * server. 
1585 */ 1586 if ((mi->mi_flags & MI_HARD) && 1587 IS_RECOVERABLE_ERROR(remaperr)) { 1588 if (svp == mi->mi_curr_serv) 1589 failover_newserver(mi); 1590 rpcerr.re_status = RPC_SUCCESS; 1591 goto failoverretry; 1592 } 1593 return (remaperr); 1594 } 1595 } 1596 if (fi->fhp && fi->copyproc) 1597 (*fi->copyproc)(fi->fhp, fi->vp); 1598 } 1599 } 1600 1601 /* For TSOL, use a new cred which has net_mac_aware flag */ 1602 if (!cred_cloned && is_system_labeled()) { 1603 cred_cloned = TRUE; 1604 cr = crdup(icr); 1605 (void) setpflags(NET_MAC_AWARE, 1, cr); 1606 } 1607 1608 /* 1609 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1610 * are guaranteed to reprocess the retry as a new request. 1611 */ 1612 svp = mi->mi_curr_serv; 1613 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1614 if (FAILOVER_MOUNT(mi)) { 1615 mutex_enter(&mi->mi_lock); 1616 DEC_READERS(mi); 1617 mutex_exit(&mi->mi_lock); 1618 1619 if ((rpcerr.re_errno == ETIMEDOUT || 1620 rpcerr.re_errno == ECONNRESET) && 1621 failover_safe(fi)) { 1622 if (svp == mi->mi_curr_serv) 1623 failover_newserver(mi); 1624 goto failoverretry; 1625 } 1626 } 1627 if (rpcerr.re_errno != 0) { 1628 if (cred_cloned) 1629 crfree(cr); 1630 return (rpcerr.re_errno); 1631 } 1632 1633 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1634 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1635 timeo = (mi->mi_timeo * hz) / 10; 1636 } else { 1637 mutex_enter(&mi->mi_lock); 1638 timeo = CLNT_SETTIMERS(client, 1639 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1640 &(mi->mi_timers[NFS_CALLTYPES]), 1641 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1642 (void (*)()) 0, (caddr_t)mi, 0); 1643 mutex_exit(&mi->mi_lock); 1644 } 1645 1646 /* 1647 * If hard mounted fs, retry call forever unless hard error occurs. 1648 */ 1649 do { 1650 tryagain = FALSE; 1651 1652 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1653 status = RPC_FAILED; 1654 rpcerr.re_status = RPC_FAILED; 1655 rpcerr.re_errno = EIO; 1656 break; 1657 } 1658 1659 TICK_TO_TIMEVAL(timeo, &wait); 1660 1661 /* 1662 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1663 * and SIGTERM. (Preserving the existing masks). 1664 * Mask out SIGINT if mount option nointr is specified. 1665 */ 1666 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1667 if (!(mi->mi_flags & MI_INT)) 1668 client->cl_nosignal = TRUE; 1669 1670 /* 1671 * If there is a current signal, then don't bother 1672 * even trying to send out the request because we 1673 * won't be able to block waiting for the response. 1674 * Simply assume RPC_INTR and get on with it. 1675 */ 1676 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1677 status = RPC_INTR; 1678 else { 1679 status = CLNT_CALL(client, which, xdrargs, argsp, 1680 xdrres, resp, wait); 1681 } 1682 1683 if (!(mi->mi_flags & MI_INT)) 1684 client->cl_nosignal = FALSE; 1685 /* 1686 * restore original signal mask 1687 */ 1688 sigunintr(&smask); 1689 1690 switch (status) { 1691 case RPC_SUCCESS: 1692 #if 0 /* notyet */ 1693 if ((mi->mi_flags & MI_DYNAMIC) && 1694 mi->mi_timer_type[which] != 0 && 1695 (mi->mi_curread != my_rsize || 1696 mi->mi_curwrite != my_wsize)) 1697 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1698 #endif 1699 break; 1700 1701 /* 1702 * Unfortunately, there are servers in the world which 1703 * are not coded correctly. They are not prepared to 1704 * handle RPC requests to the NFS port which are not 1705 * NFS requests. Thus, they may try to process the 1706 * NFS_ACL request as if it were an NFS request. This 1707 * does not work. 
Generally, an error will be generated 1708 * on the client because it will not be able to decode 1709 * the response from the server. However, it seems 1710 * possible that the server may not be able to decode 1711 * the arguments. Thus, the criteria for deciding 1712 * whether the server supports NFS_ACL or not is whether 1713 * the following RPC errors are returned from CLNT_CALL. 1714 */ 1715 case RPC_CANTDECODERES: 1716 case RPC_PROGUNAVAIL: 1717 case RPC_CANTDECODEARGS: 1718 case RPC_PROGVERSMISMATCH: 1719 mutex_enter(&mi->mi_lock); 1720 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1721 mutex_exit(&mi->mi_lock); 1722 break; 1723 1724 /* 1725 * If the server supports NFS_ACL but not the new ops 1726 * for extended attributes, make sure we don't retry. 1727 */ 1728 case RPC_PROCUNAVAIL: 1729 mutex_enter(&mi->mi_lock); 1730 mi->mi_flags &= ~MI_EXTATTR; 1731 mutex_exit(&mi->mi_lock); 1732 break; 1733 1734 case RPC_INTR: 1735 /* 1736 * There is no way to recover from this error, 1737 * even if mount option nointr is specified. 1738 * SIGKILL, for example, cannot be blocked. 1739 */ 1740 rpcerr.re_status = RPC_INTR; 1741 rpcerr.re_errno = EINTR; 1742 break; 1743 1744 case RPC_UDERROR: 1745 /* 1746 * If the NFS server is local (vold) and 1747 * it goes away then we get RPC_UDERROR. 1748 * This is a retryable error, so we would 1749 * loop, so check to see if the specific 1750 * error was ECONNRESET, indicating that 1751 * target did not exist at all. If so, 1752 * return with RPC_PROGUNAVAIL and 1753 * ECONNRESET to indicate why. 1754 */ 1755 CLNT_GETERR(client, &rpcerr); 1756 if (rpcerr.re_errno == ECONNRESET) { 1757 rpcerr.re_status = RPC_PROGUNAVAIL; 1758 rpcerr.re_errno = ECONNRESET; 1759 break; 1760 } 1761 /*FALLTHROUGH*/ 1762 1763 default: /* probably RPC_TIMEDOUT */ 1764 if (IS_UNRECOVERABLE_RPC(status)) 1765 break; 1766 1767 /* 1768 * increment server not responding count 1769 */ 1770 mutex_enter(&mi->mi_lock); 1771 mi->mi_noresponse++; 1772 mutex_exit(&mi->mi_lock); 1773 #ifdef DEBUG 1774 nfscl->nfscl_stat.noresponse.value.ui64++; 1775 #endif 1776 1777 if (!(mi->mi_flags & MI_HARD)) { 1778 if (!(mi->mi_flags & MI_SEMISOFT) || 1779 (mi->mi_acl_ss_call_type[which] == 0)) 1780 break; 1781 } 1782 1783 /* 1784 * The call is in progress (over COTS). 1785 * Try the CLNT_CALL again, but don't 1786 * print a noisy error message. 1787 */ 1788 if (status == RPC_INPROGRESS) { 1789 tryagain = TRUE; 1790 break; 1791 } 1792 1793 if (flags & RFSCALL_SOFT) 1794 break; 1795 1796 /* 1797 * On zone shutdown, just move on. 1798 */ 1799 if (zone_status_get(curproc->p_zone) >= 1800 ZONE_IS_SHUTTING_DOWN) { 1801 rpcerr.re_status = RPC_FAILED; 1802 rpcerr.re_errno = EIO; 1803 break; 1804 } 1805 1806 /* 1807 * NFS client failover support 1808 * 1809 * If the current server just failed us, we'll 1810 * start the process of finding a new server. 1811 * After that, we can just retry. 
1812 */ 1813 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1814 if (svp == mi->mi_curr_serv) 1815 failover_newserver(mi); 1816 clfree_impl(client, ch, nfscl); 1817 goto failoverretry; 1818 } 1819 1820 tryagain = TRUE; 1821 timeo = backoff(timeo); 1822 mutex_enter(&mi->mi_lock); 1823 if (!(mi->mi_flags & MI_PRINTED)) { 1824 mi->mi_flags |= MI_PRINTED; 1825 mutex_exit(&mi->mi_lock); 1826 #ifdef DEBUG 1827 zprintf(zoneid, 1828 "NFS_ACL%d server %s not responding still trying\n", 1829 mi->mi_vers, svp->sv_hostname); 1830 #else 1831 zprintf(zoneid, 1832 "NFS server %s not responding still trying\n", 1833 svp->sv_hostname); 1834 #endif 1835 } else 1836 mutex_exit(&mi->mi_lock); 1837 if (*douprintf && nfs_has_ctty()) { 1838 *douprintf = 0; 1839 if (!(mi->mi_flags & MI_NOPRINT)) 1840 #ifdef DEBUG 1841 uprintf( 1842 "NFS_ACL%d server %s not responding still trying\n", 1843 mi->mi_vers, svp->sv_hostname); 1844 #else 1845 uprintf( 1846 "NFS server %s not responding still trying\n", 1847 svp->sv_hostname); 1848 #endif 1849 } 1850 1851 #if 0 /* notyet */ 1852 /* 1853 * If doing dynamic adjustment of transfer 1854 * size and if it's a read or write call 1855 * and if the transfer size changed while 1856 * retransmitting or if the feedback routine 1857 * changed the transfer size, 1858 * then exit rfscall so that the transfer 1859 * size can be adjusted at the vnops level. 1860 */ 1861 if ((mi->mi_flags & MI_DYNAMIC) && 1862 mi->mi_acl_timer_type[which] != 0 && 1863 (mi->mi_curread != my_rsize || 1864 mi->mi_curwrite != my_wsize || 1865 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1866 /* 1867 * On read or write calls, return 1868 * back to the vnode ops level if 1869 * the transfer size changed. 1870 */ 1871 clfree_impl(client, ch, nfscl); 1872 if (cred_cloned) 1873 crfree(cr); 1874 return (ENFS_TRYAGAIN); 1875 } 1876 #endif 1877 } 1878 } while (tryagain); 1879 1880 if (status != RPC_SUCCESS) { 1881 /* 1882 * Let soft mounts use the timed out message. 
1883 */ 1884 if (status == RPC_INPROGRESS) 1885 status = RPC_TIMEDOUT; 1886 nfscl->nfscl_stat.badcalls.value.ui64++; 1887 if (status == RPC_CANTDECODERES || 1888 status == RPC_PROGUNAVAIL || 1889 status == RPC_PROCUNAVAIL || 1890 status == RPC_CANTDECODEARGS || 1891 status == RPC_PROGVERSMISMATCH) 1892 CLNT_GETERR(client, &rpcerr); 1893 else if (status != RPC_INTR) { 1894 mutex_enter(&mi->mi_lock); 1895 mi->mi_flags |= MI_DOWN; 1896 mutex_exit(&mi->mi_lock); 1897 CLNT_GETERR(client, &rpcerr); 1898 #ifdef DEBUG 1899 bufp = clnt_sperror(client, svp->sv_hostname); 1900 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1901 mi->mi_vers, mi->mi_aclnames[which], bufp); 1902 if (nfs_has_ctty()) { 1903 if (!(mi->mi_flags & MI_NOPRINT)) { 1904 uprintf("NFS_ACL%d %s failed for %s\n", 1905 mi->mi_vers, mi->mi_aclnames[which], 1906 bufp); 1907 } 1908 } 1909 kmem_free(bufp, MAXPATHLEN); 1910 #else 1911 zprintf(zoneid, 1912 "NFS %s failed for server %s: error %d (%s)\n", 1913 mi->mi_aclnames[which], svp->sv_hostname, 1914 status, clnt_sperrno(status)); 1915 if (nfs_has_ctty()) { 1916 if (!(mi->mi_flags & MI_NOPRINT)) 1917 uprintf( 1918 "NFS %s failed for server %s: error %d (%s)\n", 1919 mi->mi_aclnames[which], 1920 svp->sv_hostname, status, 1921 clnt_sperrno(status)); 1922 } 1923 #endif 1924 /* 1925 * when CLNT_CALL() fails with RPC_AUTHERROR, 1926 * re_errno is set appropriately depending on 1927 * the authentication error 1928 */ 1929 if (status == RPC_VERSMISMATCH || 1930 status == RPC_PROGVERSMISMATCH) 1931 rpcerr.re_errno = EIO; 1932 } 1933 } else { 1934 /* 1935 * Test the value of mi_down and mi_printed without 1936 * holding the mi_lock mutex. If they are both zero, 1937 * then it is okay to skip the down and printed 1938 * processing. This saves on a mutex_enter and 1939 * mutex_exit pair for a normal, successful RPC. 1940 * This was just complete overhead. 
1941 */ 1942 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1943 mutex_enter(&mi->mi_lock); 1944 mi->mi_flags &= ~MI_DOWN; 1945 if (mi->mi_flags & MI_PRINTED) { 1946 mi->mi_flags &= ~MI_PRINTED; 1947 mutex_exit(&mi->mi_lock); 1948 #ifdef DEBUG 1949 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1950 mi->mi_vers, svp->sv_hostname); 1951 #else 1952 zprintf(zoneid, "NFS server %s ok\n", 1953 svp->sv_hostname); 1954 #endif 1955 } else 1956 mutex_exit(&mi->mi_lock); 1957 } 1958 1959 if (*douprintf == 0) { 1960 if (!(mi->mi_flags & MI_NOPRINT)) 1961 #ifdef DEBUG 1962 uprintf("NFS_ACL%d server %s ok\n", 1963 mi->mi_vers, svp->sv_hostname); 1964 #else 1965 uprintf("NFS server %s ok\n", svp->sv_hostname); 1966 #endif 1967 *douprintf = 1; 1968 } 1969 } 1970 1971 clfree_impl(client, ch, nfscl); 1972 if (cred_cloned) 1973 crfree(cr); 1974 1975 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1976 1977 #if 0 /* notyet */ 1978 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1979 rpcerr.re_errno); 1980 #endif 1981 1982 return (rpcerr.re_errno); 1983 } 1984 1985 int 1986 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1987 { 1988 uint_t mask = vap->va_mask; 1989 1990 if (!(mask & AT_MODE)) 1991 sa->sa_mode = (uint32_t)-1; 1992 else 1993 sa->sa_mode = vap->va_mode; 1994 if (!(mask & AT_UID)) 1995 sa->sa_uid = (uint32_t)-1; 1996 else 1997 sa->sa_uid = (uint32_t)vap->va_uid; 1998 if (!(mask & AT_GID)) 1999 sa->sa_gid = (uint32_t)-1; 2000 else 2001 sa->sa_gid = (uint32_t)vap->va_gid; 2002 if (!(mask & AT_SIZE)) 2003 sa->sa_size = (uint32_t)-1; 2004 else 2005 sa->sa_size = (uint32_t)vap->va_size; 2006 if (!(mask & AT_ATIME)) 2007 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 2008 else { 2009 /* check time validity */ 2010 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2011 return (EOVERFLOW); 2012 } 2013 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 2014 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2015 } 2016 if (!(mask & AT_MTIME)) 2017 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 2018 else { 2019 /* check time validity */ 2020 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2021 return (EOVERFLOW); 2022 } 2023 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 2024 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2025 } 2026 return (0); 2027 } 2028 2029 int 2030 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 2031 { 2032 uint_t mask = vap->va_mask; 2033 2034 if (!(mask & AT_MODE)) 2035 sa->mode.set_it = FALSE; 2036 else { 2037 sa->mode.set_it = TRUE; 2038 sa->mode.mode = (mode3)vap->va_mode; 2039 } 2040 if (!(mask & AT_UID)) 2041 sa->uid.set_it = FALSE; 2042 else { 2043 sa->uid.set_it = TRUE; 2044 sa->uid.uid = (uid3)vap->va_uid; 2045 } 2046 if (!(mask & AT_GID)) 2047 sa->gid.set_it = FALSE; 2048 else { 2049 sa->gid.set_it = TRUE; 2050 sa->gid.gid = (gid3)vap->va_gid; 2051 } 2052 if (!(mask & AT_SIZE)) 2053 sa->size.set_it = FALSE; 2054 else { 2055 sa->size.set_it = TRUE; 2056 sa->size.size = (size3)vap->va_size; 2057 } 2058 if (!(mask & AT_ATIME)) 2059 sa->atime.set_it = DONT_CHANGE; 2060 else { 2061 /* check time validity */ 2062 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2063 return (EOVERFLOW); 2064 } 2065 sa->atime.set_it = SET_TO_CLIENT_TIME; 2066 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2067 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2068 } 2069 if (!(mask & AT_MTIME)) 2070 sa->mtime.set_it = DONT_CHANGE; 2071 else { 2072 /* check time validity */ 2073 if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2074 return (EOVERFLOW); 2075 } 2076 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2077 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2078 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2079 } 2080 return (0); 2081 } 2082 2083 void 2084 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2085 { 2086 2087 da->da_fhandle = VTOFH(dvp); 2088 da->da_name = nm; 2089 da->da_flags = 0; 2090 } 2091 2092 void 2093 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2094 { 2095 2096 da->dirp = VTOFH3(dvp); 2097 da->name = nm; 2098 } 2099 2100 int 2101 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2102 { 2103 int error; 2104 rnode_t *rp; 2105 struct vattr va; 2106 2107 va.va_mask = AT_MODE | AT_GID; 2108 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2109 if (error) 2110 return (error); 2111 2112 /* 2113 * To determine the expected group-id of the created file: 2114 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2115 * GRPID option, and the directory's set-gid bit is clear, 2116 * then use the process's gid. 2117 * 2) Otherwise, set the group-id to the gid of the parent directory. 2118 */ 2119 rp = VTOR(dvp); 2120 mutex_enter(&rp->r_statelock); 2121 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2122 *gidp = crgetgid(cr); 2123 else 2124 *gidp = va.va_gid; 2125 mutex_exit(&rp->r_statelock); 2126 return (0); 2127 } 2128 2129 int 2130 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2131 { 2132 int error; 2133 struct vattr va; 2134 2135 va.va_mask = AT_MODE; 2136 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2137 if (error) 2138 return (error); 2139 2140 /* 2141 * Modify the expected mode (om) so that the set-gid bit matches 2142 * that of the parent directory (dvp). 2143 */ 2144 if (va.va_mode & VSGID) 2145 *omp |= VSGID; 2146 else 2147 *omp &= ~VSGID; 2148 return (0); 2149 } 2150 2151 void 2152 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2153 { 2154 2155 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2156 if (!(vp->v_flag & VSWAPLIKE)) { 2157 mutex_enter(&vp->v_lock); 2158 vp->v_flag |= VSWAPLIKE; 2159 mutex_exit(&vp->v_lock); 2160 } 2161 } else { 2162 if (vp->v_flag & VSWAPLIKE) { 2163 mutex_enter(&vp->v_lock); 2164 vp->v_flag &= ~VSWAPLIKE; 2165 mutex_exit(&vp->v_lock); 2166 } 2167 } 2168 } 2169 2170 /* 2171 * Free the resources associated with an rnode. 2172 */ 2173 static void 2174 rinactive(rnode_t *rp, cred_t *cr) 2175 { 2176 vnode_t *vp; 2177 cred_t *cred; 2178 char *contents; 2179 int size; 2180 vsecattr_t *vsp; 2181 int error; 2182 nfs3_pathconf_info *info; 2183 2184 /* 2185 * Before freeing anything, wait until all asynchronous 2186 * activity is done on this rnode. This will allow all 2187 * asynchronous read ahead and write behind i/o's to 2188 * finish. 2189 */ 2190 mutex_enter(&rp->r_statelock); 2191 while (rp->r_count > 0) 2192 cv_wait(&rp->r_cv, &rp->r_statelock); 2193 mutex_exit(&rp->r_statelock); 2194 2195 /* 2196 * Flush and invalidate all pages associated with the vnode. 
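	 * Dirty pages are pushed to the server first (unless a previous
	 * write error has already been recorded for this rnode), and then
	 * all pages are invalidated.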
2197 */ 2198 vp = RTOV(rp); 2199 if (vn_has_cached_data(vp)) { 2200 ASSERT(vp->v_type != VCHR); 2201 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2202 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 2203 if (error && (error == ENOSPC || error == EDQUOT)) { 2204 mutex_enter(&rp->r_statelock); 2205 if (!rp->r_error) 2206 rp->r_error = error; 2207 mutex_exit(&rp->r_statelock); 2208 } 2209 } 2210 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2211 } 2212 2213 /* 2214 * Free any held credentials and caches which may be associated 2215 * with this rnode. 2216 */ 2217 mutex_enter(&rp->r_statelock); 2218 cred = rp->r_cred; 2219 rp->r_cred = NULL; 2220 contents = rp->r_symlink.contents; 2221 size = rp->r_symlink.size; 2222 rp->r_symlink.contents = NULL; 2223 vsp = rp->r_secattr; 2224 rp->r_secattr = NULL; 2225 info = rp->r_pathconf; 2226 rp->r_pathconf = NULL; 2227 mutex_exit(&rp->r_statelock); 2228 2229 /* 2230 * Free the held credential. 2231 */ 2232 if (cred != NULL) 2233 crfree(cred); 2234 2235 /* 2236 * Free the access cache entries. 2237 */ 2238 (void) nfs_access_purge_rp(rp); 2239 2240 /* 2241 * Free the readdir cache entries. 2242 */ 2243 if (HAVE_RDDIR_CACHE(rp)) 2244 nfs_purge_rddir_cache(vp); 2245 2246 /* 2247 * Free the symbolic link cache. 2248 */ 2249 if (contents != NULL) { 2250 2251 kmem_free((void *)contents, size); 2252 } 2253 2254 /* 2255 * Free any cached ACL. 2256 */ 2257 if (vsp != NULL) 2258 nfs_acl_free(vsp); 2259 2260 /* 2261 * Free any cached pathconf information. 2262 */ 2263 if (info != NULL) 2264 kmem_free(info, sizeof (*info)); 2265 } 2266 2267 /* 2268 * Return a vnode for the given NFS Version 2 file handle. 2269 * If no rnode exists for this fhandle, create one and put it 2270 * into the hash queues. If the rnode for this fhandle 2271 * already exists, return it. 2272 * 2273 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2274 */ 2275 vnode_t * 2276 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2277 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2278 { 2279 int newnode; 2280 int index; 2281 vnode_t *vp; 2282 nfs_fhandle nfh; 2283 vattr_t va; 2284 2285 nfh.fh_len = NFS_FHSIZE; 2286 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2287 2288 index = rtablehash(&nfh); 2289 rw_enter(&rtable[index].r_lock, RW_READER); 2290 2291 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2292 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2293 2294 if (attr != NULL) { 2295 if (!newnode) { 2296 rw_exit(&rtable[index].r_lock); 2297 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2298 } else { 2299 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2300 vp->v_type = VBAD; 2301 else 2302 vp->v_type = n2v_type(attr); 2303 /* 2304 * A translation here seems to be necessary 2305 * because this function can be called 2306 * with `attr' that has come from the wire, 2307 * and been operated on by vattr_to_nattr(). 2308 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2309 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2310 * ->makenfsnode(). 2311 */ 2312 if ((attr->na_rdev & 0xffff0000) == 0) 2313 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2314 else 2315 vp->v_rdev = expldev(n2v_rdev(attr)); 2316 nfs_attrcache(vp, attr, t); 2317 rw_exit(&rtable[index].r_lock); 2318 } 2319 } else { 2320 if (newnode) { 2321 PURGE_ATTRCACHE(vp); 2322 } 2323 rw_exit(&rtable[index].r_lock); 2324 } 2325 2326 return (vp); 2327 } 2328 2329 /* 2330 * Return a vnode for the given NFS Version 3 file handle. 
2331 * If no rnode exists for this fhandle, create one and put it 2332 * into the hash queues. If the rnode for this fhandle 2333 * already exists, return it. 2334 * 2335 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2336 */ 2337 vnode_t * 2338 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2339 cred_t *cr, char *dnm, char *nm) 2340 { 2341 int newnode; 2342 int index; 2343 vnode_t *vp; 2344 2345 index = rtablehash((nfs_fhandle *)fh); 2346 rw_enter(&rtable[index].r_lock, RW_READER); 2347 2348 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2349 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2350 dnm, nm); 2351 2352 if (vap == NULL) { 2353 if (newnode) { 2354 PURGE_ATTRCACHE(vp); 2355 } 2356 rw_exit(&rtable[index].r_lock); 2357 return (vp); 2358 } 2359 2360 if (!newnode) { 2361 rw_exit(&rtable[index].r_lock); 2362 nfs_attr_cache(vp, vap, t, cr); 2363 } else { 2364 rnode_t *rp = VTOR(vp); 2365 2366 vp->v_type = vap->va_type; 2367 vp->v_rdev = vap->va_rdev; 2368 2369 mutex_enter(&rp->r_statelock); 2370 if (rp->r_mtime <= t) 2371 nfs_attrcache_va(vp, vap); 2372 mutex_exit(&rp->r_statelock); 2373 rw_exit(&rtable[index].r_lock); 2374 } 2375 2376 return (vp); 2377 } 2378 2379 vnode_t * 2380 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2381 cred_t *cr, char *dnm, char *nm) 2382 { 2383 int newnode; 2384 int index; 2385 vnode_t *vp; 2386 vattr_t va; 2387 2388 index = rtablehash((nfs_fhandle *)fh); 2389 rw_enter(&rtable[index].r_lock, RW_READER); 2390 2391 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2392 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2393 dnm, nm); 2394 2395 if (attr == NULL) { 2396 if (newnode) { 2397 PURGE_ATTRCACHE(vp); 2398 } 2399 rw_exit(&rtable[index].r_lock); 2400 return (vp); 2401 } 2402 2403 if (!newnode) { 2404 rw_exit(&rtable[index].r_lock); 2405 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2406 } else { 2407 if (attr->type < NF3REG || attr->type > NF3FIFO) 2408 vp->v_type = VBAD; 2409 else 2410 vp->v_type = nf3_to_vt[attr->type]; 2411 vp->v_rdev = makedevice(attr->rdev.specdata1, 2412 attr->rdev.specdata2); 2413 nfs3_attrcache(vp, attr, t); 2414 rw_exit(&rtable[index].r_lock); 2415 } 2416 2417 return (vp); 2418 } 2419 2420 /* 2421 * Read this comment before making changes to rtablehash()! 2422 * This is a hash function in which seemingly obvious and harmless 2423 * changes can cause escalations costing million dollars! 2424 * Know what you are doing. 2425 * 2426 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2427 * algorithm is currently detailed here: 2428 * 2429 * http://burtleburtle.net/bob/hash/doobs.html 2430 * 2431 * Of course, the above link may not be valid by the time you are reading 2432 * this, but suffice it to say that the one-at-a-time algorithm works well in 2433 * almost all cases. If you are changing the algorithm be sure to verify that 2434 * the hash algorithm still provides even distribution in all cases and with 2435 * any server returning filehandles in whatever order (sequential or random). 
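 *
 * As an informal sketch of the mixing done below: each filehandle byte
 * is folded in via
 *	hash += key[i]; hash += (hash << 10); hash ^= (hash >> 6);
 * and the final avalanche
 *	hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15);
 * spreads the last bytes across the word before the result is masked
 * down to a bucket index with rtablemask.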
2436 */ 2437 static int 2438 rtablehash(nfs_fhandle *fh) 2439 { 2440 ulong_t hash, len, i; 2441 char *key; 2442 2443 key = fh->fh_buf; 2444 len = (ulong_t)fh->fh_len; 2445 for (hash = 0, i = 0; i < len; i++) { 2446 hash += key[i]; 2447 hash += (hash << 10); 2448 hash ^= (hash >> 6); 2449 } 2450 hash += (hash << 3); 2451 hash ^= (hash >> 11); 2452 hash += (hash << 15); 2453 return (hash & rtablemask); 2454 } 2455 2456 static vnode_t * 2457 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2458 struct vnodeops *vops, 2459 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2460 int (*compar)(const void *, const void *), 2461 int *newnode, cred_t *cr, char *dnm, char *nm) 2462 { 2463 rnode_t *rp; 2464 rnode_t *trp; 2465 vnode_t *vp; 2466 mntinfo_t *mi; 2467 2468 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2469 2470 mi = VFTOMI(vfsp); 2471 start: 2472 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2473 vp = RTOV(rp); 2474 nfs_set_vroot(vp); 2475 *newnode = 0; 2476 return (vp); 2477 } 2478 rw_exit(&rhtp->r_lock); 2479 2480 mutex_enter(&rpfreelist_lock); 2481 if (rpfreelist != NULL && rnew >= nrnode) { 2482 rp = rpfreelist; 2483 rp_rmfree(rp); 2484 mutex_exit(&rpfreelist_lock); 2485 2486 vp = RTOV(rp); 2487 2488 if (rp->r_flags & RHASHED) { 2489 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2490 mutex_enter(&vp->v_lock); 2491 if (vp->v_count > 1) { 2492 vp->v_count--; 2493 mutex_exit(&vp->v_lock); 2494 rw_exit(&rp->r_hashq->r_lock); 2495 rw_enter(&rhtp->r_lock, RW_READER); 2496 goto start; 2497 } 2498 mutex_exit(&vp->v_lock); 2499 rp_rmhash_locked(rp); 2500 rw_exit(&rp->r_hashq->r_lock); 2501 } 2502 2503 rinactive(rp, cr); 2504 2505 mutex_enter(&vp->v_lock); 2506 if (vp->v_count > 1) { 2507 vp->v_count--; 2508 mutex_exit(&vp->v_lock); 2509 rw_enter(&rhtp->r_lock, RW_READER); 2510 goto start; 2511 } 2512 mutex_exit(&vp->v_lock); 2513 vn_invalid(vp); 2514 /* 2515 * destroy old locks before bzero'ing and 2516 * recreating the locks below. 2517 */ 2518 nfs_rw_destroy(&rp->r_rwlock); 2519 nfs_rw_destroy(&rp->r_lkserlock); 2520 mutex_destroy(&rp->r_statelock); 2521 cv_destroy(&rp->r_cv); 2522 cv_destroy(&rp->r_commit.c_cv); 2523 nfs_free_r_path(rp); 2524 avl_destroy(&rp->r_dir); 2525 /* 2526 * Make sure that if rnode is recycled then 2527 * VFS count is decremented properly before 2528 * reuse. 
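		 * (The matching VFS_HOLD for the rnode's new identity is
		 * taken further below, after the rnode has been
		 * reinitialized.)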
2529 */ 2530 VFS_RELE(vp->v_vfsp); 2531 vn_reinit(vp); 2532 } else { 2533 vnode_t *new_vp; 2534 2535 mutex_exit(&rpfreelist_lock); 2536 2537 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2538 new_vp = vn_alloc(KM_SLEEP); 2539 2540 atomic_inc_ulong((ulong_t *)&rnew); 2541 #ifdef DEBUG 2542 clstat_debug.nrnode.value.ui64++; 2543 #endif 2544 vp = new_vp; 2545 } 2546 2547 bzero(rp, sizeof (*rp)); 2548 rp->r_vnode = vp; 2549 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2550 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2551 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2552 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2553 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2554 rp->r_fh.fh_len = fh->fh_len; 2555 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2556 rp->r_server = mi->mi_curr_serv; 2557 if (FAILOVER_MOUNT(mi)) { 2558 /* 2559 * If replicated servers, stash pathnames 2560 */ 2561 if (dnm != NULL && nm != NULL) { 2562 char *s, *p; 2563 uint_t len; 2564 2565 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2566 rp->r_path = kmem_alloc(len, KM_SLEEP); 2567 #ifdef DEBUG 2568 clstat_debug.rpath.value.ui64 += len; 2569 #endif 2570 s = rp->r_path; 2571 for (p = dnm; *p; p++) 2572 *s++ = *p; 2573 *s++ = '/'; 2574 for (p = nm; *p; p++) 2575 *s++ = *p; 2576 *s = '\0'; 2577 } else { 2578 /* special case for root */ 2579 rp->r_path = kmem_alloc(2, KM_SLEEP); 2580 #ifdef DEBUG 2581 clstat_debug.rpath.value.ui64 += 2; 2582 #endif 2583 *rp->r_path = '.'; 2584 *(rp->r_path + 1) = '\0'; 2585 } 2586 } 2587 VFS_HOLD(vfsp); 2588 rp->r_putapage = putapage; 2589 rp->r_hashq = rhtp; 2590 rp->r_flags = RREADDIRPLUS; 2591 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2592 offsetof(rddir_cache, tree)); 2593 vn_setops(vp, vops); 2594 vp->v_data = (caddr_t)rp; 2595 vp->v_vfsp = vfsp; 2596 vp->v_type = VNON; 2597 vp->v_flag |= VMODSORT; 2598 nfs_set_vroot(vp); 2599 2600 /* 2601 * There is a race condition if someone else 2602 * alloc's the rnode while no locks are held, so we 2603 * check again and recover if found. 2604 */ 2605 rw_enter(&rhtp->r_lock, RW_WRITER); 2606 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2607 vp = RTOV(trp); 2608 nfs_set_vroot(vp); 2609 *newnode = 0; 2610 rw_exit(&rhtp->r_lock); 2611 rp_addfree(rp, cr); 2612 rw_enter(&rhtp->r_lock, RW_READER); 2613 return (vp); 2614 } 2615 rp_addhash(rp); 2616 *newnode = 1; 2617 return (vp); 2618 } 2619 2620 /* 2621 * Callback function to check if the page should be marked as 2622 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT. 2623 */ 2624 int 2625 nfs_setmod_check(page_t *pp) 2626 { 2627 if (pp->p_fsdata != C_NOCOMMIT) { 2628 pp->p_fsdata = C_NOCOMMIT; 2629 return (1); 2630 } 2631 return (0); 2632 } 2633 2634 static void 2635 nfs_set_vroot(vnode_t *vp) 2636 { 2637 rnode_t *rp; 2638 nfs_fhandle *rootfh; 2639 2640 rp = VTOR(vp); 2641 rootfh = &rp->r_server->sv_fhandle; 2642 if (rootfh->fh_len == rp->r_fh.fh_len && 2643 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2644 if (!(vp->v_flag & VROOT)) { 2645 mutex_enter(&vp->v_lock); 2646 vp->v_flag |= VROOT; 2647 mutex_exit(&vp->v_lock); 2648 } 2649 } 2650 } 2651 2652 static void 2653 nfs_free_r_path(rnode_t *rp) 2654 { 2655 char *path; 2656 size_t len; 2657 2658 path = rp->r_path; 2659 if (path) { 2660 rp->r_path = NULL; 2661 len = strlen(path) + 1; 2662 kmem_free(path, len); 2663 #ifdef DEBUG 2664 clstat_debug.rpath.value.ui64 -= len; 2665 #endif 2666 } 2667 } 2668 2669 /* 2670 * Put an rnode on the free list. 
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with RDIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other thread has acquired a reference in
	 * the meantime; such a reference would indicate that the rnode
	 * should not be placed on the freelist.  If another reference
	 * has been acquired, then just release this one and let the
	 * other reference holder complete the processing of adding
	 * this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
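	 * Otherwise it is inserted at the back of the circular list, so
	 * that rnodes with reusable cached state survive longer; the
	 * front of the list is what the recycling path in make_rnode()
	 * consumes first.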
2767 */ 2768 mutex_enter(&rpfreelist_lock); 2769 if (rpfreelist == NULL) { 2770 rp->r_freef = rp; 2771 rp->r_freeb = rp; 2772 rpfreelist = rp; 2773 } else { 2774 rp->r_freef = rpfreelist; 2775 rp->r_freeb = rpfreelist->r_freeb; 2776 rpfreelist->r_freeb->r_freef = rp; 2777 rpfreelist->r_freeb = rp; 2778 if (!vn_has_cached_data(vp) && 2779 !HAVE_RDDIR_CACHE(rp) && 2780 rp->r_symlink.contents == NULL && 2781 rp->r_secattr == NULL && 2782 rp->r_pathconf == NULL) 2783 rpfreelist = rp; 2784 } 2785 mutex_exit(&rpfreelist_lock); 2786 2787 rw_exit(&rp->r_hashq->r_lock); 2788 } 2789 2790 /* 2791 * Remove an rnode from the free list. 2792 * 2793 * The caller must be holding rpfreelist_lock and the rnode 2794 * must be on the freelist. 2795 */ 2796 static void 2797 rp_rmfree(rnode_t *rp) 2798 { 2799 2800 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2801 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2802 2803 if (rp == rpfreelist) { 2804 rpfreelist = rp->r_freef; 2805 if (rp == rpfreelist) 2806 rpfreelist = NULL; 2807 } 2808 2809 rp->r_freeb->r_freef = rp->r_freef; 2810 rp->r_freef->r_freeb = rp->r_freeb; 2811 2812 rp->r_freef = rp->r_freeb = NULL; 2813 } 2814 2815 /* 2816 * Put a rnode in the hash table. 2817 * 2818 * The caller must be holding the exclusive hash queue lock. 2819 */ 2820 static void 2821 rp_addhash(rnode_t *rp) 2822 { 2823 mntinfo_t *mi; 2824 2825 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2826 ASSERT(!(rp->r_flags & RHASHED)); 2827 2828 rp->r_hashf = rp->r_hashq->r_hashf; 2829 rp->r_hashq->r_hashf = rp; 2830 rp->r_hashb = (rnode_t *)rp->r_hashq; 2831 rp->r_hashf->r_hashb = rp; 2832 2833 mutex_enter(&rp->r_statelock); 2834 rp->r_flags |= RHASHED; 2835 mutex_exit(&rp->r_statelock); 2836 2837 mi = VTOMI(RTOV(rp)); 2838 mutex_enter(&mi->mi_rnodes_lock); 2839 list_insert_tail(&mi->mi_rnodes, rp); 2840 mutex_exit(&mi->mi_rnodes_lock); 2841 } 2842 2843 /* 2844 * Remove a rnode from the hash table. 2845 * 2846 * The caller must be holding the hash queue lock. 2847 */ 2848 static void 2849 rp_rmhash_locked(rnode_t *rp) 2850 { 2851 mntinfo_t *mi; 2852 2853 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2854 ASSERT(rp->r_flags & RHASHED); 2855 2856 rp->r_hashb->r_hashf = rp->r_hashf; 2857 rp->r_hashf->r_hashb = rp->r_hashb; 2858 2859 mutex_enter(&rp->r_statelock); 2860 rp->r_flags &= ~RHASHED; 2861 mutex_exit(&rp->r_statelock); 2862 2863 mi = VTOMI(RTOV(rp)); 2864 mutex_enter(&mi->mi_rnodes_lock); 2865 if (list_link_active(&rp->r_mi_link)) 2866 list_remove(&mi->mi_rnodes, rp); 2867 mutex_exit(&mi->mi_rnodes_lock); 2868 } 2869 2870 /* 2871 * Remove a rnode from the hash table. 2872 * 2873 * The caller must not be holding the hash queue lock. 2874 */ 2875 void 2876 rp_rmhash(rnode_t *rp) 2877 { 2878 2879 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2880 rp_rmhash_locked(rp); 2881 rw_exit(&rp->r_hashq->r_lock); 2882 } 2883 2884 /* 2885 * Lookup a rnode by fhandle. 2886 * 2887 * The caller must be holding the hash queue lock, either shared or exclusive. 2888 */ 2889 static rnode_t * 2890 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2891 { 2892 rnode_t *rp; 2893 vnode_t *vp; 2894 2895 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2896 2897 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2898 vp = RTOV(rp); 2899 if (vp->v_vfsp == vfsp && 2900 rp->r_fh.fh_len == fh->fh_len && 2901 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2902 /* 2903 * remove rnode from free list, if necessary. 
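			 * r_freef is tested again below with
			 * rpfreelist_lock held, since the hash bucket
			 * lock may only be held shared here and another
			 * lookup could race us for the freelist
			 * reference.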
2904 */ 2905 if (rp->r_freef != NULL) { 2906 mutex_enter(&rpfreelist_lock); 2907 /* 2908 * If the rnode is on the freelist, 2909 * then remove it and use that reference 2910 * as the new reference. Otherwise, 2911 * need to increment the reference count. 2912 */ 2913 if (rp->r_freef != NULL) { 2914 rp_rmfree(rp); 2915 mutex_exit(&rpfreelist_lock); 2916 } else { 2917 mutex_exit(&rpfreelist_lock); 2918 VN_HOLD(vp); 2919 } 2920 } else 2921 VN_HOLD(vp); 2922 return (rp); 2923 } 2924 } 2925 return (NULL); 2926 } 2927 2928 /* 2929 * Return 1 if there is an active vnode belonging to this vfs in the 2930 * rtable cache. 2931 * 2932 * Several of these checks are done without holding the usual 2933 * locks. This is safe because destroy_rtable(), rp_addfree(), 2934 * etc. will redo the necessary checks before actually destroying 2935 * any rnodes. 2936 */ 2937 int 2938 check_rtable(struct vfs *vfsp) 2939 { 2940 rnode_t *rp; 2941 vnode_t *vp; 2942 mntinfo_t *mi; 2943 2944 ASSERT(vfsp != NULL); 2945 mi = VFTOMI(vfsp); 2946 2947 mutex_enter(&mi->mi_rnodes_lock); 2948 for (rp = list_head(&mi->mi_rnodes); rp != NULL; 2949 rp = list_next(&mi->mi_rnodes, rp)) { 2950 vp = RTOV(rp); 2951 2952 if (rp->r_freef == NULL || 2953 (vn_has_cached_data(vp) && (rp->r_flags & RDIRTY)) || 2954 rp->r_count > 0) { 2955 mutex_exit(&mi->mi_rnodes_lock); 2956 return (1); 2957 } 2958 } 2959 mutex_exit(&mi->mi_rnodes_lock); 2960 2961 return (0); 2962 } 2963 2964 /* 2965 * Destroy inactive vnodes from the hash queues which belong to this 2966 * vfs. It is essential that we destroy all inactive vnodes during a 2967 * forced unmount as well as during a normal unmount. 2968 */ 2969 void 2970 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2971 { 2972 rnode_t *rp; 2973 mntinfo_t *mi; 2974 2975 ASSERT(vfsp != NULL); 2976 2977 mi = VFTOMI(vfsp); 2978 2979 mutex_enter(&rpfreelist_lock); 2980 mutex_enter(&mi->mi_rnodes_lock); 2981 while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) { 2982 /* 2983 * If the rnode is no longer on the freelist it is not 2984 * ours and it will be handled by some other thread, so 2985 * skip it. 2986 */ 2987 if (rp->r_freef == NULL) 2988 continue; 2989 mutex_exit(&mi->mi_rnodes_lock); 2990 2991 rp_rmfree(rp); 2992 mutex_exit(&rpfreelist_lock); 2993 2994 rp_rmhash(rp); 2995 2996 /* 2997 * This call to rp_addfree will end up destroying the 2998 * rnode, but in a safe way with the appropriate set 2999 * of checks done. 3000 */ 3001 rp_addfree(rp, cr); 3002 3003 mutex_enter(&rpfreelist_lock); 3004 mutex_enter(&mi->mi_rnodes_lock); 3005 } 3006 mutex_exit(&mi->mi_rnodes_lock); 3007 mutex_exit(&rpfreelist_lock); 3008 } 3009 3010 /* 3011 * This routine destroys all the resources associated with the rnode 3012 * and then the rnode itself. 
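 *
 * The caller must hold the last reference to the vnode, and the rnode
 * must already be off both the hash queues and the freelist (see the
 * ASSERTs below).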
 */
static void
destroy_rnode(rnode_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	vp = RTOV(rp);
	vfsp = vp->v_vfsp;

	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_lmpl == NULL);
	ASSERT(rp->r_mapcnt == 0);
	ASSERT(!(rp->r_flags & RHASHED));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	atomic_dec_ulong((ulong_t *)&rnew);
#ifdef DEBUG
	clstat_debug.nrnode.value.ui64--;
#endif
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	if (rp->r_flags & RDELMAPLIST)
		list_destroy(&rp->r_indelmap);
	nfs_free_r_path(rp);
	avl_destroy(&rp->r_dir);
	vn_invalid(vp);
	vn_free(vp);
	kmem_cache_free(rnode_cache, rp);
	VFS_RELE(vfsp);
}

/*
 * Flush all vnodes in this (or every) vfs.
 * Used by nfs_sync and by nfs_unmount.
 */
void
rflush(struct vfs *vfsp, cred_t *cr)
{
	int index;
	rnode_t *rp;
	vnode_t *vp, **vplist;
	long num, cnt;

	/*
	 * Check to see whether there is anything to do.
	 */
	num = rnew;
	if (num == 0)
		return;

	/*
	 * Allocate a slot for all currently active rnodes on the
	 * supposition that they all may need flushing.
	 */
	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
	cnt = 0;

	/*
	 * If the vfs is known, we can take the fast path of iterating
	 * only the rnodes that belong to this vfs.  This is much faster
	 * than the traditional walk of rtable (below) when there are
	 * many rnodes that do not belong to our vfs.
	 */
	if (vfsp != NULL) {
		mntinfo_t *mi = VFTOMI(vfsp);

		mutex_enter(&mi->mi_rnodes_lock);
		for (rp = list_head(&mi->mi_rnodes); rp != NULL;
		    rp = list_next(&mi->mi_rnodes, rp)) {
			vp = RTOV(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If the vnode has pages and is marked as either dirty
			 * or mmap'd, hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			ASSERT(vp->v_vfsp == vfsp);
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					/*
					 * The vplist is full because there
					 * are too many rnodes.  We are done
					 * for now.
					 */
					break;
				}
			}
		}
		mutex_exit(&mi->mi_rnodes_lock);

		goto done;
	}

	ASSERT(vfsp == NULL);

	/*
	 * Walk the hash queues looking for rnodes with page
	 * lists associated with them.  Make a list of these
	 * files.
	 */
	for (index = 0; index < rtablesize; index++) {
		rw_enter(&rtable[index].r_lock, RW_READER);
		for (rp = rtable[index].r_hashf;
		    rp != (rnode_t *)(&rtable[index]);
		    rp = rp->r_hashf) {
			vp = RTOV(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If the vnode has pages and is marked as either dirty
			 * or mmap'd, hold and add this vnode to the list of
			 * vnodes to flush.
3143 */ 3144 if (vn_has_cached_data(vp) && 3145 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3146 VN_HOLD(vp); 3147 vplist[cnt++] = vp; 3148 if (cnt == num) { 3149 rw_exit(&rtable[index].r_lock); 3150 /* 3151 * The vplist is full because there is 3152 * too many rnodes. We are done for 3153 * now. 3154 */ 3155 goto done; 3156 } 3157 } 3158 } 3159 rw_exit(&rtable[index].r_lock); 3160 } 3161 3162 done: 3163 3164 /* 3165 * Flush and release all of the files on the list. 3166 */ 3167 while (cnt-- > 0) { 3168 vp = vplist[cnt]; 3169 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 3170 VN_RELE(vp); 3171 } 3172 3173 /* 3174 * Free the space allocated to hold the list. 3175 */ 3176 kmem_free(vplist, num * sizeof (*vplist)); 3177 } 3178 3179 /* 3180 * This probably needs to be larger than or equal to 3181 * log2(sizeof (struct rnode)) due to the way that rnodes are 3182 * allocated. 3183 */ 3184 #define ACACHE_SHIFT_BITS 9 3185 3186 static int 3187 acachehash(rnode_t *rp, cred_t *cr) 3188 { 3189 3190 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3191 acachemask); 3192 } 3193 3194 #ifdef DEBUG 3195 static long nfs_access_cache_hits = 0; 3196 static long nfs_access_cache_misses = 0; 3197 #endif 3198 3199 nfs_access_type_t 3200 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3201 { 3202 vnode_t *vp; 3203 acache_t *ap; 3204 acache_hash_t *hp; 3205 nfs_access_type_t all; 3206 3207 vp = RTOV(rp); 3208 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3209 return (NFS_ACCESS_UNKNOWN); 3210 3211 if (rp->r_acache != NULL) { 3212 hp = &acache[acachehash(rp, cr)]; 3213 rw_enter(&hp->lock, RW_READER); 3214 ap = hp->next; 3215 while (ap != (acache_t *)hp) { 3216 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3217 if ((ap->known & acc) == acc) { 3218 #ifdef DEBUG 3219 nfs_access_cache_hits++; 3220 #endif 3221 if ((ap->allowed & acc) == acc) 3222 all = NFS_ACCESS_ALLOWED; 3223 else 3224 all = NFS_ACCESS_DENIED; 3225 } else { 3226 #ifdef DEBUG 3227 nfs_access_cache_misses++; 3228 #endif 3229 all = NFS_ACCESS_UNKNOWN; 3230 } 3231 rw_exit(&hp->lock); 3232 return (all); 3233 } 3234 ap = ap->next; 3235 } 3236 rw_exit(&hp->lock); 3237 } 3238 3239 #ifdef DEBUG 3240 nfs_access_cache_misses++; 3241 #endif 3242 return (NFS_ACCESS_UNKNOWN); 3243 } 3244 3245 void 3246 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3247 { 3248 acache_t *ap; 3249 acache_t *nap; 3250 acache_hash_t *hp; 3251 3252 hp = &acache[acachehash(rp, cr)]; 3253 3254 /* 3255 * Allocate now assuming that mostly an allocation will be 3256 * required. This allows the allocation to happen without 3257 * holding the hash bucket locked. 
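	 * If an entry for this cred/rnode pair already exists once the
	 * bucket lock is held, the speculative allocation is simply freed
	 * again; if the KM_NOSLEEP allocation fails, the result is just
	 * not cached.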
3258 */ 3259 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3260 if (nap != NULL) { 3261 nap->known = acc; 3262 nap->allowed = resacc; 3263 nap->rnode = rp; 3264 crhold(cr); 3265 nap->cred = cr; 3266 nap->hashq = hp; 3267 } 3268 3269 rw_enter(&hp->lock, RW_WRITER); 3270 3271 if (rp->r_acache != NULL) { 3272 ap = hp->next; 3273 while (ap != (acache_t *)hp) { 3274 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3275 ap->known |= acc; 3276 ap->allowed &= ~acc; 3277 ap->allowed |= resacc; 3278 rw_exit(&hp->lock); 3279 if (nap != NULL) { 3280 crfree(nap->cred); 3281 kmem_cache_free(acache_cache, nap); 3282 } 3283 return; 3284 } 3285 ap = ap->next; 3286 } 3287 } 3288 3289 if (nap != NULL) { 3290 #ifdef DEBUG 3291 clstat_debug.access.value.ui64++; 3292 #endif 3293 nap->next = hp->next; 3294 hp->next = nap; 3295 nap->next->prev = nap; 3296 nap->prev = (acache_t *)hp; 3297 3298 mutex_enter(&rp->r_statelock); 3299 nap->list = rp->r_acache; 3300 rp->r_acache = nap; 3301 mutex_exit(&rp->r_statelock); 3302 } 3303 3304 rw_exit(&hp->lock); 3305 } 3306 3307 int 3308 nfs_access_purge_rp(rnode_t *rp) 3309 { 3310 acache_t *ap; 3311 acache_t *tmpap; 3312 acache_t *rplist; 3313 3314 /* 3315 * If there aren't any cached entries, then there is nothing 3316 * to free. 3317 */ 3318 if (rp->r_acache == NULL) 3319 return (0); 3320 3321 mutex_enter(&rp->r_statelock); 3322 rplist = rp->r_acache; 3323 rp->r_acache = NULL; 3324 mutex_exit(&rp->r_statelock); 3325 3326 /* 3327 * Loop through each entry in the list pointed to in the 3328 * rnode. Remove each of these entries from the hash 3329 * queue that it is on and remove it from the list in 3330 * the rnode. 3331 */ 3332 for (ap = rplist; ap != NULL; ap = tmpap) { 3333 rw_enter(&ap->hashq->lock, RW_WRITER); 3334 ap->prev->next = ap->next; 3335 ap->next->prev = ap->prev; 3336 rw_exit(&ap->hashq->lock); 3337 3338 tmpap = ap->list; 3339 crfree(ap->cred); 3340 kmem_cache_free(acache_cache, ap); 3341 #ifdef DEBUG 3342 clstat_debug.access.value.ui64--; 3343 #endif 3344 } 3345 3346 return (1); 3347 } 3348 3349 static const char prefix[] = ".nfs"; 3350 3351 static kmutex_t newnum_lock; 3352 3353 int 3354 newnum(void) 3355 { 3356 static uint_t newnum = 0; 3357 uint_t id; 3358 3359 mutex_enter(&newnum_lock); 3360 if (newnum == 0) 3361 newnum = gethrestime_sec() & 0xffff; 3362 id = newnum++; 3363 mutex_exit(&newnum_lock); 3364 return (id); 3365 } 3366 3367 char * 3368 newname(void) 3369 { 3370 char *news; 3371 char *s; 3372 const char *p; 3373 uint_t id; 3374 3375 id = newnum(); 3376 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3377 s = news; 3378 p = prefix; 3379 while (*p != '\0') 3380 *s++ = *p++; 3381 while (id != 0) { 3382 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3383 id >>= 4; 3384 } 3385 *s = '\0'; 3386 return (news); 3387 } 3388 3389 /* 3390 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3391 * framework. 3392 */ 3393 static int 3394 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3395 { 3396 ksp->ks_snaptime = gethrtime(); 3397 if (rw == KSTAT_WRITE) { 3398 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3399 #ifdef DEBUG 3400 /* 3401 * Currently only the global zone can write to kstats, but we 3402 * add the check just for paranoia. 
3403 */ 3404 if (INGLOBALZONE(curproc)) 3405 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3406 sizeof (clstat_debug)); 3407 #endif 3408 } else { 3409 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3410 #ifdef DEBUG 3411 /* 3412 * If we're displaying the "global" debug kstat values, we 3413 * display them as-is to all zones since in fact they apply to 3414 * the system as a whole. 3415 */ 3416 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3417 sizeof (clstat_debug)); 3418 #endif 3419 } 3420 return (0); 3421 } 3422 3423 static void * 3424 clinit_zone(zoneid_t zoneid) 3425 { 3426 kstat_t *nfs_client_kstat; 3427 struct nfs_clnt *nfscl; 3428 uint_t ndata; 3429 3430 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3431 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3432 nfscl->nfscl_chtable = NULL; 3433 nfscl->nfscl_zoneid = zoneid; 3434 3435 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3436 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3437 #ifdef DEBUG 3438 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3439 #endif 3440 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3441 "misc", KSTAT_TYPE_NAMED, ndata, 3442 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3443 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3444 nfs_client_kstat->ks_snapshot = cl_snapshot; 3445 kstat_install(nfs_client_kstat); 3446 } 3447 mutex_enter(&nfs_clnt_list_lock); 3448 list_insert_head(&nfs_clnt_list, nfscl); 3449 mutex_exit(&nfs_clnt_list_lock); 3450 return (nfscl); 3451 } 3452 3453 /*ARGSUSED*/ 3454 static void 3455 clfini_zone(zoneid_t zoneid, void *arg) 3456 { 3457 struct nfs_clnt *nfscl = arg; 3458 chhead_t *chp, *next; 3459 3460 if (nfscl == NULL) 3461 return; 3462 mutex_enter(&nfs_clnt_list_lock); 3463 list_remove(&nfs_clnt_list, nfscl); 3464 mutex_exit(&nfs_clnt_list_lock); 3465 clreclaim_zone(nfscl, 0); 3466 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 3467 ASSERT(chp->ch_list == NULL); 3468 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3469 next = chp->ch_next; 3470 kmem_free(chp, sizeof (*chp)); 3471 } 3472 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3473 mutex_destroy(&nfscl->nfscl_chtable_lock); 3474 kmem_free(nfscl, sizeof (*nfscl)); 3475 } 3476 3477 /* 3478 * Called by endpnt_destructor to make sure the client handles are 3479 * cleaned up before the RPC endpoints. This becomes a no-op if 3480 * clfini_zone (above) is called first. This function is needed 3481 * (rather than relying on clfini_zone to clean up) because the ZSD 3482 * callbacks have no ordering mechanism, so we have no way to ensure 3483 * that clfini_zone is called before endpnt_destructor. 
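 * If clfini_zone() has already run for the zone, its entry is gone
 * from nfs_clnt_list and the loop below simply finds nothing to do.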
3484 */ 3485 void 3486 clcleanup_zone(zoneid_t zoneid) 3487 { 3488 struct nfs_clnt *nfscl; 3489 3490 mutex_enter(&nfs_clnt_list_lock); 3491 nfscl = list_head(&nfs_clnt_list); 3492 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3493 if (nfscl->nfscl_zoneid == zoneid) { 3494 clreclaim_zone(nfscl, 0); 3495 break; 3496 } 3497 } 3498 mutex_exit(&nfs_clnt_list_lock); 3499 } 3500 3501 int 3502 nfs_subrinit(void) 3503 { 3504 int i; 3505 ulong_t nrnode_max; 3506 3507 /* 3508 * Allocate and initialize the rnode hash queues 3509 */ 3510 if (nrnode <= 0) 3511 nrnode = ncsize; 3512 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3513 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3514 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3515 "!setting nrnode to max value of %ld", nrnode_max); 3516 nrnode = nrnode_max; 3517 } 3518 3519 rtablesize = 1 << highbit(nrnode / hashlen); 3520 rtablemask = rtablesize - 1; 3521 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3522 for (i = 0; i < rtablesize; i++) { 3523 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3524 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3525 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3526 } 3527 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 3528 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3529 3530 /* 3531 * Allocate and initialize the access cache 3532 */ 3533 3534 /* 3535 * Initial guess is one access cache entry per rnode unless 3536 * nacache is set to a non-zero value and then it is used to 3537 * indicate a guess at the number of access cache entries. 3538 */ 3539 if (nacache > 0) 3540 acachesize = 1 << highbit(nacache / hashlen); 3541 else 3542 acachesize = rtablesize; 3543 acachemask = acachesize - 1; 3544 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3545 for (i = 0; i < acachesize; i++) { 3546 acache[i].next = (acache_t *)&acache[i]; 3547 acache[i].prev = (acache_t *)&acache[i]; 3548 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3549 } 3550 acache_cache = kmem_cache_create("nfs_access_cache", 3551 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3552 /* 3553 * Allocate and initialize the client handle cache 3554 */ 3555 chtab_cache = kmem_cache_create("client_handle_cache", 3556 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0); 3557 /* 3558 * Initialize the list of per-zone client handles (and associated data). 3559 * This needs to be done before we call zone_key_create(). 3560 */ 3561 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3562 offsetof(struct nfs_clnt, nfscl_node)); 3563 /* 3564 * Initialize the zone_key for per-zone client handle lists. 
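 * clinit_zone() runs for every existing and subsequently created
 * zone; clfini_zone() is the destructor invoked when a zone goes away.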
3565 */ 3566 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3567 /* 3568 * Initialize the various mutexes and reader/writer locks 3569 */ 3570 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3571 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3572 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3573 3574 /* 3575 * Assign unique major number for all nfs mounts 3576 */ 3577 if ((nfs_major = getudev()) == -1) { 3578 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3579 "nfs: init: can't get unique device number"); 3580 nfs_major = 0; 3581 } 3582 nfs_minor = 0; 3583 3584 if (nfs3_jukebox_delay == 0) 3585 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3586 3587 return (0); 3588 } 3589 3590 void 3591 nfs_subrfini(void) 3592 { 3593 int i; 3594 3595 /* 3596 * Deallocate the rnode hash queues 3597 */ 3598 kmem_cache_destroy(rnode_cache); 3599 3600 for (i = 0; i < rtablesize; i++) 3601 rw_destroy(&rtable[i].r_lock); 3602 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3603 3604 /* 3605 * Deallocated the access cache 3606 */ 3607 kmem_cache_destroy(acache_cache); 3608 3609 for (i = 0; i < acachesize; i++) 3610 rw_destroy(&acache[i].lock); 3611 kmem_free(acache, acachesize * sizeof (*acache)); 3612 3613 /* 3614 * Deallocate the client handle cache 3615 */ 3616 kmem_cache_destroy(chtab_cache); 3617 3618 /* 3619 * Destroy the various mutexes and reader/writer locks 3620 */ 3621 mutex_destroy(&rpfreelist_lock); 3622 mutex_destroy(&newnum_lock); 3623 mutex_destroy(&nfs_minor_lock); 3624 (void) zone_key_delete(nfsclnt_zone_key); 3625 } 3626 3627 enum nfsstat 3628 puterrno(int error) 3629 { 3630 3631 switch (error) { 3632 case EOPNOTSUPP: 3633 return (NFSERR_OPNOTSUPP); 3634 case ENAMETOOLONG: 3635 return (NFSERR_NAMETOOLONG); 3636 case ENOTEMPTY: 3637 return (NFSERR_NOTEMPTY); 3638 case EDQUOT: 3639 return (NFSERR_DQUOT); 3640 case ESTALE: 3641 return (NFSERR_STALE); 3642 case EREMOTE: 3643 return (NFSERR_REMOTE); 3644 case ENOSYS: 3645 return (NFSERR_OPNOTSUPP); 3646 case EOVERFLOW: 3647 return (NFSERR_INVAL); 3648 default: 3649 return ((enum nfsstat)error); 3650 } 3651 /* NOTREACHED */ 3652 } 3653 3654 int 3655 geterrno(enum nfsstat status) 3656 { 3657 3658 switch (status) { 3659 case NFSERR_OPNOTSUPP: 3660 return (EOPNOTSUPP); 3661 case NFSERR_NAMETOOLONG: 3662 return (ENAMETOOLONG); 3663 case NFSERR_NOTEMPTY: 3664 return (ENOTEMPTY); 3665 case NFSERR_DQUOT: 3666 return (EDQUOT); 3667 case NFSERR_STALE: 3668 return (ESTALE); 3669 case NFSERR_REMOTE: 3670 return (EREMOTE); 3671 case NFSERR_WFLUSH: 3672 return (EIO); 3673 default: 3674 return ((int)status); 3675 } 3676 /* NOTREACHED */ 3677 } 3678 3679 enum nfsstat3 3680 puterrno3(int error) 3681 { 3682 3683 #ifdef DEBUG 3684 switch (error) { 3685 case 0: 3686 return (NFS3_OK); 3687 case EPERM: 3688 return (NFS3ERR_PERM); 3689 case ENOENT: 3690 return (NFS3ERR_NOENT); 3691 case EIO: 3692 return (NFS3ERR_IO); 3693 case ENXIO: 3694 return (NFS3ERR_NXIO); 3695 case EACCES: 3696 return (NFS3ERR_ACCES); 3697 case EEXIST: 3698 return (NFS3ERR_EXIST); 3699 case EXDEV: 3700 return (NFS3ERR_XDEV); 3701 case ENODEV: 3702 return (NFS3ERR_NODEV); 3703 case ENOTDIR: 3704 return (NFS3ERR_NOTDIR); 3705 case EISDIR: 3706 return (NFS3ERR_ISDIR); 3707 case EINVAL: 3708 return (NFS3ERR_INVAL); 3709 case EFBIG: 3710 return (NFS3ERR_FBIG); 3711 case ENOSPC: 3712 return (NFS3ERR_NOSPC); 3713 case EROFS: 3714 return (NFS3ERR_ROFS); 3715 case EMLINK: 3716 return (NFS3ERR_MLINK); 3717 case ENAMETOOLONG: 3718 return (NFS3ERR_NAMETOOLONG); 3719 case 
ENOTEMPTY: 3720 return (NFS3ERR_NOTEMPTY); 3721 case EDQUOT: 3722 return (NFS3ERR_DQUOT); 3723 case ESTALE: 3724 return (NFS3ERR_STALE); 3725 case EREMOTE: 3726 return (NFS3ERR_REMOTE); 3727 case ENOSYS: 3728 case EOPNOTSUPP: 3729 return (NFS3ERR_NOTSUPP); 3730 case EOVERFLOW: 3731 return (NFS3ERR_INVAL); 3732 default: 3733 zcmn_err(getzoneid(), CE_WARN, 3734 "puterrno3: got error %d", error); 3735 return ((enum nfsstat3)error); 3736 } 3737 #else 3738 switch (error) { 3739 case ENAMETOOLONG: 3740 return (NFS3ERR_NAMETOOLONG); 3741 case ENOTEMPTY: 3742 return (NFS3ERR_NOTEMPTY); 3743 case EDQUOT: 3744 return (NFS3ERR_DQUOT); 3745 case ESTALE: 3746 return (NFS3ERR_STALE); 3747 case ENOSYS: 3748 case EOPNOTSUPP: 3749 return (NFS3ERR_NOTSUPP); 3750 case EREMOTE: 3751 return (NFS3ERR_REMOTE); 3752 case EOVERFLOW: 3753 return (NFS3ERR_INVAL); 3754 default: 3755 return ((enum nfsstat3)error); 3756 } 3757 #endif 3758 } 3759 3760 int 3761 geterrno3(enum nfsstat3 status) 3762 { 3763 3764 #ifdef DEBUG 3765 switch (status) { 3766 case NFS3_OK: 3767 return (0); 3768 case NFS3ERR_PERM: 3769 return (EPERM); 3770 case NFS3ERR_NOENT: 3771 return (ENOENT); 3772 case NFS3ERR_IO: 3773 return (EIO); 3774 case NFS3ERR_NXIO: 3775 return (ENXIO); 3776 case NFS3ERR_ACCES: 3777 return (EACCES); 3778 case NFS3ERR_EXIST: 3779 return (EEXIST); 3780 case NFS3ERR_XDEV: 3781 return (EXDEV); 3782 case NFS3ERR_NODEV: 3783 return (ENODEV); 3784 case NFS3ERR_NOTDIR: 3785 return (ENOTDIR); 3786 case NFS3ERR_ISDIR: 3787 return (EISDIR); 3788 case NFS3ERR_INVAL: 3789 return (EINVAL); 3790 case NFS3ERR_FBIG: 3791 return (EFBIG); 3792 case NFS3ERR_NOSPC: 3793 return (ENOSPC); 3794 case NFS3ERR_ROFS: 3795 return (EROFS); 3796 case NFS3ERR_MLINK: 3797 return (EMLINK); 3798 case NFS3ERR_NAMETOOLONG: 3799 return (ENAMETOOLONG); 3800 case NFS3ERR_NOTEMPTY: 3801 return (ENOTEMPTY); 3802 case NFS3ERR_DQUOT: 3803 return (EDQUOT); 3804 case NFS3ERR_STALE: 3805 return (ESTALE); 3806 case NFS3ERR_REMOTE: 3807 return (EREMOTE); 3808 case NFS3ERR_BADHANDLE: 3809 return (ESTALE); 3810 case NFS3ERR_NOT_SYNC: 3811 return (EINVAL); 3812 case NFS3ERR_BAD_COOKIE: 3813 return (ENOENT); 3814 case NFS3ERR_NOTSUPP: 3815 return (EOPNOTSUPP); 3816 case NFS3ERR_TOOSMALL: 3817 return (EINVAL); 3818 case NFS3ERR_SERVERFAULT: 3819 return (EIO); 3820 case NFS3ERR_BADTYPE: 3821 return (EINVAL); 3822 case NFS3ERR_JUKEBOX: 3823 return (ENXIO); 3824 default: 3825 zcmn_err(getzoneid(), CE_WARN, 3826 "geterrno3: got status %d", status); 3827 return ((int)status); 3828 } 3829 #else 3830 switch (status) { 3831 case NFS3ERR_NAMETOOLONG: 3832 return (ENAMETOOLONG); 3833 case NFS3ERR_NOTEMPTY: 3834 return (ENOTEMPTY); 3835 case NFS3ERR_DQUOT: 3836 return (EDQUOT); 3837 case NFS3ERR_STALE: 3838 case NFS3ERR_BADHANDLE: 3839 return (ESTALE); 3840 case NFS3ERR_NOTSUPP: 3841 return (EOPNOTSUPP); 3842 case NFS3ERR_REMOTE: 3843 return (EREMOTE); 3844 case NFS3ERR_NOT_SYNC: 3845 case NFS3ERR_TOOSMALL: 3846 case NFS3ERR_BADTYPE: 3847 return (EINVAL); 3848 case NFS3ERR_BAD_COOKIE: 3849 return (ENOENT); 3850 case NFS3ERR_SERVERFAULT: 3851 return (EIO); 3852 case NFS3ERR_JUKEBOX: 3853 return (ENXIO); 3854 default: 3855 return ((int)status); 3856 } 3857 #endif 3858 } 3859 3860 rddir_cache * 3861 rddir_cache_alloc(int flags) 3862 { 3863 rddir_cache *rc; 3864 3865 rc = kmem_alloc(sizeof (*rc), flags); 3866 if (rc != NULL) { 3867 rc->entries = NULL; 3868 rc->flags = RDDIR; 3869 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3870 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3871 
rc->count = 1; 3872 #ifdef DEBUG 3873 atomic_inc_64(&clstat_debug.dirent.value.ui64); 3874 #endif 3875 } 3876 return (rc); 3877 } 3878 3879 static void 3880 rddir_cache_free(rddir_cache *rc) 3881 { 3882 3883 #ifdef DEBUG 3884 atomic_dec_64(&clstat_debug.dirent.value.ui64); 3885 #endif 3886 if (rc->entries != NULL) { 3887 #ifdef DEBUG 3888 rddir_cache_buf_free(rc->entries, rc->buflen); 3889 #else 3890 kmem_free(rc->entries, rc->buflen); 3891 #endif 3892 } 3893 cv_destroy(&rc->cv); 3894 mutex_destroy(&rc->lock); 3895 kmem_free(rc, sizeof (*rc)); 3896 } 3897 3898 void 3899 rddir_cache_hold(rddir_cache *rc) 3900 { 3901 3902 mutex_enter(&rc->lock); 3903 rc->count++; 3904 mutex_exit(&rc->lock); 3905 } 3906 3907 void 3908 rddir_cache_rele(rddir_cache *rc) 3909 { 3910 3911 mutex_enter(&rc->lock); 3912 ASSERT(rc->count > 0); 3913 if (--rc->count == 0) { 3914 mutex_exit(&rc->lock); 3915 rddir_cache_free(rc); 3916 } else 3917 mutex_exit(&rc->lock); 3918 } 3919 3920 #ifdef DEBUG 3921 char * 3922 rddir_cache_buf_alloc(size_t size, int flags) 3923 { 3924 char *rc; 3925 3926 rc = kmem_alloc(size, flags); 3927 if (rc != NULL) 3928 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3929 return (rc); 3930 } 3931 3932 void 3933 rddir_cache_buf_free(void *addr, size_t size) 3934 { 3935 3936 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3937 kmem_free(addr, size); 3938 } 3939 #endif 3940 3941 static int 3942 nfs_free_data_reclaim(rnode_t *rp) 3943 { 3944 char *contents; 3945 int size; 3946 vsecattr_t *vsp; 3947 nfs3_pathconf_info *info; 3948 int freed; 3949 cred_t *cred; 3950 3951 /* 3952 * Free any held credentials and caches which 3953 * may be associated with this rnode. 3954 */ 3955 mutex_enter(&rp->r_statelock); 3956 cred = rp->r_cred; 3957 rp->r_cred = NULL; 3958 contents = rp->r_symlink.contents; 3959 size = rp->r_symlink.size; 3960 rp->r_symlink.contents = NULL; 3961 vsp = rp->r_secattr; 3962 rp->r_secattr = NULL; 3963 info = rp->r_pathconf; 3964 rp->r_pathconf = NULL; 3965 mutex_exit(&rp->r_statelock); 3966 3967 if (cred != NULL) 3968 crfree(cred); 3969 3970 /* 3971 * Free the access cache entries. 3972 */ 3973 freed = nfs_access_purge_rp(rp); 3974 3975 if (!HAVE_RDDIR_CACHE(rp) && 3976 contents == NULL && 3977 vsp == NULL && 3978 info == NULL) 3979 return (freed); 3980 3981 /* 3982 * Free the readdir cache entries 3983 */ 3984 if (HAVE_RDDIR_CACHE(rp)) 3985 nfs_purge_rddir_cache(RTOV(rp)); 3986 3987 /* 3988 * Free the symbolic link cache. 3989 */ 3990 if (contents != NULL) { 3991 3992 kmem_free((void *)contents, size); 3993 } 3994 3995 /* 3996 * Free any cached ACL. 3997 */ 3998 if (vsp != NULL) 3999 nfs_acl_free(vsp); 4000 4001 /* 4002 * Free any cached pathconf information. 4003 */ 4004 if (info != NULL) 4005 kmem_free(info, sizeof (*info)); 4006 4007 return (1); 4008 } 4009 4010 static int 4011 nfs_active_data_reclaim(rnode_t *rp) 4012 { 4013 char *contents; 4014 int size; 4015 vsecattr_t *vsp; 4016 nfs3_pathconf_info *info; 4017 int freed; 4018 4019 /* 4020 * Free any held credentials and caches which 4021 * may be associated with this rnode. 4022 */ 4023 if (!mutex_tryenter(&rp->r_statelock)) 4024 return (0); 4025 contents = rp->r_symlink.contents; 4026 size = rp->r_symlink.size; 4027 rp->r_symlink.contents = NULL; 4028 vsp = rp->r_secattr; 4029 rp->r_secattr = NULL; 4030 info = rp->r_pathconf; 4031 rp->r_pathconf = NULL; 4032 mutex_exit(&rp->r_statelock); 4033 4034 /* 4035 * Free the access cache entries. 
4036 */ 4037 freed = nfs_access_purge_rp(rp); 4038 4039 if (!HAVE_RDDIR_CACHE(rp) && 4040 contents == NULL && 4041 vsp == NULL && 4042 info == NULL) 4043 return (freed); 4044 4045 /* 4046 * Free the readdir cache entries 4047 */ 4048 if (HAVE_RDDIR_CACHE(rp)) 4049 nfs_purge_rddir_cache(RTOV(rp)); 4050 4051 /* 4052 * Free the symbolic link cache. 4053 */ 4054 if (contents != NULL) { 4055 4056 kmem_free((void *)contents, size); 4057 } 4058 4059 /* 4060 * Free any cached ACL. 4061 */ 4062 if (vsp != NULL) 4063 nfs_acl_free(vsp); 4064 4065 /* 4066 * Free any cached pathconf information. 4067 */ 4068 if (info != NULL) 4069 kmem_free(info, sizeof (*info)); 4070 4071 return (1); 4072 } 4073 4074 static int 4075 nfs_free_reclaim(void) 4076 { 4077 int freed; 4078 rnode_t *rp; 4079 4080 #ifdef DEBUG 4081 clstat_debug.f_reclaim.value.ui64++; 4082 #endif 4083 freed = 0; 4084 mutex_enter(&rpfreelist_lock); 4085 rp = rpfreelist; 4086 if (rp != NULL) { 4087 do { 4088 if (nfs_free_data_reclaim(rp)) 4089 freed = 1; 4090 } while ((rp = rp->r_freef) != rpfreelist); 4091 } 4092 mutex_exit(&rpfreelist_lock); 4093 return (freed); 4094 } 4095 4096 static int 4097 nfs_active_reclaim(void) 4098 { 4099 int freed; 4100 int index; 4101 rnode_t *rp; 4102 4103 #ifdef DEBUG 4104 clstat_debug.a_reclaim.value.ui64++; 4105 #endif 4106 freed = 0; 4107 for (index = 0; index < rtablesize; index++) { 4108 rw_enter(&rtable[index].r_lock, RW_READER); 4109 for (rp = rtable[index].r_hashf; 4110 rp != (rnode_t *)(&rtable[index]); 4111 rp = rp->r_hashf) { 4112 if (nfs_active_data_reclaim(rp)) 4113 freed = 1; 4114 } 4115 rw_exit(&rtable[index].r_lock); 4116 } 4117 return (freed); 4118 } 4119 4120 static int 4121 nfs_rnode_reclaim(void) 4122 { 4123 int freed; 4124 rnode_t *rp; 4125 vnode_t *vp; 4126 4127 #ifdef DEBUG 4128 clstat_debug.r_reclaim.value.ui64++; 4129 #endif 4130 freed = 0; 4131 mutex_enter(&rpfreelist_lock); 4132 while ((rp = rpfreelist) != NULL) { 4133 rp_rmfree(rp); 4134 mutex_exit(&rpfreelist_lock); 4135 if (rp->r_flags & RHASHED) { 4136 vp = RTOV(rp); 4137 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4138 mutex_enter(&vp->v_lock); 4139 if (vp->v_count > 1) { 4140 vp->v_count--; 4141 mutex_exit(&vp->v_lock); 4142 rw_exit(&rp->r_hashq->r_lock); 4143 mutex_enter(&rpfreelist_lock); 4144 continue; 4145 } 4146 mutex_exit(&vp->v_lock); 4147 rp_rmhash_locked(rp); 4148 rw_exit(&rp->r_hashq->r_lock); 4149 } 4150 /* 4151 * This call to rp_addfree will end up destroying the 4152 * rnode, but in a safe way with the appropriate set 4153 * of checks done. 
4154 */ 4155 rp_addfree(rp, CRED()); 4156 mutex_enter(&rpfreelist_lock); 4157 } 4158 mutex_exit(&rpfreelist_lock); 4159 return (freed); 4160 } 4161 4162 /*ARGSUSED*/ 4163 static void 4164 nfs_reclaim(void *cdrarg) 4165 { 4166 4167 #ifdef DEBUG 4168 clstat_debug.reclaim.value.ui64++; 4169 #endif 4170 if (nfs_free_reclaim()) 4171 return; 4172 4173 if (nfs_active_reclaim()) 4174 return; 4175 4176 (void) nfs_rnode_reclaim(); 4177 } 4178 4179 /* 4180 * NFS client failover support 4181 * 4182 * Routines to copy filehandles 4183 */ 4184 void 4185 nfscopyfh(caddr_t fhp, vnode_t *vp) 4186 { 4187 fhandle_t *dest = (fhandle_t *)fhp; 4188 4189 if (dest != NULL) 4190 *dest = *VTOFH(vp); 4191 } 4192 4193 void 4194 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4195 { 4196 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4197 4198 if (dest != NULL) 4199 *dest = *VTOFH3(vp); 4200 } 4201 4202 /* 4203 * NFS client failover support 4204 * 4205 * failover_safe() will test various conditions to ensure that 4206 * failover is permitted for this vnode. It will be denied 4207 * if: 4208 * 1) the operation in progress does not support failover (NULL fi) 4209 * 2) there are no available replicas (NULL mi_servers->sv_next) 4210 * 3) any locks are outstanding on this file 4211 */ 4212 static int 4213 failover_safe(failinfo_t *fi) 4214 { 4215 4216 /* 4217 * Does this op permit failover? 4218 */ 4219 if (fi == NULL || fi->vp == NULL) 4220 return (0); 4221 4222 /* 4223 * Are there any alternates to failover to? 4224 */ 4225 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4226 return (0); 4227 4228 /* 4229 * Disable check; we've forced local locking 4230 * 4231 * if (flk_has_remote_locks(fi->vp)) 4232 * return (0); 4233 */ 4234 4235 /* 4236 * If we have no partial path, we can't do anything 4237 */ 4238 if (VTOR(fi->vp)->r_path == NULL) 4239 return (0); 4240 4241 return (1); 4242 } 4243 4244 #include <sys/thread.h> 4245 4246 /* 4247 * NFS client failover support 4248 * 4249 * failover_newserver() will start a search for a new server, 4250 * preferably by starting an async thread to do the work. If 4251 * someone is already doing this (recognizable by MI_BINDINPROG 4252 * being set), it will simply return and the calling thread 4253 * will queue on the mi_failover_cv condition variable. 4254 */ 4255 static void 4256 failover_newserver(mntinfo_t *mi) 4257 { 4258 /* 4259 * Check if someone else is doing this already 4260 */ 4261 mutex_enter(&mi->mi_lock); 4262 if (mi->mi_flags & MI_BINDINPROG) { 4263 mutex_exit(&mi->mi_lock); 4264 return; 4265 } 4266 mi->mi_flags |= MI_BINDINPROG; 4267 4268 /* 4269 * Need to hold the vfs struct so that it can't be released 4270 * while the failover thread is selecting a new server. 4271 */ 4272 VFS_HOLD(mi->mi_vfsp); 4273 4274 /* 4275 * Start a thread to do the real searching. 4276 */ 4277 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4278 4279 mutex_exit(&mi->mi_lock); 4280 } 4281 4282 /* 4283 * NFS client failover support 4284 * 4285 * failover_thread() will find a new server to replace the one 4286 * currently in use, wake up other threads waiting on this mount 4287 * point, and die. It will start at the head of the server list 4288 * and poll servers until it finds one with an NFS server which is 4289 * registered and responds to a NULL procedure ping. 4290 * 4291 * XXX failover_thread is unsafe within the scope of the 4292 * present model defined for cpr to suspend the system. 4293 * Specifically, over-the-wire calls made by the thread 4294 * are unsafe. 
The thread needs to be reevaluated in case of 4295 * future updates to the cpr suspend model. 4296 */ 4297 static void 4298 failover_thread(mntinfo_t *mi) 4299 { 4300 servinfo_t *svp = NULL; 4301 CLIENT *cl; 4302 enum clnt_stat status; 4303 struct timeval tv; 4304 int error; 4305 int oncethru = 0; 4306 callb_cpr_t cprinfo; 4307 rnode_t *rp; 4308 int index; 4309 char *srvnames; 4310 size_t srvnames_len; 4311 struct nfs_clnt *nfscl = NULL; 4312 zoneid_t zoneid = getzoneid(); 4313 4314 #ifdef DEBUG 4315 /* 4316 * This is currently only needed to access counters which exist on 4317 * DEBUG kernels, hence we don't want to pay the penalty of the lookup 4318 * on non-DEBUG kernels. 4319 */ 4320 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4321 ASSERT(nfscl != NULL); 4322 #endif 4323 4324 /* 4325 * Its safe to piggyback on the mi_lock since failover_newserver() 4326 * code guarantees that there will be only one failover thread 4327 * per mountinfo at any instance. 4328 */ 4329 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr, 4330 "failover_thread"); 4331 4332 mutex_enter(&mi->mi_lock); 4333 while (mi->mi_readers) { 4334 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4335 cv_wait(&mi->mi_failover_cv, &mi->mi_lock); 4336 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4337 } 4338 mutex_exit(&mi->mi_lock); 4339 4340 tv.tv_sec = 2; 4341 tv.tv_usec = 0; 4342 4343 /* 4344 * Ping the null NFS procedure of every server in 4345 * the list until one responds. We always start 4346 * at the head of the list and always skip the one 4347 * that is current, since it's caused us a problem. 4348 */ 4349 while (svp == NULL) { 4350 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 4351 if (!oncethru && svp == mi->mi_curr_serv) 4352 continue; 4353 4354 /* 4355 * If the file system was forcibly umounted 4356 * while trying to do a failover, then just 4357 * give up on the failover. It won't matter 4358 * what the server is. 
                         */
                        if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
                                svp = NULL;
                                goto done;
                        }

                        error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
                            NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
                        if (error)
                                continue;

                        if (!(mi->mi_flags & MI_INT))
                                cl->cl_nosignal = TRUE;
                        status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
                            xdr_void, NULL, tv);
                        if (!(mi->mi_flags & MI_INT))
                                cl->cl_nosignal = FALSE;
                        AUTH_DESTROY(cl->cl_auth);
                        CLNT_DESTROY(cl);
                        if (status == RPC_SUCCESS) {
                                if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
                                        zcmn_err(zoneid, CE_NOTE,
                        "NFS%d: failing over: selecting original server %s",
                                            mi->mi_vers, svp->sv_hostname);
#else
                                        zcmn_err(zoneid, CE_NOTE,
                        "NFS: failing over: selecting original server %s",
                                            svp->sv_hostname);
#endif
                                } else {
#ifdef DEBUG
                                        zcmn_err(zoneid, CE_NOTE,
                                            "NFS%d: failing over from %s to %s",
                                            mi->mi_vers,
                                            mi->mi_curr_serv->sv_hostname,
                                            svp->sv_hostname);
#else
                                        zcmn_err(zoneid, CE_NOTE,
                                            "NFS: failing over from %s to %s",
                                            mi->mi_curr_serv->sv_hostname,
                                            svp->sv_hostname);
#endif
                                }
                                break;
                        }
                }

                if (svp == NULL) {
                        if (!oncethru) {
                                srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
                                zprintf(zoneid,
                                    "NFS%d servers %s not responding "
                                    "still trying\n", mi->mi_vers, srvnames);
#else
                                zprintf(zoneid, "NFS servers %s not responding "
                                    "still trying\n", srvnames);
#endif
                                oncethru = 1;
                        }
                        mutex_enter(&mi->mi_lock);
                        CALLB_CPR_SAFE_BEGIN(&cprinfo);
                        mutex_exit(&mi->mi_lock);
                        delay(hz);
                        mutex_enter(&mi->mi_lock);
                        CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
                        mutex_exit(&mi->mi_lock);
                }
        }

        if (oncethru) {
#ifdef DEBUG
                zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
                zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
        }

        if (svp != mi->mi_curr_serv) {
                (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
                index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
                rw_enter(&rtable[index].r_lock, RW_WRITER);
                rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
                    mi->mi_vfsp);
                if (rp != NULL) {
                        if (rp->r_flags & RHASHED)
                                rp_rmhash_locked(rp);
                        rw_exit(&rtable[index].r_lock);
                        rp->r_server = svp;
                        rp->r_fh = svp->sv_fhandle;
                        (void) nfs_free_data_reclaim(rp);
                        index = rtablehash(&rp->r_fh);
                        rp->r_hashq = &rtable[index];
                        rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
                        vn_exists(RTOV(rp));
                        rp_addhash(rp);
                        rw_exit(&rp->r_hashq->r_lock);
                        VN_RELE(RTOV(rp));
                } else
                        rw_exit(&rtable[index].r_lock);
        }

done:
        if (oncethru)
                kmem_free(srvnames, srvnames_len);
        mutex_enter(&mi->mi_lock);
        mi->mi_flags &= ~MI_BINDINPROG;
        if (svp != NULL) {
                mi->mi_curr_serv = svp;
                mi->mi_failover++;
#ifdef DEBUG
                nfscl->nfscl_stat.failover.value.ui64++;
#endif
        }
        cv_broadcast(&mi->mi_failover_cv);
        CALLB_CPR_EXIT(&cprinfo);
        VFS_RELE(mi->mi_vfsp);
        zthread_exit();
        /* NOTREACHED */
}

/*
 * NFS client failover support
 *
 * failover_wait() will put the thread to sleep until MI_BINDINPROG
 * is cleared, meaning that failover is complete.  Called with
 * mi_lock mutex held.
 */
static int
failover_wait(mntinfo_t *mi)
{
        k_sigset_t smask;

        /*
         * If someone else is hunting for a living server,
         * sleep until it's done.  After our sleep, we may
         * be bound to the right server and get off cheaply.
         */
        while (mi->mi_flags & MI_BINDINPROG) {
                /*
                 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
                 * and SIGTERM, preserving the existing masks.  SIGINT is
                 * also masked out if the mount option nointr is specified.
                 */
                sigintr(&smask, (int)mi->mi_flags & MI_INT);
                if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
                        /*
                         * restore original signal mask
                         */
                        sigunintr(&smask);
                        return (EINTR);
                }
                /*
                 * restore original signal mask
                 */
                sigunintr(&smask);
        }
        return (0);
}
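
/*
 * Illustrative sketch (not part of the original source): a caller that must
 * block until a binding is re-established is expected to hold mi_lock and
 * wait roughly as follows; the EINTR handling shown here is an assumption
 * about how such a caller might react:
 *
 *      mutex_enter(&mi->mi_lock);
 *      if (mi->mi_flags & MI_BINDINPROG) {
 *              if (failover_wait(mi) == EINTR) {
 *                      mutex_exit(&mi->mi_lock);
 *                      return (EINTR);
 *              }
 *      }
 *      mutex_exit(&mi->mi_lock);
 */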

/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *   pointed to by the fi->fhp pointer if it is non-NULL.
 */

static int
failover_remap(failinfo_t *fi)
{
        vnode_t *vp, *nvp, *rootvp;
        rnode_t *rp, *nrp;
        mntinfo_t *mi;
        int error;
#ifdef DEBUG
        struct nfs_clnt *nfscl;

        nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
        ASSERT(nfscl != NULL);
#endif
        /*
         * Sanity check
         */
        if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
                return (EINVAL);
        vp = fi->vp;
        rp = VTOR(vp);
        mi = VTOMI(vp);

        if (!(vp->v_flag & VROOT)) {
                /*
                 * Given the root fh, use the path stored in
                 * the rnode to find the fh for the new server.
                 */
                error = VFS_ROOT(mi->mi_vfsp, &rootvp);
                if (error)
                        return (error);

                error = failover_lookup(rp->r_path, rootvp,
                    fi->lookupproc, fi->xattrdirproc, &nvp);

                VN_RELE(rootvp);

                if (error)
                        return (error);

                /*
                 * If we found the same rnode, we're done now
                 */
                if (nvp == vp) {
                        /*
                         * The new server may be physically the same machine,
                         * or may share the same disk subsystem.  In that case
                         * the filehandle for a given file path does not
                         * change, so the same filehandle lookup will always
                         * locate the same rnode as the existing one.  All we
                         * may need to do is update r_server with the current
                         * servinfo.
                         */
                        if (!VALID_FH(fi)) {
                                rp->r_server = mi->mi_curr_serv;
                        }
                        VN_RELE(nvp);
                        return (0);
                }

                /*
                 * Try to make it so that no one else will find this
                 * vnode because it is just a temporary to hold the
                 * new file handle until that file handle can be
                 * copied to the original vnode/rnode.
                 */
                nrp = VTOR(nvp);
                mutex_enter(&mi->mi_remap_lock);
                /*
                 * Some other thread may have raced in and already done the
                 * remap for this particular rnode.  Check rp->r_server
                 * against mi->mi_curr_serv and return if they are the same.
                 */
                if (VALID_FH(fi)) {
                        mutex_exit(&mi->mi_remap_lock);
                        VN_RELE(nvp);
                        return (0);
                }

                if (nrp->r_flags & RHASHED)
                        rp_rmhash(nrp);

                /*
                 * As a heuristic check on the validity of the new
                 * file, check that the size and type match those
                 * we remember from the old version.
                 */
                if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
                        mutex_exit(&mi->mi_remap_lock);
                        zcmn_err(mi->mi_zone->zone_id, CE_WARN,
                            "NFS replicas %s and %s: file %s not same.",
                            rp->r_server->sv_hostname,
                            nrp->r_server->sv_hostname, rp->r_path);
                        VN_RELE(nvp);
                        return (EINVAL);
                }

                /*
                 * Snarf the filehandle from the new rnode, updating the
                 * hash queues for the old rnode along the way; the
                 * temporary vnode is released below.
                 */
                if (rp->r_flags & RHASHED)
                        rp_rmhash(rp);
                rp->r_server = mi->mi_curr_serv;
                rp->r_fh = nrp->r_fh;
                rp->r_hashq = nrp->r_hashq;
                /*
                 * Copy the attributes from the new rnode to the old
                 * rnode.  This will help to reduce unnecessary page
                 * cache flushes.
                 */
                rp->r_attr = nrp->r_attr;
                rp->r_attrtime = nrp->r_attrtime;
                rp->r_mtime = nrp->r_mtime;
                (void) nfs_free_data_reclaim(rp);
                nfs_setswaplike(vp, &rp->r_attr);
                rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
                rp_addhash(rp);
                rw_exit(&rp->r_hashq->r_lock);
                mutex_exit(&mi->mi_remap_lock);
                VN_RELE(nvp);
        }

        /*
         * Update successful failover remap count
         */
        mutex_enter(&mi->mi_lock);
        mi->mi_remap++;
        mutex_exit(&mi->mi_lock);
#ifdef DEBUG
        nfscl->nfscl_stat.remap.value.ui64++;
#endif

        /*
         * If we have a copied filehandle to update, do it now.
         */
        if (fi->fhp != NULL && fi->copyproc != NULL)
                (*fi->copyproc)(fi->fhp, vp);

        return (0);
}

/*
 * NFS client failover support
 *
 * We want a simple pathname lookup routine to parse the pieces
 * of path in rp->r_path.  We know that the path was created
 * as rnodes were made, so we know we have only to deal with
 * paths that look like:
 *      dir1/dir2/dir3/file
 * Any evidence of anything like .., symlinks, or ENOTDIR
 * is a hard error, because it means something in this filesystem
 * is different from the one we came from, or has changed under
 * us in some way.  If this is true, we want the failure.
 *
 * Extended attributes: if the filesystem is mounted with extended
 * attributes enabled (-o xattr), the attribute directory will be
 * represented in the r_path as the magic name XATTR_RPATH.  So if
 * we see that name in the pathname, it must be because this node
 * is an extended attribute.  Therefore, look it up that way.
 */
static int
failover_lookup(char *path, vnode_t *root,
    int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
    vnode_t *, cred_t *, int),
    int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
    vnode_t **new)
{
        vnode_t *dvp, *nvp;
        int error = EINVAL;
        char *s, *p, *tmppath;
        size_t len;
        mntinfo_t *mi;
        bool_t xattr;

        /* Make local copy of path */
        len = strlen(path) + 1;
        tmppath = kmem_alloc(len, KM_SLEEP);
        (void) strcpy(tmppath, path);
        s = tmppath;

        dvp = root;
        VN_HOLD(dvp);
        mi = VTOMI(root);
        xattr = mi->mi_flags & MI_EXTATTR;

        do {
                p = strchr(s, '/');
                if (p != NULL)
                        *p = '\0';
                if (xattr && strcmp(s, XATTR_RPATH) == 0) {
                        error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
                            RFSCALL_SOFT);
                } else {
                        error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
                            CRED(), RFSCALL_SOFT);
                }
                if (p != NULL)
                        *p++ = '/';
                if (error) {
                        VN_RELE(dvp);
                        kmem_free(tmppath, len);
                        return (error);
                }
                s = p;
                VN_RELE(dvp);
                dvp = nvp;
        } while (p != NULL);

        if (nvp != NULL && new != NULL)
                *new = nvp;
        kmem_free(tmppath, len);
        return (0);
}

/*
 * NFS client failover support
 *
 * sv_free() frees the malloc'd portion of a "servinfo_t".
 */
void
sv_free(servinfo_t *svp)
{
        servinfo_t *next;
        struct knetconfig *knconf;

        while (svp != NULL) {
                next = svp->sv_next;
                if (svp->sv_secdata)
                        sec_clnt_freeinfo(svp->sv_secdata);
                if (svp->sv_hostname && svp->sv_hostnamelen > 0)
                        kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
                knconf = svp->sv_knconf;
                if (knconf != NULL) {
                        if (knconf->knc_protofmly != NULL)
                                kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
                        if (knconf->knc_proto != NULL)
                                kmem_free(knconf->knc_proto, KNC_STRSIZE);
                        kmem_free(knconf, sizeof (*knconf));
                }
                knconf = svp->sv_origknconf;
                if (knconf != NULL) {
                        if (knconf->knc_protofmly != NULL)
                                kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
                        if (knconf->knc_proto != NULL)
                                kmem_free(knconf->knc_proto, KNC_STRSIZE);
                        kmem_free(knconf, sizeof (*knconf));
                }
                if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
                        kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
                mutex_destroy(&svp->sv_lock);
                kmem_free(svp, sizeof (*svp));
                svp = next;
        }
}

/*
 * Can only return non-zero if intr != 0.
 */
int
nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
{

        mutex_enter(&l->lock);

        /*
         * If this is a nested enter, then allow it.  There
         * must be as many exits as enters, though.
         */
        if (l->owner == curthread) {
                /* lock is held for writing by current thread */
                ASSERT(rw == RW_READER || rw == RW_WRITER);
                l->count--;
        } else if (rw == RW_READER) {
                /*
                 * While there is a writer active or writers are waiting,
                 * wait for them to finish and move on.  Then increment
                 * the count to indicate that a reader is active.
                 */
                while (l->count < 0 || l->waiters > 0) {
                        if (intr) {
                                klwp_t *lwp = ttolwp(curthread);

                                if (lwp != NULL)
                                        lwp->lwp_nostop++;
                                if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
                                        if (lwp != NULL)
                                                lwp->lwp_nostop--;
                                        mutex_exit(&l->lock);
                                        return (EINTR);
                                }
                                if (lwp != NULL)
                                        lwp->lwp_nostop--;
                        } else
                                cv_wait(&l->cv_rd, &l->lock);
                }
                ASSERT(l->count < INT_MAX);
#ifdef DEBUG
                if ((l->count % 10000) == 9999)
                        cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
                            "rwlock @ %p\n", l->count, (void *)l);
#endif
                l->count++;
        } else {
                ASSERT(rw == RW_WRITER);
                /*
                 * While there are readers active or a writer
                 * active, wait for all of the readers
                 * to finish or for the writer to finish.
                 * Then, set the owner field to curthread and
                 * decrement count to indicate that a writer
                 * is active.
                 */
                while (l->count != 0) {
                        l->waiters++;
                        if (intr) {
                                klwp_t *lwp = ttolwp(curthread);

                                if (lwp != NULL)
                                        lwp->lwp_nostop++;
                                if (cv_wait_sig(&l->cv, &l->lock) == 0) {
                                        if (lwp != NULL)
                                                lwp->lwp_nostop--;
                                        l->waiters--;
                                        /*
                                         * If there are readers active and no
                                         * writers waiting then wake up all of
                                         * the waiting readers (if any).
                                         */
                                        if (l->count > 0 && l->waiters == 0)
                                                cv_broadcast(&l->cv_rd);
                                        mutex_exit(&l->lock);
                                        return (EINTR);
                                }
                                if (lwp != NULL)
                                        lwp->lwp_nostop--;
                        } else
                                cv_wait(&l->cv, &l->lock);
                        l->waiters--;
                }
                ASSERT(l->owner == NULL);
                l->owner = curthread;
                l->count--;
        }

        mutex_exit(&l->lock);

        return (0);
}

/*
 * If the lock is available, obtain it and return non-zero.  If there is
 * already a conflicting lock, return 0 immediately.
 */

int
nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
{
        mutex_enter(&l->lock);

        /*
         * If this is a nested enter, then allow it.  There
         * must be as many exits as enters, though.
         */
        if (l->owner == curthread) {
                /* lock is held for writing by current thread */
                ASSERT(rw == RW_READER || rw == RW_WRITER);
                l->count--;
        } else if (rw == RW_READER) {
                /*
                 * If there is a writer active or writers waiting, deny the
                 * lock.  Otherwise, bump the count of readers.
                 */
                if (l->count < 0 || l->waiters > 0) {
                        mutex_exit(&l->lock);
                        return (0);
                }
                l->count++;
        } else {
                ASSERT(rw == RW_WRITER);
                /*
                 * If there are readers active or a writer active, deny the
                 * lock.  Otherwise, set the owner field to curthread and
                 * decrement count to indicate that a writer is active.
                 */
                if (l->count != 0) {
                        mutex_exit(&l->lock);
                        return (0);
                }
                ASSERT(l->owner == NULL);
                l->owner = curthread;
                l->count--;
        }

        mutex_exit(&l->lock);

        return (1);
}

void
nfs_rw_exit(nfs_rwlock_t *l)
{

        mutex_enter(&l->lock);

        if (l->owner != NULL) {
                ASSERT(l->owner == curthread);

                /*
                 * To release a writer lock, increment count to indicate that
                 * there is one less writer active.  If this was the last of
                 * possibly nested writer locks, then clear the owner field as
                 * well to indicate that there is no writer active.
                 */
                ASSERT(l->count < 0);
                l->count++;
                if (l->count == 0) {
                        l->owner = NULL;

                        /*
                         * If there are no writers waiting, then wake up all
                         * of the waiting readers (if any).
                         */
                        if (l->waiters == 0)
                                cv_broadcast(&l->cv_rd);
                }
        } else {
                /*
                 * To release a reader lock, just decrement count to indicate
                 * that there is one less reader active.
                 */
                ASSERT(l->count > 0);
                l->count--;
        }

        /*
         * If there is neither a reader nor a writer active and there is a
         * writer waiting, we need to wake it up.
         */
        if (l->count == 0 && l->waiters > 0)
                cv_signal(&l->cv);
        mutex_exit(&l->lock);
}

int
nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
{

        if (rw == RW_READER)
                return (l->count > 0);
        ASSERT(rw == RW_WRITER);
        return (l->count < 0);
}

/* ARGSUSED */
void
nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
{

        l->count = 0;
        l->waiters = 0;
        l->owner = NULL;
        mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
        cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
}

void
nfs_rw_destroy(nfs_rwlock_t *l)
{

        mutex_destroy(&l->lock);
        cv_destroy(&l->cv);
        cv_destroy(&l->cv_rd);
}
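
/*
 * Illustrative usage sketch (not part of the original source): callers pair
 * nfs_rw_enter_sig() with nfs_rw_exit() around rnode operations.  The INTR()
 * macro shown here is assumed to expand to the mount's MI_INT setting, as in
 * nfs_clnt.h:
 *
 *      if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, INTR(vp)))
 *              return (EINTR);
 *      ... read-side work on the rnode ...
 *      nfs_rw_exit(&rp->r_rwlock);
 */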

int
nfs3_rddir_compar(const void *x, const void *y)
{
        rddir_cache *a = (rddir_cache *)x;
        rddir_cache *b = (rddir_cache *)y;

        if (a->nfs3_cookie == b->nfs3_cookie) {
                if (a->buflen == b->buflen)
                        return (0);
                if (a->buflen < b->buflen)
                        return (-1);
                return (1);
        }

        if (a->nfs3_cookie < b->nfs3_cookie)
                return (-1);

        return (1);
}

int
nfs_rddir_compar(const void *x, const void *y)
{
        rddir_cache *a = (rddir_cache *)x;
        rddir_cache *b = (rddir_cache *)y;

        if (a->nfs_cookie == b->nfs_cookie) {
                if (a->buflen == b->buflen)
                        return (0);
                if (a->buflen < b->buflen)
                        return (-1);
                return (1);
        }

        if (a->nfs_cookie < b->nfs_cookie)
                return (-1);

        return (1);
}

static char *
nfs_getsrvnames(mntinfo_t *mi, size_t *len)
{
        servinfo_t *s;
        char *srvnames;
        char *namep;
        size_t length;

        /*
         * Calculate the length of the string required to hold all
         * of the server names plus either a comma or a null
         * character following each individual one.
         */
        length = 0;
        for (s = mi->mi_servers; s != NULL; s = s->sv_next)
                length += s->sv_hostnamelen;

        srvnames = kmem_alloc(length, KM_SLEEP);

        namep = srvnames;
        for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
                (void) strcpy(namep, s->sv_hostname);
                namep += s->sv_hostnamelen - 1;
                *namep++ = ',';
        }
        *--namep = '\0';

        *len = length;

        return (srvnames);
}

/*
 * These two functions are temporary and designed for the upgrade-workaround
 * only.  They cannot be used for general zone-crossing NFS client support, and
 * will be removed shortly.
 *
 * When the workaround is enabled, all NFS traffic is forced into the global
 * zone.  These functions are called when the code needs to refer to the state
 * of the underlying network connection.  They're not called when the function
 * needs to refer to the state of the process that invoked the system call.
 * (E.g., when checking whether the zone is shutting down during the mount()
 * call.)
 */

struct zone *
nfs_zone(void)
{
        return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
}

zoneid_t
nfs_zoneid(void)
{
        return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
}

/*
 * nfs_mount_label_policy:
 *      Determine whether the mount is allowed according to a MAC check,
 *      by comparing (where appropriate) the label of the remote server
 *      against the label of the zone being mounted into.
 *
 * Returns:
 *       0 :    access allowed
 *      -1 :    read-only access allowed (i.e., read-down)
 *      >0 :    error code, such as EACCES
 */
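/*
 * Illustrative caller sketch (not part of the original source): the mount
 * path is assumed to consume the three-way result roughly like this, falling
 * back to a read-only mount for the read-down case:
 *
 *      error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
 *          svp->sv_knconf, cr);
 *      if (error > 0)
 *              return (EACCES);
 *      if (error == -1)
 *              vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 */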
int
nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
    struct knetconfig *knconf, cred_t *cr)
{
        int addr_type;
        void *ipaddr;
        bslabel_t *server_sl, *mntlabel;
        zone_t *mntzone = NULL;
        ts_label_t *zlabel;
        tsol_tpc_t *tp;
        ts_label_t *tsl = NULL;
        int retv;

        /*
         * Get the zone's label.  Each zone on a labeled system has a label.
         */
        mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
        zlabel = mntzone->zone_slabel;
        ASSERT(zlabel != NULL);
        label_hold(zlabel);

        if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
                addr_type = IPV4_VERSION;
                ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
        } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
                addr_type = IPV6_VERSION;
                ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
        } else {
                retv = 0;
                goto out;
        }

        retv = EACCES;                          /* assume the worst */

        /*
         * Next, get the assigned label of the remote server.
         */
        tp = find_tpc(ipaddr, addr_type, B_FALSE);
        if (tp == NULL)
                goto out;                       /* error getting host entry */

        if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
                goto rel_tpc;                   /* invalid domain */
        if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
            (tp->tpc_tp.host_type != UNLABELED))
                goto rel_tpc;                   /* invalid hosttype */

        if (tp->tpc_tp.host_type == SUN_CIPSO) {
                tsl = getflabel_cipso(vfsp);
                if (tsl == NULL)
                        goto rel_tpc;           /* error getting server lbl */

                server_sl = label2bslabel(tsl);
        } else {                                /* UNLABELED */
                server_sl = &tp->tpc_tp.tp_def_label;
        }

        mntlabel = label2bslabel(zlabel);

        /*
         * Now compare labels to complete the MAC check.  If the labels
         * are equal or if the requestor is in the global zone and has
         * NET_MAC_AWARE, then allow read-write access.  (Except for
         * mounts into the global zone itself; restrict these to
         * read-only.)
         *
         * If the requestor is in some other zone, but its label
         * dominates the server's, then allow read-down.
         *
         * Otherwise, access is denied.
         */
        if (blequal(mntlabel, server_sl) ||
            (crgetzoneid(cr) == GLOBAL_ZONEID &&
            getpflags(NET_MAC_AWARE, cr) != 0)) {
                if ((mntzone == global_zone) ||
                    !blequal(mntlabel, server_sl))
                        retv = -1;      /* read-only */
                else
                        retv = 0;       /* access OK */
        } else if (bldominates(mntlabel, server_sl)) {
                retv = -1;      /* read-only */
        } else {
                retv = EACCES;
        }

        if (tsl != NULL)
                label_rele(tsl);

rel_tpc:
        TPC_RELE(tp);
out:
        if (mntzone)
                zone_rele(mntzone);
        label_rele(zlabel);
        return (retv);
}

boolean_t
nfs_has_ctty(void)
{
        boolean_t rv;
        mutex_enter(&curproc->p_splock);
        rv = (curproc->p_sessp->s_vp != NULL);
        mutex_exit(&curproc->p_splock);
        return (rv);
}

/*
 * Look in the xattr directory to see if it has any generic user attributes.
 */
int
do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
{
        struct uio uio;
        struct iovec iov;
        char *dbuf;
        struct dirent64 *dp;
        size_t dlen = 8 * 1024;
        size_t dbuflen;
        int eof = 0;
        int error;

        *valp = 0;
        dbuf = kmem_alloc(dlen, KM_SLEEP);
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_fmode = 0;
        uio.uio_extflg = UIO_COPY_CACHED;
        uio.uio_loffset = 0;
        uio.uio_resid = dlen;
        iov.iov_base = dbuf;
        iov.iov_len = dlen;
        (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
        error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

        dbuflen = dlen - uio.uio_resid;

        if (error || dbuflen == 0) {
                kmem_free(dbuf, dlen);
                return (error);
        }

        dp = (dirent64_t *)dbuf;

        while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
                if (strcmp(dp->d_name, ".") == 0 ||
                    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
                    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
                    VIEW_READONLY) == 0) {
                        dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
                        continue;
                }

                *valp = 1;
                break;
        }
        kmem_free(dbuf, dlen);
        return (0);
}
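
/*
 * Illustrative sketch (not part of the original source): a pathconf-style
 * caller that has already looked up the extended attribute directory vnode
 * `avp` might use the check above roughly as follows; the surrounding lookup
 * and error handling are assumptions:
 *
 *      ulong_t val;
 *
 *      error = do_xattr_exists_check(avp, &val, cr);
 *      if (error == 0 && val != 0)
 *              ... the file has at least one generic user attribute ...
 */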