/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The hash queues for the access to active and cached rnodes * are organized as doubly linked lists. A reader/writer lock * for each hash bucket is used to control access and to synchronize * lookups, additions, and deletions from the hash queue. * * The rnode freelist is organized as a doubly linked list with * a head pointer. Additions and deletions are synchronized via * a single mutex. * * In order to add an rnode to the free list, it must be hashed into * a hash queue and the exclusive lock to the hash queue be held. 
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information that can be reused
 * about the file.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */

/*
 * rnode bookkeeping.  Presumably rtable holds the hash queues and
 * rpfreelist/rpfreelist_lock are the freelist head and its mutex described
 * in the comment above -- confirm against the code that uses them, which is
 * outside this part of the file.
 */
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * client side statistics
 *
 * Template of named kstat counters; the last four entries exist only in
 * DEBUG builds.
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system as a
 * whole and don't correspond to any one particular zone.
 *
 * NOTE(review): every counter below is published under its field name except
 * rpath, whose kstat name is "r_path".  The string is visible to kstat
 * consumers, so it is left unchanged here.
 */
#ifdef DEBUG
static struct clstat_debug {
	kstat_named_t	nrnode;			/* number of allocated rnodes */
	kstat_named_t	access;			/* size of access cache */
	kstat_named_t	dirent;			/* size of readdir cache */
	kstat_named_t	dirents;		/* size of readdir buf cache */
	kstat_named_t	reclaim;		/* number of reclaims */
	kstat_named_t	clreclaim;		/* number of cl reclaims */
	kstat_named_t	f_reclaim;		/* number of free reclaims */
	kstat_named_t	a_reclaim;		/* number of active reclaims */
	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
	kstat_named_t	rpath;			/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */

/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 */
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;

static struct kmem_cache *chtab_cache;

/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system
 */
int nfs_disable_rddir_cache = 0;

/*
 * Forward declarations for the routines defined in this file.
 */
int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int	failover_remap(failinfo_t *);
static int	failover_lookup(char *, vnode_t *,
		    int (*)(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *, int),
		    int (*)(vnode_t *, vnode_t **,
bool_t, cred_t *, int), vnode_t **); static void nfs_free_r_path(rnode_t *); static void nfs_set_vroot(vnode_t *); static char *nfs_getsrvnames(mntinfo_t *, size_t *); /* * from rpcsec module (common/rpcsec) */ extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); extern void sec_clnt_freeh(AUTH *); extern void sec_clnt_freeinfo(struct sec_data *); /* * used in mount policy */ extern ts_label_t *getflabel_cipso(vfs_t *); /* * EIO or EINTR are not recoverable errors. */ #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) #ifdef DEBUG #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n" #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n" #else #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n" #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n" #endif /* * Common handle get program for NFS, NFS ACL, and NFS AUTH client. */ static int clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, struct chtab **chp, struct nfs_clnt *nfscl) { struct chhead *ch, *newch; struct chhead **plistp; struct chtab *cp; int error; k_sigset_t smask; if (newcl == NULL || chp == NULL || ci == NULL) return (EINVAL); *newcl = NULL; *chp = NULL; /* * Find an unused handle or create one */ newch = NULL; nfscl->nfscl_stat.clgets.value.ui64++; top: /* * Find the correct entry in the cache to check for free * client handles. The search is based on the RPC program * number, program version number, dev_t for the transport * device, and the protocol family. 
*/ mutex_enter(&nfscl->nfscl_chtable_lock); plistp = &nfscl->nfscl_chtable; for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { if (ch->ch_prog == ci->cl_prog && ch->ch_vers == ci->cl_vers && ch->ch_dev == svp->sv_knconf->knc_rdev && (strcmp(ch->ch_protofmly, svp->sv_knconf->knc_protofmly) == 0)) break; plistp = &ch->ch_next; } /* * If we didn't find a cache entry for this quadruple, then * create one. If we don't have one already preallocated, * then drop the cache lock, create one, and then start over. * If we did have a preallocated entry, then just add it to * the front of the list. */ if (ch == NULL) { if (newch == NULL) { mutex_exit(&nfscl->nfscl_chtable_lock); newch = kmem_alloc(sizeof (*newch), KM_SLEEP); newch->ch_timesused = 0; newch->ch_prog = ci->cl_prog; newch->ch_vers = ci->cl_vers; newch->ch_dev = svp->sv_knconf->knc_rdev; newch->ch_protofmly = kmem_alloc( strlen(svp->sv_knconf->knc_protofmly) + 1, KM_SLEEP); (void) strcpy(newch->ch_protofmly, svp->sv_knconf->knc_protofmly); newch->ch_list = NULL; goto top; } ch = newch; newch = NULL; ch->ch_next = nfscl->nfscl_chtable; nfscl->nfscl_chtable = ch; /* * We found a cache entry, but if it isn't on the front of the * list, then move it to the front of the list to try to take * advantage of locality of operations. */ } else if (ch != nfscl->nfscl_chtable) { *plistp = ch->ch_next; ch->ch_next = nfscl->nfscl_chtable; nfscl->nfscl_chtable = ch; } /* * If there was a free client handle cached, then remove it * from the list, init it, and use it. 
*/ if (ch->ch_list != NULL) { cp = ch->ch_list; ch->ch_list = cp->ch_list; mutex_exit(&nfscl->nfscl_chtable_lock); if (newch != NULL) { kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); kmem_free(newch, sizeof (*newch)); } (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, &cp->ch_client->cl_auth); if (error || cp->ch_client->cl_auth == NULL) { CLNT_DESTROY(cp->ch_client); kmem_cache_free(chtab_cache, cp); return ((error != 0) ? error : EINTR); } ch->ch_timesused++; *newcl = cp->ch_client; *chp = cp; return (0); } /* * There weren't any free client handles which fit, so allocate * a new one and use that. */ #ifdef DEBUG atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64); #endif mutex_exit(&nfscl->nfscl_chtable_lock); nfscl->nfscl_stat.cltoomany.value.ui64++; if (newch != NULL) { kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); kmem_free(newch, sizeof (*newch)); } cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); cp->ch_head = ch; sigintr(&smask, (int)ci->cl_flags & MI_INT); error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); sigunintr(&smask); if (error != 0) { kmem_cache_free(chtab_cache, cp); #ifdef DEBUG atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64); #endif /* * Warning is unnecessary if error is EINTR. */ if (error != EINTR) { nfs_cmn_err(error, CE_WARN, "clget: couldn't create handle: %m\n"); } return (error); } (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); auth_destroy(cp->ch_client->cl_auth); error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, &cp->ch_client->cl_auth); if (error || cp->ch_client->cl_auth == NULL) { CLNT_DESTROY(cp->ch_client); kmem_cache_free(chtab_cache, cp); #ifdef DEBUG atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64); #endif return ((error != 0) ? 
error : EINTR); } ch->ch_timesused++; *newcl = cp->ch_client; ASSERT(cp->ch_client->cl_nosignal == FALSE); *chp = cp; return (0); } int clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, struct chtab **chp) { struct nfs_clnt *nfscl; nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); ASSERT(nfscl != NULL); return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); } static int acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, struct chtab **chp, struct nfs_clnt *nfscl) { clinfo_t ci; int error; /* * Set read buffer size to rsize * and add room for RPC headers. */ ci.cl_readsize = mi->mi_tsize; if (ci.cl_readsize != 0) ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); /* * If soft mount and server is down just try once. * meaning: do not retransmit. */ if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) ci.cl_retrans = 0; else ci.cl_retrans = mi->mi_retrans; ci.cl_prog = NFS_ACL_PROGRAM; ci.cl_vers = mi->mi_vers; ci.cl_flags = mi->mi_flags; /* * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS * security flavor, the client tries to establish a security context * by contacting the server. If the connection is timed out or reset, * e.g. server reboot, we will try again. */ do { error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); if (error == 0) break; /* * For forced unmount or zone shutdown, bail out, no retry. */ if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { error = EIO; break; } /* do not retry for softmount */ if (!(mi->mi_flags & MI_HARD)) break; /* let the caller deal with the failover case */ if (FAILOVER_MOUNT(mi)) break; } while (error == ETIMEDOUT || error == ECONNRESET); return (error); } static int nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, struct chtab **chp, struct nfs_clnt *nfscl) { clinfo_t ci; int error; /* * Set read buffer size to rsize * and add room for RPC headers. 
 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If soft mount and server is down just try once.
	 * meaning: do not retransmit.
	 */
	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = mi->mi_prog;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server. If the connection is timed out or reset,
	 * e.g. server reboot, we will try again.
	 */
	do {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount or zone shutdown, bail out, no retry.
		 */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			error = EIO;
			break;
		}

		/* do not retry for softmount */
		if (!(mi->mi_flags & MI_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT(mi))
			break;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}

/*
 * Return client handle cl (cache entry cp) to the free list of its cache
 * head in nfscl.  The auth handle, if any, is released first.
 */
static void
clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
{
	if (cl->cl_auth != NULL) {
		sec_clnt_freeh(cl->cl_auth);
		cl->cl_auth = NULL;
	}

	/*
	 * Timestamp this cache entry so that we know when it was last
	 * used.
	 */
	cp->ch_freed = gethrestime_sec();

	/*
	 * Add the free client handle to the front of the list.
	 * This way, the list will be sorted in youngest to oldest
	 * order.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	cp->ch_list = cp->ch_head->ch_list;
	cp->ch_head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable_lock);
}

/*
 * Public wrapper around clfree_impl() that uses the client data of the
 * zone returned by nfs_zone().
 */
void
clfree(CLIENT *cl, struct chtab *cp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	clfree_impl(cl, cp, nfscl);
}

#define	CL_HOLDTIME	60	/* time to hold client handles */

/*
 * Destroy every cached client handle in nfscl that was freed at least
 * cl_holdtime seconds ago.  The eligible entries are unlinked while holding
 * nfscl_chtable_lock and destroyed after the lock has been dropped.
 */
static void
clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp;	/* list of objects that can be reclaimed */
	struct chtab *cpe;
	struct chtab *cpl;
	struct chtab **cpp;
#ifdef DEBUG
	int n = 0;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.  When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older than
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry is found which is old enough, then
		 * all of the rest of the entries on the list will
		 * be old enough as well.
		 */
		cpl = ch->ch_list;
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			/* Cut the old tail off this head's free list ... */
			*cpp = NULL;
			if (cp != NULL) {
				/* ... and chain it in front of what we have. */
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		cpl = cp->ch_list;
		kmem_cache_free(chtab_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}

/*
 * Reclaim client handles held longer than CL_HOLDTIME seconds from every
 * entry on nfs_clnt_list.  The argument is unused.
 */
/* ARGSUSED */
static void
clreclaim(void *all)
{
	struct nfs_clnt *nfscl;

#ifdef DEBUG
	clstat_debug.clreclaim.value.ui64++;
#endif
	/*
	 * The system is low on memory; go through and try to reclaim some from
	 * every zone on the system.
	 */
	mutex_enter(&nfs_clnt_list_lock);
	nfscl = list_head(&nfs_clnt_list);
	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
		clreclaim_zone(nfscl, CL_HOLDTIME);
	mutex_exit(&nfs_clnt_list_lock);
}

/*
 * Minimum time-out values indexed by call type
 * These units are in "eighths" of a second to avoid multiplies
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */

/*
 * Function called when rfscall notices that we have been
 * re-transmitting, or when we get a response without retransmissions.
 * Return 1 if the transfer size was adjusted down - 0 if no change.
*/ static int nfs_feedback(int flag, int which, mntinfo_t *mi) { int kind; int r = 0; mutex_enter(&mi->mi_lock); if (flag == FEEDBACK_REXMIT1) { if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) goto done; if (mi->mi_curread > MIN_NFS_TSIZE) { mi->mi_curread /= 2; if (mi->mi_curread < MIN_NFS_TSIZE) mi->mi_curread = MIN_NFS_TSIZE; r = 1; } if (mi->mi_curwrite > MIN_NFS_TSIZE) { mi->mi_curwrite /= 2; if (mi->mi_curwrite < MIN_NFS_TSIZE) mi->mi_curwrite = MIN_NFS_TSIZE; r = 1; } } else if (flag == FEEDBACK_OK) { kind = mi->mi_timer_type[which]; if (kind == 0 || mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) goto done; if (kind == 1) { if (mi->mi_curread >= mi->mi_tsize) goto done; mi->mi_curread += MIN_NFS_TSIZE; if (mi->mi_curread > mi->mi_tsize/2) mi->mi_curread = mi->mi_tsize; } else if (kind == 2) { if (mi->mi_curwrite >= mi->mi_stsize) goto done; mi->mi_curwrite += MIN_NFS_TSIZE; if (mi->mi_curwrite > mi->mi_stsize/2) mi->mi_curwrite = mi->mi_stsize; } } done: mutex_exit(&mi->mi_lock); return (r); } #ifdef DEBUG static int rfs2call_hits = 0; static int rfs2call_misses = 0; #endif int rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, enum nfsstat *statusp, int flags, failinfo_t *fi) { int rpcerror; enum clnt_stat rpc_status; ASSERT(statusp != NULL); rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, cr, douprintf, &rpc_status, flags, fi); if (!rpcerror) { /* * See crnetadjust() for comments. 
*/ if (*statusp == NFSERR_ACCES && (cr = crnetadjust(cr)) != NULL) { #ifdef DEBUG rfs2call_hits++; #endif rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, cr, douprintf, NULL, flags, fi); crfree(cr); #ifdef DEBUG if (*statusp == NFSERR_ACCES) rfs2call_misses++; #endif } } else if (rpc_status == RPC_PROCUNAVAIL) { *statusp = NFSERR_OPNOTSUPP; rpcerror = 0; } return (rpcerror); } #define NFS3_JUKEBOX_DELAY 10 * hz static clock_t nfs3_jukebox_delay = 0; #ifdef DEBUG static int rfs3call_hits = 0; static int rfs3call_misses = 0; #endif int rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, nfsstat3 *statusp, int flags, failinfo_t *fi) { int rpcerror; int user_informed; user_informed = 0; do { rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, cr, douprintf, NULL, flags, fi); if (!rpcerror) { cred_t *crr; if (*statusp == NFS3ERR_JUKEBOX) { if (ttoproc(curthread) == &p0) { rpcerror = EAGAIN; break; } if (!user_informed) { user_informed = 1; uprintf( "file temporarily unavailable on the server, retrying...\n"); } delay(nfs3_jukebox_delay); } /* * See crnetadjust() for comments. 
*/ else if (*statusp == NFS3ERR_ACCES && (crr = crnetadjust(cr)) != NULL) { #ifdef DEBUG rfs3call_hits++; #endif rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, crr, douprintf, NULL, flags, fi); crfree(crr); #ifdef DEBUG if (*statusp == NFS3ERR_ACCES) rfs3call_misses++; #endif } } } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); return (rpcerror); } #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) #define INC_READERS(mi) { \ mi->mi_readers++; \ } #define DEC_READERS(mi) { \ mi->mi_readers--; \ if (mi->mi_readers == 0) \ cv_broadcast(&mi->mi_failover_cv); \ } static int rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, enum clnt_stat *rpc_status, int flags, failinfo_t *fi) { CLIENT *client; struct chtab *ch; cred_t *cr = icr; enum clnt_stat status; struct rpc_err rpcerr, rpcerr_tmp; struct timeval wait; int timeo; /* in units of hz */ int my_rsize, my_wsize; bool_t tryagain; bool_t cred_cloned = FALSE; k_sigset_t smask; servinfo_t *svp; struct nfs_clnt *nfscl; zoneid_t zoneid = getzoneid(); char *msg; #ifdef DEBUG char *bufp; #endif TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, "rfscall_start:which %d mi %p", which, mi); nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); ASSERT(nfscl != NULL); nfscl->nfscl_stat.calls.value.ui64++; mi->mi_reqs[which].value.ui64++; rpcerr.re_status = RPC_SUCCESS; /* * In case of forced unmount or zone shutdown, return EIO. */ if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { rpcerr.re_status = RPC_FAILED; rpcerr.re_errno = EIO; return (rpcerr.re_errno); } /* * Remember the transfer sizes in case * nfs_feedback changes them underneath us. */ my_rsize = mi->mi_curread; my_wsize = mi->mi_curwrite; /* * NFS client failover support * * If this rnode is not in sync with the current server (VALID_FH), * we'd like to do a remap to get in sync. We can be interrupted * in failover_remap(), and if so we'll bail. 
Otherwise, we'll * use the best info we have to try the RPC. Part of that is * unconditionally updating the filehandle copy kept for V3. * * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible * rw_enter(); we're trying to keep the current server from being * changed on us until we're done with the remapping and have a * matching client handle. We don't want to sending a filehandle * to the wrong host. */ failoverretry: if (FAILOVER_MOUNT(mi)) { mutex_enter(&mi->mi_lock); if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { if (failover_wait(mi)) { mutex_exit(&mi->mi_lock); return (EINTR); } } INC_READERS(mi); mutex_exit(&mi->mi_lock); if (fi) { if (!VALID_FH(fi) && !(flags & RFSCALL_SOFT) && failover_safe(fi)) { int remaperr; svp = mi->mi_curr_serv; remaperr = failover_remap(fi); if (remaperr != 0) { #ifdef DEBUG if (remaperr != EINTR) nfs_cmn_err(remaperr, CE_WARN, "rfscall couldn't failover: %m"); #endif mutex_enter(&mi->mi_lock); DEC_READERS(mi); mutex_exit(&mi->mi_lock); /* * If failover_remap returns ETIMEDOUT * and the filesystem is hard mounted * we have to retry the call with a new * server. */ if ((mi->mi_flags & MI_HARD) && IS_RECOVERABLE_ERROR(remaperr)) { if (svp == mi->mi_curr_serv) failover_newserver(mi); rpcerr.re_status = RPC_SUCCESS; goto failoverretry; } rpcerr.re_errno = remaperr; return (remaperr); } } if (fi->fhp && fi->copyproc) (*fi->copyproc)(fi->fhp, fi->vp); } } /* For TSOL, use a new cred which has net_mac_aware flag */ if (!cred_cloned && is_system_labeled()) { cred_cloned = TRUE; cr = crdup(icr); (void) setpflags(NET_MAC_AWARE, 1, cr); } /* * clget() calls clnt_tli_kinit() which clears the xid, so we * are guaranteed to reprocess the retry as a new request. 
*/ svp = mi->mi_curr_serv; rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); if (FAILOVER_MOUNT(mi)) { mutex_enter(&mi->mi_lock); DEC_READERS(mi); mutex_exit(&mi->mi_lock); if ((rpcerr.re_errno == ETIMEDOUT || rpcerr.re_errno == ECONNRESET) && failover_safe(fi)) { if (svp == mi->mi_curr_serv) failover_newserver(mi); goto failoverretry; } } if (rpcerr.re_errno != 0) return (rpcerr.re_errno); if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || svp->sv_knconf->knc_semantics == NC_TPI_COTS) { timeo = (mi->mi_timeo * hz) / 10; } else { mutex_enter(&mi->mi_lock); timeo = CLNT_SETTIMERS(client, &(mi->mi_timers[mi->mi_timer_type[which]]), &(mi->mi_timers[NFS_CALLTYPES]), (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, (void (*)())NULL, (caddr_t)mi, 0); mutex_exit(&mi->mi_lock); } /* * If hard mounted fs, retry call forever unless hard error occurs. */ do { tryagain = FALSE; if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { status = RPC_FAILED; rpcerr.re_status = RPC_FAILED; rpcerr.re_errno = EIO; break; } TICK_TO_TIMEVAL(timeo, &wait); /* * Mask out all signals except SIGHUP, SIGINT, SIGQUIT * and SIGTERM. (Preserving the existing masks). * Mask out SIGINT if mount option nointr is specified. */ sigintr(&smask, (int)mi->mi_flags & MI_INT); if (!(mi->mi_flags & MI_INT)) client->cl_nosignal = TRUE; /* * If there is a current signal, then don't bother * even trying to send out the request because we * won't be able to block waiting for the response. * Simply assume RPC_INTR and get on with it. 
*/ if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) status = RPC_INTR; else { status = CLNT_CALL(client, which, xdrargs, argsp, xdrres, resp, wait); } if (!(mi->mi_flags & MI_INT)) client->cl_nosignal = FALSE; /* * restore original signal mask */ sigunintr(&smask); switch (status) { case RPC_SUCCESS: if ((mi->mi_flags & MI_DYNAMIC) && mi->mi_timer_type[which] != 0 && (mi->mi_curread != my_rsize || mi->mi_curwrite != my_wsize)) (void) nfs_feedback(FEEDBACK_OK, which, mi); break; case RPC_INTR: /* * There is no way to recover from this error, * even if mount option nointr is specified. * SIGKILL, for example, cannot be blocked. */ rpcerr.re_status = RPC_INTR; rpcerr.re_errno = EINTR; break; case RPC_UDERROR: /* * If the NFS server is local (vold) and * it goes away then we get RPC_UDERROR. * This is a retryable error, so we would * loop, so check to see if the specific * error was ECONNRESET, indicating that * target did not exist at all. If so, * return with RPC_PROGUNAVAIL and * ECONNRESET to indicate why. */ CLNT_GETERR(client, &rpcerr); if (rpcerr.re_errno == ECONNRESET) { rpcerr.re_status = RPC_PROGUNAVAIL; rpcerr.re_errno = ECONNRESET; break; } /*FALLTHROUGH*/ default: /* probably RPC_TIMEDOUT */ if (IS_UNRECOVERABLE_RPC(status)) break; /* * increment server not responding count */ mutex_enter(&mi->mi_lock); mi->mi_noresponse++; mutex_exit(&mi->mi_lock); #ifdef DEBUG nfscl->nfscl_stat.noresponse.value.ui64++; #endif if (!(mi->mi_flags & MI_HARD)) { if (!(mi->mi_flags & MI_SEMISOFT) || (mi->mi_ss_call_type[which] == 0)) break; } /* * The call is in progress (over COTS). * Try the CLNT_CALL again, but don't * print a noisy error message. */ if (status == RPC_INPROGRESS) { tryagain = TRUE; break; } if (flags & RFSCALL_SOFT) break; /* * On zone shutdown, just move on. 
*/ if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) { rpcerr.re_status = RPC_FAILED; rpcerr.re_errno = EIO; break; } /* * NFS client failover support * * If the current server just failed us, we'll * start the process of finding a new server. * After that, we can just retry. */ if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { if (svp == mi->mi_curr_serv) failover_newserver(mi); clfree_impl(client, ch, nfscl); goto failoverretry; } tryagain = TRUE; timeo = backoff(timeo); CLNT_GETERR(client, &rpcerr_tmp); if ((status == RPC_CANTSEND) && (rpcerr_tmp.re_errno == ENOBUFS)) msg = SRV_QFULL_MSG; else msg = SRV_NOTRESP_MSG; mutex_enter(&mi->mi_lock); if (!(mi->mi_flags & MI_PRINTED)) { mi->mi_flags |= MI_PRINTED; mutex_exit(&mi->mi_lock); #ifdef DEBUG zprintf(zoneid, msg, mi->mi_vers, svp->sv_hostname); #else zprintf(zoneid, msg, svp->sv_hostname); #endif } else mutex_exit(&mi->mi_lock); if (*douprintf && nfs_has_ctty()) { *douprintf = 0; if (!(mi->mi_flags & MI_NOPRINT)) #ifdef DEBUG uprintf(msg, mi->mi_vers, svp->sv_hostname); #else uprintf(msg, svp->sv_hostname); #endif } /* * If doing dynamic adjustment of transfer * size and if it's a read or write call * and if the transfer size changed while * retransmitting or if the feedback routine * changed the transfer size, * then exit rfscall so that the transfer * size can be adjusted at the vnops level. */ if ((mi->mi_flags & MI_DYNAMIC) && mi->mi_timer_type[which] != 0 && (mi->mi_curread != my_rsize || mi->mi_curwrite != my_wsize || nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { /* * On read or write calls, return * back to the vnode ops level if * the transfer size changed. */ clfree_impl(client, ch, nfscl); if (cred_cloned) crfree(cr); return (ENFS_TRYAGAIN); } } } while (tryagain); if (status != RPC_SUCCESS) { /* * Let soft mounts use the timed out message. 
*/ if (status == RPC_INPROGRESS) status = RPC_TIMEDOUT; nfscl->nfscl_stat.badcalls.value.ui64++; if (status != RPC_INTR) { mutex_enter(&mi->mi_lock); mi->mi_flags |= MI_DOWN; mutex_exit(&mi->mi_lock); CLNT_GETERR(client, &rpcerr); #ifdef DEBUG bufp = clnt_sperror(client, svp->sv_hostname); zprintf(zoneid, "NFS%d %s failed for %s\n", mi->mi_vers, mi->mi_rfsnames[which], bufp); if (nfs_has_ctty()) { if (!(mi->mi_flags & MI_NOPRINT)) { uprintf("NFS%d %s failed for %s\n", mi->mi_vers, mi->mi_rfsnames[which], bufp); } } kmem_free(bufp, MAXPATHLEN); #else zprintf(zoneid, "NFS %s failed for server %s: error %d (%s)\n", mi->mi_rfsnames[which], svp->sv_hostname, status, clnt_sperrno(status)); if (nfs_has_ctty()) { if (!(mi->mi_flags & MI_NOPRINT)) { uprintf( "NFS %s failed for server %s: error %d (%s)\n", mi->mi_rfsnames[which], svp->sv_hostname, status, clnt_sperrno(status)); } } #endif /* * when CLNT_CALL() fails with RPC_AUTHERROR, * re_errno is set appropriately depending on * the authentication error */ if (status == RPC_VERSMISMATCH || status == RPC_PROGVERSMISMATCH) rpcerr.re_errno = EIO; } } else { /* * Test the value of mi_down and mi_printed without * holding the mi_lock mutex. If they are both zero, * then it is okay to skip the down and printed * processing. This saves on a mutex_enter and * mutex_exit pair for a normal, successful RPC. * This was just complete overhead. 
*/ if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { mutex_enter(&mi->mi_lock); mi->mi_flags &= ~MI_DOWN; if (mi->mi_flags & MI_PRINTED) { mi->mi_flags &= ~MI_PRINTED; mutex_exit(&mi->mi_lock); #ifdef DEBUG if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) zprintf(zoneid, "NFS%d server %s ok\n", mi->mi_vers, svp->sv_hostname); #else if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) zprintf(zoneid, "NFS server %s ok\n", svp->sv_hostname); #endif } else mutex_exit(&mi->mi_lock); } if (*douprintf == 0) { if (!(mi->mi_flags & MI_NOPRINT)) #ifdef DEBUG if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) uprintf("NFS%d server %s ok\n", mi->mi_vers, svp->sv_hostname); #else if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) uprintf("NFS server %s ok\n", svp->sv_hostname); #endif *douprintf = 1; } } clfree_impl(client, ch, nfscl); if (cred_cloned) crfree(cr); ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); if (rpc_status != NULL) *rpc_status = rpcerr.re_status; TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", rpcerr.re_errno); return (rpcerr.re_errno); } #ifdef DEBUG static int acl2call_hits = 0; static int acl2call_misses = 0; #endif int acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, enum nfsstat *statusp, int flags, failinfo_t *fi) { int rpcerror; rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, cr, douprintf, flags, fi); if (!rpcerror) { /* * See comments with crnetadjust(). 
*/ if (*statusp == NFSERR_ACCES && (cr = crnetadjust(cr)) != NULL) { #ifdef DEBUG acl2call_hits++; #endif rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, cr, douprintf, flags, fi); crfree(cr); #ifdef DEBUG if (*statusp == NFSERR_ACCES) acl2call_misses++; #endif } } return (rpcerror); } #ifdef DEBUG static int acl3call_hits = 0; static int acl3call_misses = 0; #endif int acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, nfsstat3 *statusp, int flags, failinfo_t *fi) { int rpcerror; int user_informed; user_informed = 0; do { rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, cr, douprintf, flags, fi); if (!rpcerror) { cred_t *crr; if (*statusp == NFS3ERR_JUKEBOX) { if (!user_informed) { user_informed = 1; uprintf( "file temporarily unavailable on the server, retrying...\n"); } delay(nfs3_jukebox_delay); } /* * See crnetadjust() for comments. */ else if (*statusp == NFS3ERR_ACCES && (crr = crnetadjust(cr)) != NULL) { #ifdef DEBUG acl3call_hits++; #endif rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, crr, douprintf, flags, fi); crfree(crr); #ifdef DEBUG if (*statusp == NFS3ERR_ACCES) acl3call_misses++; #endif } } } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); return (rpcerror); } static int aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, int flags, failinfo_t *fi) { CLIENT *client; struct chtab *ch; cred_t *cr = icr; bool_t cred_cloned = FALSE; enum clnt_stat status; struct rpc_err rpcerr; struct timeval wait; int timeo; /* in units of hz */ #if 0 /* notyet */ int my_rsize, my_wsize; #endif bool_t tryagain; k_sigset_t smask; servinfo_t *svp; struct nfs_clnt *nfscl; zoneid_t zoneid = getzoneid(); #ifdef DEBUG char *bufp; #endif #if 0 /* notyet */ TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, "rfscall_start:which %d mi %p", which, mi); #endif nfscl = 
zone_getspecific(nfsclnt_zone_key, nfs_zone()); ASSERT(nfscl != NULL); nfscl->nfscl_stat.calls.value.ui64++; mi->mi_aclreqs[which].value.ui64++; rpcerr.re_status = RPC_SUCCESS; if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { rpcerr.re_status = RPC_FAILED; rpcerr.re_errno = EIO; return (rpcerr.re_errno); } #if 0 /* notyet */ /* * Remember the transfer sizes in case * nfs_feedback changes them underneath us. */ my_rsize = mi->mi_curread; my_wsize = mi->mi_curwrite; #endif /* * NFS client failover support * * If this rnode is not in sync with the current server (VALID_FH), * we'd like to do a remap to get in sync. We can be interrupted * in failover_remap(), and if so we'll bail. Otherwise, we'll * use the best info we have to try the RPC. Part of that is * unconditionally updating the filehandle copy kept for V3. * * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible * rw_enter(); we're trying to keep the current server from being * changed on us until we're done with the remapping and have a * matching client handle. We don't want to sending a filehandle * to the wrong host. */ failoverretry: if (FAILOVER_MOUNT(mi)) { mutex_enter(&mi->mi_lock); if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { if (failover_wait(mi)) { mutex_exit(&mi->mi_lock); return (EINTR); } } INC_READERS(mi); mutex_exit(&mi->mi_lock); if (fi) { if (!VALID_FH(fi) && !(flags & RFSCALL_SOFT) && failover_safe(fi)) { int remaperr; svp = mi->mi_curr_serv; remaperr = failover_remap(fi); if (remaperr != 0) { #ifdef DEBUG if (remaperr != EINTR) nfs_cmn_err(remaperr, CE_WARN, "aclcall couldn't failover: %m"); #endif mutex_enter(&mi->mi_lock); DEC_READERS(mi); mutex_exit(&mi->mi_lock); /* * If failover_remap returns ETIMEDOUT * and the filesystem is hard mounted * we have to retry the call with a new * server. 
*/ if ((mi->mi_flags & MI_HARD) && IS_RECOVERABLE_ERROR(remaperr)) { if (svp == mi->mi_curr_serv) failover_newserver(mi); rpcerr.re_status = RPC_SUCCESS; goto failoverretry; } return (remaperr); } } if (fi->fhp && fi->copyproc) (*fi->copyproc)(fi->fhp, fi->vp); } } /* For TSOL, use a new cred which has net_mac_aware flag */ if (!cred_cloned && is_system_labeled()) { cred_cloned = TRUE; cr = crdup(icr); (void) setpflags(NET_MAC_AWARE, 1, cr); } /* * acl_clget() calls clnt_tli_kinit() which clears the xid, so we * are guaranteed to reprocess the retry as a new request. */ svp = mi->mi_curr_serv; rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); if (FAILOVER_MOUNT(mi)) { mutex_enter(&mi->mi_lock); DEC_READERS(mi); mutex_exit(&mi->mi_lock); if ((rpcerr.re_errno == ETIMEDOUT || rpcerr.re_errno == ECONNRESET) && failover_safe(fi)) { if (svp == mi->mi_curr_serv) failover_newserver(mi); goto failoverretry; } } if (rpcerr.re_errno != 0) { if (cred_cloned) crfree(cr); return (rpcerr.re_errno); } if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || svp->sv_knconf->knc_semantics == NC_TPI_COTS) { timeo = (mi->mi_timeo * hz) / 10; } else { mutex_enter(&mi->mi_lock); timeo = CLNT_SETTIMERS(client, &(mi->mi_timers[mi->mi_acl_timer_type[which]]), &(mi->mi_timers[NFS_CALLTYPES]), (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, (void (*)()) 0, (caddr_t)mi, 0); mutex_exit(&mi->mi_lock); } /* * If hard mounted fs, retry call forever unless hard error occurs. */ do { tryagain = FALSE; if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { status = RPC_FAILED; rpcerr.re_status = RPC_FAILED; rpcerr.re_errno = EIO; break; } TICK_TO_TIMEVAL(timeo, &wait); /* * Mask out all signals except SIGHUP, SIGINT, SIGQUIT * and SIGTERM. (Preserving the existing masks). * Mask out SIGINT if mount option nointr is specified. 
*/ sigintr(&smask, (int)mi->mi_flags & MI_INT); if (!(mi->mi_flags & MI_INT)) client->cl_nosignal = TRUE; /* * If there is a current signal, then don't bother * even trying to send out the request because we * won't be able to block waiting for the response. * Simply assume RPC_INTR and get on with it. */ if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) status = RPC_INTR; else { status = CLNT_CALL(client, which, xdrargs, argsp, xdrres, resp, wait); } if (!(mi->mi_flags & MI_INT)) client->cl_nosignal = FALSE; /* * restore original signal mask */ sigunintr(&smask); switch (status) { case RPC_SUCCESS: #if 0 /* notyet */ if ((mi->mi_flags & MI_DYNAMIC) && mi->mi_timer_type[which] != 0 && (mi->mi_curread != my_rsize || mi->mi_curwrite != my_wsize)) (void) nfs_feedback(FEEDBACK_OK, which, mi); #endif break; /* * Unfortunately, there are servers in the world which * are not coded correctly. They are not prepared to * handle RPC requests to the NFS port which are not * NFS requests. Thus, they may try to process the * NFS_ACL request as if it were an NFS request. This * does not work. Generally, an error will be generated * on the client because it will not be able to decode * the response from the server. However, it seems * possible that the server may not be able to decode * the arguments. Thus, the criteria for deciding * whether the server supports NFS_ACL or not is whether * the following RPC errors are returned from CLNT_CALL. */ case RPC_CANTDECODERES: case RPC_PROGUNAVAIL: case RPC_CANTDECODEARGS: case RPC_PROGVERSMISMATCH: mutex_enter(&mi->mi_lock); mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); mutex_exit(&mi->mi_lock); break; /* * If the server supports NFS_ACL but not the new ops * for extended attributes, make sure we don't retry. */ case RPC_PROCUNAVAIL: mutex_enter(&mi->mi_lock); mi->mi_flags &= ~MI_EXTATTR; mutex_exit(&mi->mi_lock); break; case RPC_INTR: /* * There is no way to recover from this error, * even if mount option nointr is specified. 
* SIGKILL, for example, cannot be blocked. */ rpcerr.re_status = RPC_INTR; rpcerr.re_errno = EINTR; break; case RPC_UDERROR: /* * If the NFS server is local (vold) and * it goes away then we get RPC_UDERROR. * This is a retryable error, so we would * loop, so check to see if the specific * error was ECONNRESET, indicating that * target did not exist at all. If so, * return with RPC_PROGUNAVAIL and * ECONNRESET to indicate why. */ CLNT_GETERR(client, &rpcerr); if (rpcerr.re_errno == ECONNRESET) { rpcerr.re_status = RPC_PROGUNAVAIL; rpcerr.re_errno = ECONNRESET; break; } /*FALLTHROUGH*/ default: /* probably RPC_TIMEDOUT */ if (IS_UNRECOVERABLE_RPC(status)) break; /* * increment server not responding count */ mutex_enter(&mi->mi_lock); mi->mi_noresponse++; mutex_exit(&mi->mi_lock); #ifdef DEBUG nfscl->nfscl_stat.noresponse.value.ui64++; #endif if (!(mi->mi_flags & MI_HARD)) { if (!(mi->mi_flags & MI_SEMISOFT) || (mi->mi_acl_ss_call_type[which] == 0)) break; } /* * The call is in progress (over COTS). * Try the CLNT_CALL again, but don't * print a noisy error message. */ if (status == RPC_INPROGRESS) { tryagain = TRUE; break; } if (flags & RFSCALL_SOFT) break; /* * On zone shutdown, just move on. */ if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) { rpcerr.re_status = RPC_FAILED; rpcerr.re_errno = EIO; break; } /* * NFS client failover support * * If the current server just failed us, we'll * start the process of finding a new server. * After that, we can just retry. 
*/ if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { if (svp == mi->mi_curr_serv) failover_newserver(mi); clfree_impl(client, ch, nfscl); goto failoverretry; } tryagain = TRUE; timeo = backoff(timeo); mutex_enter(&mi->mi_lock); if (!(mi->mi_flags & MI_PRINTED)) { mi->mi_flags |= MI_PRINTED; mutex_exit(&mi->mi_lock); #ifdef DEBUG zprintf(zoneid, "NFS_ACL%d server %s not responding still trying\n", mi->mi_vers, svp->sv_hostname); #else zprintf(zoneid, "NFS server %s not responding still trying\n", svp->sv_hostname); #endif } else mutex_exit(&mi->mi_lock); if (*douprintf && nfs_has_ctty()) { *douprintf = 0; if (!(mi->mi_flags & MI_NOPRINT)) #ifdef DEBUG uprintf( "NFS_ACL%d server %s not responding still trying\n", mi->mi_vers, svp->sv_hostname); #else uprintf( "NFS server %s not responding still trying\n", svp->sv_hostname); #endif } #if 0 /* notyet */ /* * If doing dynamic adjustment of transfer * size and if it's a read or write call * and if the transfer size changed while * retransmitting or if the feedback routine * changed the transfer size, * then exit rfscall so that the transfer * size can be adjusted at the vnops level. */ if ((mi->mi_flags & MI_DYNAMIC) && mi->mi_acl_timer_type[which] != 0 && (mi->mi_curread != my_rsize || mi->mi_curwrite != my_wsize || nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { /* * On read or write calls, return * back to the vnode ops level if * the transfer size changed. */ clfree_impl(client, ch, nfscl); if (cred_cloned) crfree(cr); return (ENFS_TRYAGAIN); } #endif } } while (tryagain); if (status != RPC_SUCCESS) { /* * Let soft mounts use the timed out message. 
*/ if (status == RPC_INPROGRESS) status = RPC_TIMEDOUT; nfscl->nfscl_stat.badcalls.value.ui64++; if (status == RPC_CANTDECODERES || status == RPC_PROGUNAVAIL || status == RPC_PROCUNAVAIL || status == RPC_CANTDECODEARGS || status == RPC_PROGVERSMISMATCH) CLNT_GETERR(client, &rpcerr); else if (status != RPC_INTR) { mutex_enter(&mi->mi_lock); mi->mi_flags |= MI_DOWN; mutex_exit(&mi->mi_lock); CLNT_GETERR(client, &rpcerr); #ifdef DEBUG bufp = clnt_sperror(client, svp->sv_hostname); zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", mi->mi_vers, mi->mi_aclnames[which], bufp); if (nfs_has_ctty()) { if (!(mi->mi_flags & MI_NOPRINT)) { uprintf("NFS_ACL%d %s failed for %s\n", mi->mi_vers, mi->mi_aclnames[which], bufp); } } kmem_free(bufp, MAXPATHLEN); #else zprintf(zoneid, "NFS %s failed for server %s: error %d (%s)\n", mi->mi_aclnames[which], svp->sv_hostname, status, clnt_sperrno(status)); if (nfs_has_ctty()) { if (!(mi->mi_flags & MI_NOPRINT)) uprintf( "NFS %s failed for server %s: error %d (%s)\n", mi->mi_aclnames[which], svp->sv_hostname, status, clnt_sperrno(status)); } #endif /* * when CLNT_CALL() fails with RPC_AUTHERROR, * re_errno is set appropriately depending on * the authentication error */ if (status == RPC_VERSMISMATCH || status == RPC_PROGVERSMISMATCH) rpcerr.re_errno = EIO; } } else { /* * Test the value of mi_down and mi_printed without * holding the mi_lock mutex. If they are both zero, * then it is okay to skip the down and printed * processing. This saves on a mutex_enter and * mutex_exit pair for a normal, successful RPC. * This was just complete overhead. 
*/ if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { mutex_enter(&mi->mi_lock); mi->mi_flags &= ~MI_DOWN; if (mi->mi_flags & MI_PRINTED) { mi->mi_flags &= ~MI_PRINTED; mutex_exit(&mi->mi_lock); #ifdef DEBUG zprintf(zoneid, "NFS_ACL%d server %s ok\n", mi->mi_vers, svp->sv_hostname); #else zprintf(zoneid, "NFS server %s ok\n", svp->sv_hostname); #endif } else mutex_exit(&mi->mi_lock); } if (*douprintf == 0) { if (!(mi->mi_flags & MI_NOPRINT)) #ifdef DEBUG uprintf("NFS_ACL%d server %s ok\n", mi->mi_vers, svp->sv_hostname); #else uprintf("NFS server %s ok\n", svp->sv_hostname); #endif *douprintf = 1; } } clfree_impl(client, ch, nfscl); if (cred_cloned) crfree(cr); ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); #if 0 /* notyet */ TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", rpcerr.re_errno); #endif return (rpcerr.re_errno); } int vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) { uint_t mask = vap->va_mask; if (!(mask & AT_MODE)) sa->sa_mode = (uint32_t)-1; else sa->sa_mode = vap->va_mode; if (!(mask & AT_UID)) sa->sa_uid = (uint32_t)-1; else sa->sa_uid = (uint32_t)vap->va_uid; if (!(mask & AT_GID)) sa->sa_gid = (uint32_t)-1; else sa->sa_gid = (uint32_t)vap->va_gid; if (!(mask & AT_SIZE)) sa->sa_size = (uint32_t)-1; else sa->sa_size = (uint32_t)vap->va_size; if (!(mask & AT_ATIME)) sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; else { /* check time validity */ if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { return (EOVERFLOW); } sa->sa_atime.tv_sec = vap->va_atime.tv_sec; sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; } if (!(mask & AT_MTIME)) sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; else { /* check time validity */ if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { return (EOVERFLOW); } sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; } return (0); } int vattr_to_sattr3(struct vattr *vap, sattr3 *sa) { uint_t mask = vap->va_mask; if (!(mask & AT_MODE)) sa->mode.set_it = FALSE; else { sa->mode.set_it = TRUE; sa->mode.mode = (mode3)vap->va_mode; } if (!(mask & AT_UID)) sa->uid.set_it = FALSE; else { sa->uid.set_it = TRUE; sa->uid.uid = (uid3)vap->va_uid; } if (!(mask & AT_GID)) sa->gid.set_it = FALSE; else { sa->gid.set_it = TRUE; sa->gid.gid = (gid3)vap->va_gid; } if (!(mask & AT_SIZE)) sa->size.set_it = FALSE; else { sa->size.set_it = TRUE; sa->size.size = (size3)vap->va_size; } if (!(mask & AT_ATIME)) sa->atime.set_it = DONT_CHANGE; else { /* check time validity */ if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { return (EOVERFLOW); } sa->atime.set_it = SET_TO_CLIENT_TIME; sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; } if (!(mask & AT_MTIME)) sa->mtime.set_it = DONT_CHANGE; else { /* check time validity */ if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { return (EOVERFLOW); } sa->mtime.set_it = SET_TO_CLIENT_TIME; sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; } return (0); } void setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) { da->da_fhandle = VTOFH(dvp); da->da_name = nm; da->da_flags = 0; } void setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) { da->dirp = VTOFH3(dvp); da->name = nm; } int setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) { int error; rnode_t *rp; struct vattr va; va.va_mask = AT_MODE | AT_GID; error = VOP_GETATTR(dvp, &va, 0, cr, NULL); if (error) return (error); /* * To determine the expected group-id of the created file: * 1) If the filesystem was not mounted with the Old-BSD-compatible * GRPID option, and the directory's set-gid bit is clear, * then use the process's gid. * 2) Otherwise, set the group-id to the gid of the parent directory. */ rp = VTOR(dvp); mutex_enter(&rp->r_statelock); if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) *gidp = crgetgid(cr); else *gidp = va.va_gid; mutex_exit(&rp->r_statelock); return (0); } int setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) { int error; struct vattr va; va.va_mask = AT_MODE; error = VOP_GETATTR(dvp, &va, 0, cr, NULL); if (error) return (error); /* * Modify the expected mode (om) so that the set-gid bit matches * that of the parent directory (dvp). */ if (va.va_mode & VSGID) *omp |= VSGID; else *omp &= ~VSGID; return (0); } void nfs_setswaplike(vnode_t *vp, vattr_t *vap) { if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { if (!(vp->v_flag & VSWAPLIKE)) { mutex_enter(&vp->v_lock); vp->v_flag |= VSWAPLIKE; mutex_exit(&vp->v_lock); } } else { if (vp->v_flag & VSWAPLIKE) { mutex_enter(&vp->v_lock); vp->v_flag &= ~VSWAPLIKE; mutex_exit(&vp->v_lock); } } } /* * Free the resources associated with an rnode. 
*/
static void
rinactive(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp = RTOV(rp);
	cred_t *held_cred;
	char *link_buf;
	int link_len;
	vsecattr_t *acl;
	nfs3_pathconf_info *pconf;
	int flush_err;

	/*
	 * Let every outstanding asynchronous operation (read ahead,
	 * write behind) on this rnode drain before tearing anything
	 * down.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_count > 0)
		cv_wait(&rp->r_cv, &rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	/*
	 * Push out dirty pages (unless an error is already recorded)
	 * and then throw away every cached page of the vnode.
	 */
	if (vn_has_cached_data(vp)) {
		ASSERT(vp->v_type != VCHR);
		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
			flush_err = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr,
			    NULL);
			/* Only out-of-space style errors are remembered. */
			if (flush_err == ENOSPC || flush_err == EDQUOT) {
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = flush_err;
				mutex_exit(&rp->r_statelock);
			}
		}
		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
	}

	/*
	 * Detach the credential and the cached symlink, ACL and
	 * pathconf data under r_statelock; they are released below
	 * once the lock has been dropped.
	 */
	mutex_enter(&rp->r_statelock);
	held_cred = rp->r_cred;
	link_buf = rp->r_symlink.contents;
	link_len = rp->r_symlink.size;
	acl = rp->r_secattr;
	pconf = rp->r_pathconf;
	rp->r_cred = NULL;
	rp->r_symlink.contents = NULL;
	rp->r_secattr = NULL;
	rp->r_pathconf = NULL;
	mutex_exit(&rp->r_statelock);

	/* The credential that was held by the rnode. */
	if (held_cred != NULL)
		crfree(held_cred);

	/* Access cache entries. */
	(void) nfs_access_purge_rp(rp);

	/* Readdir cache entries. */
	if (HAVE_RDDIR_CACHE(rp))
		nfs_purge_rddir_cache(vp);

	/* Cached symbolic link contents. */
	if (link_buf != NULL)
		kmem_free((void *)link_buf, link_len);

	/* Cached ACL. */
	if (acl != NULL)
		nfs_acl_free(acl);

	/* Cached pathconf information. */
	if (pconf != NULL)
		kmem_free(pconf, sizeof (*pconf));
}

/*
 * Return a vnode for the given NFS Version 2 file handle.
 * If no rnode exists for this fhandle, create one and put it
 * into the hash queues.  If the rnode for this fhandle
 * already exists, return it.
*
 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
 */
vnode_t *
makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
    hrtime_t t, cred_t *cr, char *dnm, char *nm)
{
	nfs_fhandle v2fh;
	rhashq_t *bucket;
	vnode_t *vp;
	vattr_t scratch;
	int created;

	/* Wrap the fixed-size V2 handle in the generic handle type. */
	v2fh.fh_len = NFS_FHSIZE;
	bcopy(fh, v2fh.fh_buf, NFS_FHSIZE);

	bucket = &rtable[rtablehash(&v2fh)];
	rw_enter(&bucket->r_lock, RW_READER);

	vp = make_rnode(&v2fh, bucket, vfsp, nfs_vnodeops, nfs_putapage,
	    nfs_rddir_compar, &created, cr, dnm, nm);

	/* No attributes supplied: a fresh rnode starts with none cached. */
	if (attr == NULL) {
		if (created) {
			PURGE_ATTRCACHE(vp);
		}
		rw_exit(&bucket->r_lock);
		return (vp);
	}

	/* Existing rnode: drop the bucket lock, then refresh the cache. */
	if (!created) {
		rw_exit(&bucket->r_lock);
		(void) nfs_cache_fattr(vp, attr, &scratch, t, cr);
		return (vp);
	}

	/* New rnode: initialize type, device and attributes under the lock. */
	if (attr->na_type >= NFNON && attr->na_type <= NFSOC)
		vp->v_type = n2v_type(attr);
	else
		vp->v_type = VBAD;

	/*
	 * The device number needs translating here because `attr'
	 * may have come off the wire and already been run through
	 * vattr_to_nattr(); see nfsrootvp()->VOP_GETATTR()->nfsgetattr()
	 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
	 * ->makenfsnode().
	 */
	if ((attr->na_rdev & 0xffff0000) != 0)
		vp->v_rdev = expldev(n2v_rdev(attr));
	else
		vp->v_rdev = nfsv2_expdev(attr->na_rdev);

	nfs_attrcache(vp, attr, t);
	rw_exit(&bucket->r_lock);

	return (vp);
}

/*
 * Return a vnode for the given NFS Version 3 file handle.
 * If no rnode exists for this fhandle, create one and put it
 * into the hash queues.  If the rnode for this fhandle
 * already exists, return it.
 *
 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
*/ vnode_t * makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, cred_t *cr, char *dnm, char *nm) { int newnode; int index; vnode_t *vp; index = rtablehash((nfs_fhandle *)fh); rw_enter(&rtable[index].r_lock, RW_READER); vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, dnm, nm); if (vap == NULL) { if (newnode) { PURGE_ATTRCACHE(vp); } rw_exit(&rtable[index].r_lock); return (vp); } if (!newnode) { rw_exit(&rtable[index].r_lock); nfs_attr_cache(vp, vap, t, cr); } else { rnode_t *rp = VTOR(vp); vp->v_type = vap->va_type; vp->v_rdev = vap->va_rdev; mutex_enter(&rp->r_statelock); if (rp->r_mtime <= t) nfs_attrcache_va(vp, vap); mutex_exit(&rp->r_statelock); rw_exit(&rtable[index].r_lock); } return (vp); } vnode_t * makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, cred_t *cr, char *dnm, char *nm) { int newnode; int index; vnode_t *vp; vattr_t va; index = rtablehash((nfs_fhandle *)fh); rw_enter(&rtable[index].r_lock, RW_READER); vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, dnm, nm); if (attr == NULL) { if (newnode) { PURGE_ATTRCACHE(vp); } rw_exit(&rtable[index].r_lock); return (vp); } if (!newnode) { rw_exit(&rtable[index].r_lock); (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); } else { if (attr->type < NF3REG || attr->type > NF3FIFO) vp->v_type = VBAD; else vp->v_type = nf3_to_vt[attr->type]; vp->v_rdev = makedevice(attr->rdev.specdata1, attr->rdev.specdata2); nfs3_attrcache(vp, attr, t); rw_exit(&rtable[index].r_lock); } return (vp); } /* * Read this comment before making changes to rtablehash()! * This is a hash function in which seemingly obvious and harmless * changes can cause escalations costing million dollars! * Know what you are doing. * * rtablehash() implements Jenkins' one-at-a-time hash algorithm. 
The
 * algorithm is currently detailed here:
 *
 *   http://burtleburtle.net/bob/hash/doobs.html
 *
 * Of course, the above link may not be valid by the time you are reading
 * this, but suffice it to say that the one-at-a-time algorithm works well in
 * almost all cases.  If you are changing the algorithm be sure to verify that
 * the hash algorithm still provides even distribution in all cases and with
 * any server returning filehandles in whatever order (sequential or random).
 */
static int
rtablehash(nfs_fhandle *fh)
{
	ulong_t hash, len, i;
	char *key;

	key = fh->fh_buf;
	len = (ulong_t)fh->fh_len;
	/* Mix in the file handle one byte at a time. */
	for (hash = 0, i = 0; i < len; i++) {
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	/* Final avalanche, then reduce to a bucket index. */
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & rtablemask);
}

/*
 * Find or create the rnode for file handle fh on vfsp in hash bucket
 * rhtp and return its vnode.
 *
 * The caller must hold rhtp->r_lock as reader.  On return the lock is
 * still held: as reader when an existing rnode was found (*newnode is
 * set to 0), and as writer when a new rnode was inserted (*newnode is
 * set to 1).  The lock is dropped and re-taken in between, so the
 * bucket may have changed across the call.
 *
 * A new rnode is obtained either by recycling the head of the rnode
 * freelist (when rnew has reached nrnode and the freelist is not
 * empty) or by allocating a fresh rnode and vnode.  vops, putapage
 * and compar are installed in the new rnode/vnode; dnm and nm are used
 * only on failover mounts, to record the pathname in r_path.
 */
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int (*compar)(const void *, const void *),
    int *newnode, cred_t *cr, char *dnm, char *nm)
{
	rnode_t *rp;
	rnode_t *trp;
	vnode_t *vp;
	mntinfo_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI(vfsp);
start:
	/* Fast path: the rnode is already hashed. */
	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV(rp);
		nfs_set_vroot(vp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	mutex_enter(&rpfreelist_lock);
	if (rpfreelist != NULL && rnew >= nrnode) {
		/* At the rnode limit: recycle the head of the freelist. */
		rp = rpfreelist;
		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);

		vp = RTOV(rp);

		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Somebody else holds a reference; give up ours
			 * and start over.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the reference count now that rinactive() has
		 * run; if a reference showed up, start over.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);
		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		nfs_rw_destroy(&rp->r_rwlock);
		nfs_rw_destroy(&rp->r_lkserlock);
		mutex_destroy(&rp->r_statelock);
		cv_destroy(&rp->r_cv);
		cv_destroy(&rp->r_commit.c_cv);
		nfs_free_r_path(rp);
		avl_destroy(&rp->r_dir);
		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rpfreelist_lock);

		/* Below the limit (or nothing to recycle): allocate. */
		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		atomic_inc_ulong((ulong_t *)&rnew);
#ifdef DEBUG
		clstat_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	/* Common (re)initialization of the rnode and its vnode. */
	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_fh.fh_len = fh->fh_len;
	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
	rp->r_server = mi->mi_curr_serv;
	if (FAILOVER_MOUNT(mi)) {
		/*
		 * If replicated servers, stash pathnames
		 */
		if (dnm != NULL && nm != NULL) {
			char *s, *p;
			uint_t len;

			/* Room for "dnm/nm" plus the terminating NUL. */
			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
			rp->r_path = kmem_alloc(len, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += len;
#endif
			s = rp->r_path;
			for (p = dnm; *p; p++)
				*s++ = *p;
			*s++ = '/';
			for (p = nm; *p; p++)
				*s++ = *p;
			*s = '\0';
		} else {
			/* special case for root */
			rp->r_path = kmem_alloc(2, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += 2;
#endif
			*rp->r_path = '.';
			*(rp->r_path + 1) = '\0';
		}
	}
	VFS_HOLD(vfsp);
	rp->r_putapage = putapage;
	rp->r_hashq = rhtp;
	rp->r_flags = RREADDIRPLUS;
	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	vp->v_type = VNON;
	vp->v_flag |= VMODSORT;
	nfs_set_vroot(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
		/*
		 * Lost the race: hand back the winner's vnode and
		 * dispose of our rnode via rp_addfree().  The bucket
		 * lock is re-taken as reader for the caller.
		 */
		vp = RTOV(trp);
		nfs_set_vroot(vp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	/* Won the race: return with the bucket lock held as writer. */
	rp_addhash(rp);
	*newnode = 1;
	return (vp);
}

/*
 * Callback function to check if the page should be marked as
 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
 */
int
nfs_setmod_check(page_t *pp)
{
	if (pp->p_fsdata != C_NOCOMMIT) {
		pp->p_fsdata = C_NOCOMMIT;
		return (1);
	}
	return (0);
}

/*
 * Set the VROOT flag on vp if the rnode's file handle is identical
 * (same length, same bytes) to the root file handle recorded for the
 * rnode's server.  The flag is never cleared here.
 */
static void
nfs_set_vroot(vnode_t *vp)
{
	rnode_t *rp;
	nfs_fhandle *rootfh;

	rp = VTOR(vp);
	rootfh = &rp->r_server->sv_fhandle;
	if (rootfh->fh_len == rp->r_fh.fh_len &&
	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
		/* Unlocked test first; take v_lock only to set the flag. */
		if (!(vp->v_flag & VROOT)) {
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VROOT;
			mutex_exit(&vp->v_lock);
		}
	}
}

/*
 * Free the pathname stored in rp->r_path, if any, and clear the
 * pointer.  The allocation size is recomputed as strlen() + 1.
 */
static void
nfs_free_r_path(rnode_t *rp)
{
	char *path;
	size_t len;

	path = rp->r_path;
	if (path) {
		rp->r_path = NULL;
		len = strlen(path) + 1;
		kmem_free(path, len);
#ifdef DEBUG
		clstat_debug.rpath.value.ui64 -= len;
#endif
	}
}

/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/* Another holder exists; just drop our reference. */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with RDIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist == NULL) {
		/* First entry: the circular list is just this rnode. */
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rpfreelist = rp;
	} else {
		/* Link in just before the head, i.e. at the tail. */
		rp->r_freef = rpfreelist;
		rp->r_freeb = rpfreelist->r_freeb;
		rpfreelist->r_freeb->r_freef = rp;
		rpfreelist->r_freeb = rp;
		/* Nothing cached: make this rnode the new head. */
		if (!vn_has_cached_data(vp) &&
		    !HAVE_RDDIR_CACHE(rp) &&
		    rp->r_symlink.contents == NULL &&
		    rp->r_secattr == NULL &&
		    rp->r_pathconf == NULL)
			rpfreelist = rp;
	}
	mutex_exit(&rpfreelist_lock);

	rw_exit(&rp->r_hashq->r_lock);
}

/*
 * Remove an rnode from the free list.
 *
 * The caller must be holding rpfreelist_lock and the rnode
 * must be on the freelist.
 */
static void
rp_rmfree(rnode_t *rp)
{

	ASSERT(MUTEX_HELD(&rpfreelist_lock));
	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);

	if (rp == rpfreelist) {
		rpfreelist = rp->r_freef;
		/* rp was the only entry: the list is now empty. */
		if (rp == rpfreelist)
			rpfreelist = NULL;
	}

	rp->r_freeb->r_freef = rp->r_freef;
	rp->r_freef->r_freeb = rp->r_freeb;

	/* NULL links mark the rnode as not being on the freelist. */
	rp->r_freef = rp->r_freeb = NULL;
}

/*
 * Put a rnode in the hash table.
 *
 * The caller must be holding the exclusive hash queue lock.
 */
static void
rp_addhash(rnode_t *rp)
{
	mntinfo_t *mi;

	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(!(rp->r_flags & RHASHED));

	/* Insert at the front of the bucket's doubly linked list. */
	rp->r_hashf = rp->r_hashq->r_hashf;
	rp->r_hashq->r_hashf = rp;
	rp->r_hashb = (rnode_t *)rp->r_hashq;
	rp->r_hashf->r_hashb = rp;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= RHASHED;
	mutex_exit(&rp->r_statelock);

	/* Also track the rnode on its mount's list of rnodes. */
	mi = VTOMI(RTOV(rp));
	mutex_enter(&mi->mi_rnodes_lock);
	list_insert_tail(&mi->mi_rnodes, rp);
	mutex_exit(&mi->mi_rnodes_lock);
}

/*
 * Remove a rnode from the hash table.
*
 * The caller must be holding the hash queue lock.
 */
static void
rp_rmhash_locked(rnode_t *rp)
{
	mntinfo_t *mi;

	/* The lock must be held as writer, and the rnode must be hashed. */
	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(rp->r_flags & RHASHED);

	/* Unlink from the doubly linked hash chain. */
	rp->r_hashb->r_hashf = rp->r_hashf;
	rp->r_hashf->r_hashb = rp->r_hashb;

	/* r_flags is updated under r_statelock. */
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~RHASHED;
	mutex_exit(&rp->r_statelock);

	/*
	 * Drop the rnode from the per-mount rnode list, unless it has
	 * already been taken off that list.
	 */
	mi = VTOMI(RTOV(rp));
	mutex_enter(&mi->mi_rnodes_lock);
	if (list_link_active(&rp->r_mi_link))
		list_remove(&mi->mi_rnodes, rp);
	mutex_exit(&mi->mi_rnodes_lock);
}

/*
 * Remove a rnode from the hash table.
 *
 * The caller must not be holding the hash queue lock.
 */
void
rp_rmhash(rnode_t *rp)
{

	/* Take the bucket lock as writer around the locked variant. */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
	rp_rmhash_locked(rp);
	rw_exit(&rp->r_hashq->r_lock);
}

/*
 * Lookup a rnode by fhandle.
 *
 * The caller must be holding the hash queue lock, either shared or exclusive.
 *
 * Walks the bucket rhtp looking for an rnode on vfsp whose file handle
 * has the same length and bytes as fh.  Returns the rnode with a vnode
 * reference for the caller, or NULL if there is no match.
 */
static rnode_t *
rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
{
	rnode_t *rp;
	vnode_t *vp;

	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));

	/* The chain ends when it wraps back around to the bucket header. */
	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
		vp = RTOV(rp);
		if (vp->v_vfsp == vfsp &&
		    rp->r_fh.fh_len == fh->fh_len &&
		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
			/*
			 * remove rnode from free list, if necessary.
			 *
			 * The first test of r_freef is made without
			 * rpfreelist_lock; it is repeated below once
			 * the lock is held.
			 */
			if (rp->r_freef != NULL) {
				mutex_enter(&rpfreelist_lock);
				/*
				 * If the rnode is on the freelist,
				 * then remove it and use that reference
				 * as the new reference.  Otherwise,
				 * need to increment the reference count.
				 */
				if (rp->r_freef != NULL) {
					rp_rmfree(rp);
					mutex_exit(&rpfreelist_lock);
				} else {
					mutex_exit(&rpfreelist_lock);
					VN_HOLD(vp);
				}
			} else
				VN_HOLD(vp);
			return (rp);
		}
	}
	return (NULL);
}

/*
 * Return 1 if there is an active vnode belonging to this vfs in the
 * rtable cache.
 *
 * Several of these checks are done without holding the usual
 * locks.  This is safe because destroy_rtable(), rp_addfree(),
 * etc. will redo the necessary checks before actually destroying
 * any rnodes.
*/
int
check_rtable(struct vfs *vfsp)
{
	rnode_t *rp;
	vnode_t *vp;
	mntinfo_t *mi;

	ASSERT(vfsp != NULL);
	mi = VFTOMI(vfsp);

	/* Only the rnodes on this mount's mi_rnodes list are examined. */
	mutex_enter(&mi->mi_rnodes_lock);
	for (rp = list_head(&mi->mi_rnodes); rp != NULL;
	    rp = list_next(&mi->mi_rnodes, rp)) {
		vp = RTOV(rp);

		/*
		 * Report the rnode as active if it is not on the freelist,
		 * if it has cached pages and is flagged RDIRTY, or if its
		 * r_count is non-zero.
		 */
		if (rp->r_freef == NULL ||
		    (vn_has_cached_data(vp) && (rp->r_flags & RDIRTY)) ||
		    rp->r_count > 0) {
			mutex_exit(&mi->mi_rnodes_lock);
			return (1);
		}
	}
	mutex_exit(&mi->mi_rnodes_lock);

	return (0);
}

/*
 * Destroy inactive vnodes from the hash queues which belong to this
 * vfs.  It is essential that we destroy all inactive vnodes during a
 * forced unmount as well as during a normal unmount.
 *
 * cr is passed through to rp_addfree() for each rnode handled.
 */
void
destroy_rtable(struct vfs *vfsp, cred_t *cr)
{
	rnode_t *rp;
	mntinfo_t *mi;

	ASSERT(vfsp != NULL);

	mi = VFTOMI(vfsp);

	/*
	 * Both locks are held at the top of every loop iteration;
	 * rpfreelist_lock is taken first, then mi_rnodes_lock.
	 */
	mutex_enter(&rpfreelist_lock);
	mutex_enter(&mi->mi_rnodes_lock);
	while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
		/*
		 * If the rnode is no longer on the freelist it is not
		 * ours and it will be handled by some other thread, so
		 * skip it.
		 */
		if (rp->r_freef == NULL)
			continue;
		mutex_exit(&mi->mi_rnodes_lock);

		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);

		/* Neither lock is held across rp_rmhash() and rp_addfree(). */
		rp_rmhash(rp);

		/*
		 * This call to rp_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp_addfree(rp, cr);

		/* Retake both locks, in the same order, for the next pass. */
		mutex_enter(&rpfreelist_lock);
		mutex_enter(&mi->mi_rnodes_lock);
	}
	mutex_exit(&mi->mi_rnodes_lock);
	mutex_exit(&rpfreelist_lock);
}

/*
 * This routine destroys all the resources associated with the rnode
 * and then the rnode itself.
*/
static void
destroy_rnode(rnode_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	vp = RTOV(rp);
	/* Saved now because vp is freed before the VFS_RELE() below. */
	vfsp = vp->v_vfsp;

	/*
	 * The rnode must be idle: a single vnode reference, no r_count,
	 * no r_lmpl entries, no mappings, not hashed and not on the
	 * freelist.
	 */
	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_lmpl == NULL);
	ASSERT(rp->r_mapcnt == 0);
	ASSERT(!(rp->r_flags & RHASHED));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	/* One fewer allocated rnode. */
	atomic_dec_ulong((ulong_t *)&rnew);
#ifdef DEBUG
	clstat_debug.nrnode.value.ui64--;
#endif
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	/* r_indelmap is only torn down when RDELMAPLIST is set. */
	if (rp->r_flags & RDELMAPLIST)
		list_destroy(&rp->r_indelmap);
	nfs_free_r_path(rp);
	avl_destroy(&rp->r_dir);
	vn_invalid(vp);
	vn_free(vp);
	kmem_cache_free(rnode_cache, rp);
	VFS_RELE(vfsp);
}

/*
 * Flush all vnodes in this (or every) vfs.
 * Used by nfs_sync and by nfs_unmount.
 *
 * When vfsp is non-NULL only the rnodes on that mount's mi_rnodes list
 * are considered; when it is NULL every hash bucket in rtable is walked.
 * cr is handed to VOP_PUTPAGE() for each vnode that is flushed.
 */
void
rflush(struct vfs *vfsp, cred_t *cr)
{
	int index;
	rnode_t *rp;
	vnode_t *vp, **vplist;
	long num, cnt;

	/*
	 * Check to see whether there is anything to do.
	 */
	num = rnew;
	if (num == 0)
		return;

	/*
	 * Allocate a slot for all currently active rnodes on the
	 * supposition that they all may need flushing.
	 *
	 * The list is sized from the value of rnew sampled above; the
	 * cnt == num checks below stop collecting once it is full.
	 */
	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
	cnt = 0;

	/*
	 * If the vfs is known we can take a fast path by iterating over all
	 * rnodes that belong to this vfs.  This is much faster than the
	 * traditional way of iterating rtable (below) when there are many
	 * rnodes that do not belong to our vfs.
	 */
	if (vfsp != NULL) {
		mntinfo_t *mi = VFTOMI(vfsp);

		mutex_enter(&mi->mi_rnodes_lock);
		for (rp = list_head(&mi->mi_rnodes); rp != NULL;
		    rp = list_next(&mi->mi_rnodes, rp)) {
			vp = RTOV(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If the vnode has pages and is marked as either dirty
			 * or mmap'd, hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			ASSERT(vp->v_vfsp == vfsp);
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					/*
					 * The vplist is full because there are
					 * too many rnodes.  We are done for
					 * now.
					 */
					break;
				}
			}
		}
		mutex_exit(&mi->mi_rnodes_lock);

		goto done;
	}

	ASSERT(vfsp == NULL);

	/*
	 * Walk the hash queues looking for rnodes with page
	 * lists associated with them.  Make a list of these
	 * files.
	 */
	for (index = 0; index < rtablesize; index++) {
		rw_enter(&rtable[index].r_lock, RW_READER);
		for (rp = rtable[index].r_hashf;
		    rp != (rnode_t *)(&rtable[index]);
		    rp = rp->r_hashf) {
			vp = RTOV(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If the vnode has pages and is marked as either dirty
			 * or mmap'd, hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					/* Drop the bucket lock before leaving. */
					rw_exit(&rtable[index].r_lock);
					/*
					 * The vplist is full because there are
					 * too many rnodes.  We are done for
					 * now.
					 */
					goto done;
				}
			}
		}
		rw_exit(&rtable[index].r_lock);
	}

done:

	/*
	 * Flush and release all of the files on the list.
	 *
	 * No list or hash bucket lock is held here; the VN_HOLD() taken
	 * while collecting is dropped once the B_ASYNC putpage request
	 * has been issued.
	 */
	while (cnt-- > 0) {
		vp = vplist[cnt];
		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
		VN_RELE(vp);
	}

	/*
	 * Free the space allocated to hold the list.
	 */
	kmem_free(vplist, num * sizeof (*vplist));
}

/*
 * This probably needs to be larger than or equal to
 * log2(sizeof (struct rnode)) due to the way that rnodes are
 * allocated.
*/
#define	ACACHE_SHIFT_BITS	9

/*
 * Compute the access cache bucket index for an (rnode, credential)
 * pair: the rnode address shifted right by ACACHE_SHIFT_BITS, plus the
 * uid of the credential, masked with acachemask.
 */
static int
acachehash(rnode_t *rp, cred_t *cr)
{

	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
	    acachemask);
}

#ifdef DEBUG
static long nfs_access_cache_hits = 0;
static long nfs_access_cache_misses = 0;
#endif

/*
 * Consult the access cache for the access bits acc on rnode rp for
 * credential cr.
 *
 * Returns NFS_ACCESS_ALLOWED or NFS_ACCESS_DENIED when a cached entry
 * for (rp, cr) knows about every bit in acc, and NFS_ACCESS_UNKNOWN
 * otherwise, including when the attribute cache is not valid or
 * nfs_waitfor_purge_complete() returns non-zero.
 */
nfs_access_type_t
nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
{
	vnode_t *vp;
	acache_t *ap;
	acache_hash_t *hp;
	nfs_access_type_t all;

	vp = RTOV(rp);
	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
		return (NFS_ACCESS_UNKNOWN);

	/* r_acache is tested here without holding r_statelock. */
	if (rp->r_acache != NULL) {
		hp = &acache[acachehash(rp, cr)];
		rw_enter(&hp->lock, RW_READER);
		ap = hp->next;
		/* The chain ends when it wraps back to the bucket header. */
		while (ap != (acache_t *)hp) {
			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
				if ((ap->known & acc) == acc) {
#ifdef DEBUG
					nfs_access_cache_hits++;
#endif
					if ((ap->allowed & acc) == acc)
						all = NFS_ACCESS_ALLOWED;
					else
						all = NFS_ACCESS_DENIED;
				} else {
					/* Some requested bit is not cached. */
#ifdef DEBUG
					nfs_access_cache_misses++;
#endif
					all = NFS_ACCESS_UNKNOWN;
				}
				rw_exit(&hp->lock);
				return (all);
			}
			ap = ap->next;
		}
		rw_exit(&hp->lock);
	}

#ifdef DEBUG
	nfs_access_cache_misses++;
#endif
	return (NFS_ACCESS_UNKNOWN);
}

/*
 * Record in the access cache that, for rnode rp and credential cr, the
 * access bits acc have been checked and the bits in resacc were granted.
 *
 * An existing entry for (rp, cr) is updated in place; otherwise a new
 * entry is linked into the hash bucket and onto the rnode's r_acache
 * list.  If no entry exists and the KM_NOSLEEP allocation failed,
 * nothing is cached.
 */
void
nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
{
	acache_t *ap;
	acache_t *nap;
	acache_hash_t *hp;

	hp = &acache[acachehash(rp, cr)];

	/*
	 * Allocate now assuming that mostly an allocation will be
	 * required.  This allows the allocation to happen without
	 * holding the hash bucket locked.
	 */
	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
	if (nap != NULL) {
		nap->known = acc;
		nap->allowed = resacc;
		nap->rnode = rp;
		/* The entry keeps its own hold on the credential. */
		crhold(cr);
		nap->cred = cr;
		nap->hashq = hp;
	}

	rw_enter(&hp->lock, RW_WRITER);

	if (rp->r_acache != NULL) {
		ap = hp->next;
		while (ap != (acache_t *)hp) {
			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
				/*
				 * Merge into the existing entry: the bits in
				 * acc become known, and their allowed state
				 * is replaced by what resacc reports.
				 */
				ap->known |= acc;
				ap->allowed &= ~acc;
				ap->allowed |= resacc;
				rw_exit(&hp->lock);
				/* The preallocated entry was not needed. */
				if (nap != NULL) {
					crfree(nap->cred);
					kmem_cache_free(acache_cache, nap);
				}
				return;
			}
			ap = ap->next;
		}
	}

	if (nap != NULL) {
#ifdef DEBUG
		clstat_debug.access.value.ui64++;
#endif
		/* Link the new entry at the head of the hash bucket. */
		nap->next = hp->next;
		hp->next = nap;
		nap->next->prev = nap;
		nap->prev = (acache_t *)hp;

		/* Push it onto the rnode's list of access cache entries. */
		mutex_enter(&rp->r_statelock);
		nap->list = rp->r_acache;
		rp->r_acache = nap;
		mutex_exit(&rp->r_statelock);
	}

	rw_exit(&hp->lock);
}

/*
 * Free every access cache entry attached to rnode rp.
 *
 * Returns 0 if the rnode had no entries when first examined, and 1
 * otherwise.
 */
int
nfs_access_purge_rp(rnode_t *rp)
{
	acache_t *ap;
	acache_t *tmpap;
	acache_t *rplist;

	/*
	 * If there aren't any cached entries, then there is nothing
	 * to free.
	 */
	if (rp->r_acache == NULL)
		return (0);

	/* Detach the whole list from the rnode under r_statelock. */
	mutex_enter(&rp->r_statelock);
	rplist = rp->r_acache;
	rp->r_acache = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Loop through each entry in the list pointed to in the
	 * rnode.  Remove each of these entries from the hash
	 * queue that it is on and remove it from the list in
	 * the rnode.
	 */
	for (ap = rplist; ap != NULL; ap = tmpap) {
		rw_enter(&ap->hashq->lock, RW_WRITER);
		ap->prev->next = ap->next;
		ap->next->prev = ap->prev;
		rw_exit(&ap->hashq->lock);

		/* Fetch the successor before the entry is freed. */
		tmpap = ap->list;
		crfree(ap->cred);
		kmem_cache_free(acache_cache, ap);
#ifdef DEBUG
		clstat_debug.access.value.ui64--;
#endif
	}

	return (1);
}

static const char prefix[] = ".nfs";

static kmutex_t newnum_lock;

/*
 * Return the next value of a counter protected by newnum_lock.
 *
 * Whenever the counter is found to be zero it is first seeded with the
 * low 16 bits of gethrestime_sec().
 */
int
newnum(void)
{
	static uint_t newnum = 0;
	uint_t id;

	mutex_enter(&newnum_lock);
	if (newnum == 0)
		newnum = gethrestime_sec() & 0xffff;
	id = newnum++;
	mutex_exit(&newnum_lock);
	return (id);
}

/*
 * Build a name consisting of the ".nfs" prefix followed by the
 * hexadecimal digits of a fresh newnum() value, least significant
 * digit first.  A value of zero yields the bare prefix.
 *
 * The string is returned in a MAXNAMELEN byte kmem_alloc() buffer.
 * NOTE(review): presumably the caller frees it with
 * kmem_free(name, MAXNAMELEN) -- confirm against the callers.
 */
char *
newname(void)
{
	char *news;
	char *s;
	const char *p;
	uint_t id;

	id = newnum();
	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	s = news;
	p = prefix;
	while (*p != '\0')
		*s++ = *p++;
	/* Emit one hex digit per 4 bits, lowest nibble first. */
	while (id != 0) {
		*s++ = "0123456789ABCDEF"[id & 0x0f];
		id >>= 4;
	}
	*s = '\0';
	return (news);
}

/*
 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
 * framework.
 *
 * For KSTAT_WRITE the buffer is copied into ks_private; otherwise
 * ks_private is copied out to the buffer.  On DEBUG kernels the global
 * clstat_debug counters follow immediately after the clstat_tmpl sized
 * area in the buffer.  Always returns 0.
 */
static int
cl_snapshot(kstat_t *ksp, void *buf, int rw)
{
	ksp->ks_snaptime = gethrtime();
	if (rw == KSTAT_WRITE) {
		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
#ifdef DEBUG
		/*
		 * Currently only the global zone can write to kstats, but we
		 * add the check just for paranoia.
		 */
		if (INGLOBALZONE(curproc))
			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
			    sizeof (clstat_debug));
#endif
	} else {
		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
#ifdef DEBUG
		/*
		 * If we're displaying the "global" debug kstat values, we
		 * display them as-is to all zones since in fact they apply to
		 * the system as a whole.
		 */
		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
		    sizeof (clstat_debug));
#endif
	}
	return (0);
}

/*
 * Zone creation callback registered through zone_key_create().
 *
 * Allocates and initializes the struct nfs_clnt for zoneid, creates
 * the zone's "nfs_client" kstat, puts the structure on nfs_clnt_list
 * and returns it.
 */
static void *
clinit_zone(zoneid_t zoneid)
{
	kstat_t *nfs_client_kstat;
	struct nfs_clnt *nfscl;
	uint_t ndata;

	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
	nfscl->nfscl_chtable = NULL;
	nfscl->nfscl_zoneid = zoneid;

	/* Start the per-zone statistics from the template. */
	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
#ifdef DEBUG
	/* DEBUG kernels also export the clstat_debug counters. */
	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
#endif
	/* If the kstat cannot be created the zone simply has none. */
	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
	    "misc", KSTAT_TYPE_NAMED, ndata,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
		nfs_client_kstat->ks_snapshot = cl_snapshot;
		kstat_install(nfs_client_kstat);
	}
	mutex_enter(&nfs_clnt_list_lock);
	list_insert_head(&nfs_clnt_list, nfscl);
	mutex_exit(&nfs_clnt_list_lock);
	return (nfscl);
}

/*
 * Zone destruction callback registered through zone_key_create().
 *
 * arg is the struct nfs_clnt returned by clinit_zone(); a NULL arg is
 * ignored.  The structure is taken off nfs_clnt_list, its client
 * handles are reclaimed, the chtable headers and the zone's kstat are
 * freed, and finally the structure itself is freed.
 */
/*ARGSUSED*/
static void
clfini_zone(zoneid_t zoneid, void *arg)
{
	struct nfs_clnt *nfscl = arg;
	chhead_t *chp, *next;

	if (nfscl == NULL)
		return;
	mutex_enter(&nfs_clnt_list_lock);
	list_remove(&nfs_clnt_list, nfscl);
	mutex_exit(&nfs_clnt_list_lock);
	clreclaim_zone(nfscl, 0);
	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
		/* Every header is expected to have an empty handle list. */
		ASSERT(chp->ch_list == NULL);
		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
		/* Fetch the successor before the header is freed. */
		next = chp->ch_next;
		kmem_free(chp, sizeof (*chp));
	}
	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
	mutex_destroy(&nfscl->nfscl_chtable_lock);
	kmem_free(nfscl, sizeof (*nfscl));
}

/*
 * Called by endpnt_destructor to make sure the client handles are
 * cleaned up before the RPC endpoints.  This becomes a no-op if
 * clfini_zone (above) is called first.
This function is needed * (rather than relying on clfini_zone to clean up) because the ZSD * callbacks have no ordering mechanism, so we have no way to ensure * that clfini_zone is called before endpnt_destructor. */ void clcleanup_zone(zoneid_t zoneid) { struct nfs_clnt *nfscl; mutex_enter(&nfs_clnt_list_lock); nfscl = list_head(&nfs_clnt_list); for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { if (nfscl->nfscl_zoneid == zoneid) { clreclaim_zone(nfscl, 0); break; } } mutex_exit(&nfs_clnt_list_lock); } int nfs_subrinit(void) { int i; ulong_t nrnode_max; /* * Allocate and initialize the rnode hash queues */ if (nrnode <= 0) nrnode = ncsize; nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { zcmn_err(GLOBAL_ZONEID, CE_NOTE, "!setting nrnode to max value of %ld", nrnode_max); nrnode = nrnode_max; } rtablesize = 1 << highbit(nrnode / hashlen); rtablemask = rtablesize - 1; rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); for (i = 0; i < rtablesize; i++) { rtable[i].r_hashf = (rnode_t *)(&rtable[i]); rtable[i].r_hashb = (rnode_t *)(&rtable[i]); rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); } rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); /* * Allocate and initialize the access cache */ /* * Initial guess is one access cache entry per rnode unless * nacache is set to a non-zero value and then it is used to * indicate a guess at the number of access cache entries. 
*/ if (nacache > 0) acachesize = 1 << highbit(nacache / hashlen); else acachesize = rtablesize; acachemask = acachesize - 1; acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); for (i = 0; i < acachesize; i++) { acache[i].next = (acache_t *)&acache[i]; acache[i].prev = (acache_t *)&acache[i]; rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); } acache_cache = kmem_cache_create("nfs_access_cache", sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * Allocate and initialize the client handle cache */ chtab_cache = kmem_cache_create("client_handle_cache", sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0); /* * Initialize the list of per-zone client handles (and associated data). * This needs to be done before we call zone_key_create(). */ list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), offsetof(struct nfs_clnt, nfscl_node)); /* * Initialize the zone_key for per-zone client handle lists. */ zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); /* * Initialize the various mutexes and reader/writer locks */ mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); /* * Assign unique major number for all nfs mounts */ if ((nfs_major = getudev()) == -1) { zcmn_err(GLOBAL_ZONEID, CE_WARN, "nfs: init: can't get unique device number"); nfs_major = 0; } nfs_minor = 0; if (nfs3_jukebox_delay == 0) nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; return (0); } void nfs_subrfini(void) { int i; /* * Deallocate the rnode hash queues */ kmem_cache_destroy(rnode_cache); for (i = 0; i < rtablesize; i++) rw_destroy(&rtable[i].r_lock); kmem_free(rtable, rtablesize * sizeof (*rtable)); /* * Deallocated the access cache */ kmem_cache_destroy(acache_cache); for (i = 0; i < acachesize; i++) rw_destroy(&acache[i].lock); kmem_free(acache, acachesize * sizeof (*acache)); /* * Deallocate the client handle cache */ kmem_cache_destroy(chtab_cache); 
/* * Destroy the various mutexes and reader/writer locks */ mutex_destroy(&rpfreelist_lock); mutex_destroy(&newnum_lock); mutex_destroy(&nfs_minor_lock); (void) zone_key_delete(nfsclnt_zone_key); } enum nfsstat puterrno(int error) { switch (error) { case EOPNOTSUPP: return (NFSERR_OPNOTSUPP); case ENAMETOOLONG: return (NFSERR_NAMETOOLONG); case ENOTEMPTY: return (NFSERR_NOTEMPTY); case EDQUOT: return (NFSERR_DQUOT); case ESTALE: return (NFSERR_STALE); case EREMOTE: return (NFSERR_REMOTE); case ENOSYS: return (NFSERR_OPNOTSUPP); case EOVERFLOW: return (NFSERR_INVAL); default: return ((enum nfsstat)error); } /* NOTREACHED */ } int geterrno(enum nfsstat status) { switch (status) { case NFSERR_OPNOTSUPP: return (EOPNOTSUPP); case NFSERR_NAMETOOLONG: return (ENAMETOOLONG); case NFSERR_NOTEMPTY: return (ENOTEMPTY); case NFSERR_DQUOT: return (EDQUOT); case NFSERR_STALE: return (ESTALE); case NFSERR_REMOTE: return (EREMOTE); case NFSERR_WFLUSH: return (EIO); default: return ((int)status); } /* NOTREACHED */ } enum nfsstat3 puterrno3(int error) { #ifdef DEBUG switch (error) { case 0: return (NFS3_OK); case EPERM: return (NFS3ERR_PERM); case ENOENT: return (NFS3ERR_NOENT); case EIO: return (NFS3ERR_IO); case ENXIO: return (NFS3ERR_NXIO); case EACCES: return (NFS3ERR_ACCES); case EEXIST: return (NFS3ERR_EXIST); case EXDEV: return (NFS3ERR_XDEV); case ENODEV: return (NFS3ERR_NODEV); case ENOTDIR: return (NFS3ERR_NOTDIR); case EISDIR: return (NFS3ERR_ISDIR); case EINVAL: return (NFS3ERR_INVAL); case EFBIG: return (NFS3ERR_FBIG); case ENOSPC: return (NFS3ERR_NOSPC); case EROFS: return (NFS3ERR_ROFS); case EMLINK: return (NFS3ERR_MLINK); case ENAMETOOLONG: return (NFS3ERR_NAMETOOLONG); case ENOTEMPTY: return (NFS3ERR_NOTEMPTY); case EDQUOT: return (NFS3ERR_DQUOT); case ESTALE: return (NFS3ERR_STALE); case EREMOTE: return (NFS3ERR_REMOTE); case ENOSYS: case EOPNOTSUPP: return (NFS3ERR_NOTSUPP); case EOVERFLOW: return (NFS3ERR_INVAL); default: zcmn_err(getzoneid(), CE_WARN, 
"puterrno3: got error %d", error); return ((enum nfsstat3)error); } #else switch (error) { case ENAMETOOLONG: return (NFS3ERR_NAMETOOLONG); case ENOTEMPTY: return (NFS3ERR_NOTEMPTY); case EDQUOT: return (NFS3ERR_DQUOT); case ESTALE: return (NFS3ERR_STALE); case ENOSYS: case EOPNOTSUPP: return (NFS3ERR_NOTSUPP); case EREMOTE: return (NFS3ERR_REMOTE); case EOVERFLOW: return (NFS3ERR_INVAL); default: return ((enum nfsstat3)error); } #endif } int geterrno3(enum nfsstat3 status) { #ifdef DEBUG switch (status) { case NFS3_OK: return (0); case NFS3ERR_PERM: return (EPERM); case NFS3ERR_NOENT: return (ENOENT); case NFS3ERR_IO: return (EIO); case NFS3ERR_NXIO: return (ENXIO); case NFS3ERR_ACCES: return (EACCES); case NFS3ERR_EXIST: return (EEXIST); case NFS3ERR_XDEV: return (EXDEV); case NFS3ERR_NODEV: return (ENODEV); case NFS3ERR_NOTDIR: return (ENOTDIR); case NFS3ERR_ISDIR: return (EISDIR); case NFS3ERR_INVAL: return (EINVAL); case NFS3ERR_FBIG: return (EFBIG); case NFS3ERR_NOSPC: return (ENOSPC); case NFS3ERR_ROFS: return (EROFS); case NFS3ERR_MLINK: return (EMLINK); case NFS3ERR_NAMETOOLONG: return (ENAMETOOLONG); case NFS3ERR_NOTEMPTY: return (ENOTEMPTY); case NFS3ERR_DQUOT: return (EDQUOT); case NFS3ERR_STALE: return (ESTALE); case NFS3ERR_REMOTE: return (EREMOTE); case NFS3ERR_BADHANDLE: return (ESTALE); case NFS3ERR_NOT_SYNC: return (EINVAL); case NFS3ERR_BAD_COOKIE: return (ENOENT); case NFS3ERR_NOTSUPP: return (EOPNOTSUPP); case NFS3ERR_TOOSMALL: return (EINVAL); case NFS3ERR_SERVERFAULT: return (EIO); case NFS3ERR_BADTYPE: return (EINVAL); case NFS3ERR_JUKEBOX: return (ENXIO); default: zcmn_err(getzoneid(), CE_WARN, "geterrno3: got status %d", status); return ((int)status); } #else switch (status) { case NFS3ERR_NAMETOOLONG: return (ENAMETOOLONG); case NFS3ERR_NOTEMPTY: return (ENOTEMPTY); case NFS3ERR_DQUOT: return (EDQUOT); case NFS3ERR_STALE: case NFS3ERR_BADHANDLE: return (ESTALE); case NFS3ERR_NOTSUPP: return (EOPNOTSUPP); case NFS3ERR_REMOTE: return 
(EREMOTE); case NFS3ERR_NOT_SYNC: case NFS3ERR_TOOSMALL: case NFS3ERR_BADTYPE: return (EINVAL); case NFS3ERR_BAD_COOKIE: return (ENOENT); case NFS3ERR_SERVERFAULT: return (EIO); case NFS3ERR_JUKEBOX: return (ENXIO); default: return ((int)status); } #endif } rddir_cache * rddir_cache_alloc(int flags) { rddir_cache *rc; rc = kmem_alloc(sizeof (*rc), flags); if (rc != NULL) { rc->entries = NULL; rc->flags = RDDIR; cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); rc->count = 1; #ifdef DEBUG atomic_inc_64(&clstat_debug.dirent.value.ui64); #endif } return (rc); } static void rddir_cache_free(rddir_cache *rc) { #ifdef DEBUG atomic_dec_64(&clstat_debug.dirent.value.ui64); #endif if (rc->entries != NULL) { #ifdef DEBUG rddir_cache_buf_free(rc->entries, rc->buflen); #else kmem_free(rc->entries, rc->buflen); #endif } cv_destroy(&rc->cv); mutex_destroy(&rc->lock); kmem_free(rc, sizeof (*rc)); } void rddir_cache_hold(rddir_cache *rc) { mutex_enter(&rc->lock); rc->count++; mutex_exit(&rc->lock); } void rddir_cache_rele(rddir_cache *rc) { mutex_enter(&rc->lock); ASSERT(rc->count > 0); if (--rc->count == 0) { mutex_exit(&rc->lock); rddir_cache_free(rc); } else mutex_exit(&rc->lock); } #ifdef DEBUG char * rddir_cache_buf_alloc(size_t size, int flags) { char *rc; rc = kmem_alloc(size, flags); if (rc != NULL) atomic_add_64(&clstat_debug.dirents.value.ui64, size); return (rc); } void rddir_cache_buf_free(void *addr, size_t size) { atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); kmem_free(addr, size); } #endif static int nfs_free_data_reclaim(rnode_t *rp) { char *contents; int size; vsecattr_t *vsp; nfs3_pathconf_info *info; int freed; cred_t *cred; /* * Free any held credentials and caches which * may be associated with this rnode. 
*/ mutex_enter(&rp->r_statelock); cred = rp->r_cred; rp->r_cred = NULL; contents = rp->r_symlink.contents; size = rp->r_symlink.size; rp->r_symlink.contents = NULL; vsp = rp->r_secattr; rp->r_secattr = NULL; info = rp->r_pathconf; rp->r_pathconf = NULL; mutex_exit(&rp->r_statelock); if (cred != NULL) crfree(cred); /* * Free the access cache entries. */ freed = nfs_access_purge_rp(rp); if (!HAVE_RDDIR_CACHE(rp) && contents == NULL && vsp == NULL && info == NULL) return (freed); /* * Free the readdir cache entries */ if (HAVE_RDDIR_CACHE(rp)) nfs_purge_rddir_cache(RTOV(rp)); /* * Free the symbolic link cache. */ if (contents != NULL) { kmem_free((void *)contents, size); } /* * Free any cached ACL. */ if (vsp != NULL) nfs_acl_free(vsp); /* * Free any cached pathconf information. */ if (info != NULL) kmem_free(info, sizeof (*info)); return (1); } static int nfs_active_data_reclaim(rnode_t *rp) { char *contents; int size; vsecattr_t *vsp; nfs3_pathconf_info *info; int freed; /* * Free any held credentials and caches which * may be associated with this rnode. */ if (!mutex_tryenter(&rp->r_statelock)) return (0); contents = rp->r_symlink.contents; size = rp->r_symlink.size; rp->r_symlink.contents = NULL; vsp = rp->r_secattr; rp->r_secattr = NULL; info = rp->r_pathconf; rp->r_pathconf = NULL; mutex_exit(&rp->r_statelock); /* * Free the access cache entries. */ freed = nfs_access_purge_rp(rp); if (!HAVE_RDDIR_CACHE(rp) && contents == NULL && vsp == NULL && info == NULL) return (freed); /* * Free the readdir cache entries */ if (HAVE_RDDIR_CACHE(rp)) nfs_purge_rddir_cache(RTOV(rp)); /* * Free the symbolic link cache. */ if (contents != NULL) { kmem_free((void *)contents, size); } /* * Free any cached ACL. */ if (vsp != NULL) nfs_acl_free(vsp); /* * Free any cached pathconf information. 
*/ if (info != NULL) kmem_free(info, sizeof (*info)); return (1); } static int nfs_free_reclaim(void) { int freed; rnode_t *rp; #ifdef DEBUG clstat_debug.f_reclaim.value.ui64++; #endif freed = 0; mutex_enter(&rpfreelist_lock); rp = rpfreelist; if (rp != NULL) { do { if (nfs_free_data_reclaim(rp)) freed = 1; } while ((rp = rp->r_freef) != rpfreelist); } mutex_exit(&rpfreelist_lock); return (freed); } static int nfs_active_reclaim(void) { int freed; int index; rnode_t *rp; #ifdef DEBUG clstat_debug.a_reclaim.value.ui64++; #endif freed = 0; for (index = 0; index < rtablesize; index++) { rw_enter(&rtable[index].r_lock, RW_READER); for (rp = rtable[index].r_hashf; rp != (rnode_t *)(&rtable[index]); rp = rp->r_hashf) { if (nfs_active_data_reclaim(rp)) freed = 1; } rw_exit(&rtable[index].r_lock); } return (freed); } static int nfs_rnode_reclaim(void) { int freed; rnode_t *rp; vnode_t *vp; #ifdef DEBUG clstat_debug.r_reclaim.value.ui64++; #endif freed = 0; mutex_enter(&rpfreelist_lock); while ((rp = rpfreelist) != NULL) { rp_rmfree(rp); mutex_exit(&rpfreelist_lock); if (rp->r_flags & RHASHED) { vp = RTOV(rp); rw_enter(&rp->r_hashq->r_lock, RW_WRITER); mutex_enter(&vp->v_lock); if (vp->v_count > 1) { vp->v_count--; mutex_exit(&vp->v_lock); rw_exit(&rp->r_hashq->r_lock); mutex_enter(&rpfreelist_lock); continue; } mutex_exit(&vp->v_lock); rp_rmhash_locked(rp); rw_exit(&rp->r_hashq->r_lock); } /* * This call to rp_addfree will end up destroying the * rnode, but in a safe way with the appropriate set * of checks done. 
*/ rp_addfree(rp, CRED()); mutex_enter(&rpfreelist_lock); } mutex_exit(&rpfreelist_lock); return (freed); } /*ARGSUSED*/ static void nfs_reclaim(void *cdrarg) { #ifdef DEBUG clstat_debug.reclaim.value.ui64++; #endif if (nfs_free_reclaim()) return; if (nfs_active_reclaim()) return; (void) nfs_rnode_reclaim(); } /* * NFS client failover support * * Routines to copy filehandles */ void nfscopyfh(caddr_t fhp, vnode_t *vp) { fhandle_t *dest = (fhandle_t *)fhp; if (dest != NULL) *dest = *VTOFH(vp); } void nfs3copyfh(caddr_t fhp, vnode_t *vp) { nfs_fh3 *dest = (nfs_fh3 *)fhp; if (dest != NULL) *dest = *VTOFH3(vp); } /* * NFS client failover support * * failover_safe() will test various conditions to ensure that * failover is permitted for this vnode. It will be denied * if: * 1) the operation in progress does not support failover (NULL fi) * 2) there are no available replicas (NULL mi_servers->sv_next) * 3) any locks are outstanding on this file */ static int failover_safe(failinfo_t *fi) { /* * Does this op permit failover? */ if (fi == NULL || fi->vp == NULL) return (0); /* * Are there any alternates to failover to? */ if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) return (0); /* * Disable check; we've forced local locking * * if (flk_has_remote_locks(fi->vp)) * return (0); */ /* * If we have no partial path, we can't do anything */ if (VTOR(fi->vp)->r_path == NULL) return (0); return (1); } #include /* * NFS client failover support * * failover_newserver() will start a search for a new server, * preferably by starting an async thread to do the work. If * someone is already doing this (recognizable by MI_BINDINPROG * being set), it will simply return and the calling thread * will queue on the mi_failover_cv condition variable. 
*/
static void
failover_newserver(mntinfo_t *mi)
{
	/*
	 * Check if someone else is doing this already
	 */
	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI_BINDINPROG) {
		mutex_exit(&mi->mi_lock);
		return;
	}
	/* Claim the search; failover_thread() clears the flag when done. */
	mi->mi_flags |= MI_BINDINPROG;

	/*
	 * Need to hold the vfs struct so that it can't be released
	 * while the failover thread is selecting a new server.
	 */
	VFS_HOLD(mi->mi_vfsp);

	/*
	 * Start a thread to do the real searching.
	 */
	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);

	mutex_exit(&mi->mi_lock);
}

/*
 * NFS client failover support
 *
 * failover_thread() will find a new server to replace the one
 * currently in use, wake up other threads waiting on this mount
 * point, and die.  It will start at the head of the server list
 * and poll servers until it finds one with an NFS server which is
 * registered and responds to a NULL procedure ping.
 *
 * XXX failover_thread is unsafe within the scope of the
 * present model defined for cpr to suspend the system.
 * Specifically, over-the-wire calls made by the thread
 * are unsafe. The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	/* Only set, and only read, once oncethru is non-zero. */
	char *srvnames;
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * It's safe to piggyback on the mi_lock since failover_newserver()
	 * code guarantees that there will be only one failover thread
	 * per mountinfo at any instance.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	/* Wait on mi_failover_cv until mi_readers has dropped to zero. */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	/* Timeout passed to each null procedure call below. */
	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			/*
			 * The current server is only skipped during the
			 * first pass over the list.
			 */
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
			 */
			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
				svp = NULL;
				goto done;
			}

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
			if (error)
				continue;

			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
			"NFS%d: failing over: selecting original server %s",
					    mi->mi_vers, svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
			"NFS: failing over: selecting original server %s",
					    svp->sv_hostname);
#endif
				} else {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
					    "NFS%d: failing over from %s to %s",
					    mi->mi_vers,
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
					    "NFS: failing over from %s to %s",
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#endif
				}
				/* svp is left pointing at the responder. */
				break;
			}
		}

		if (svp == NULL) {
			/* No server answered; complain once, then retry. */
			if (!oncethru) {
				srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
				zprintf(zoneid,
				    "NFS%d servers %s not responding "
				    "still trying\n", mi->mi_vers, srvnames);
#else
				zprintf(zoneid, "NFS servers %s not responding "
				    "still trying\n", srvnames);
#endif
				oncethru = 1;
			}
			/* Sleep for hz ticks between passes, CPR-safe. */
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			mutex_exit(&mi->mi_lock);
			delay(hz);
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
			mutex_exit(&mi->mi_lock);
		}
	}

	if (oncethru) {
#ifdef DEBUG
		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
	}

	if (svp != mi->mi_curr_serv) {
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		/*
		 * Look up the rnode hashed under the old server's
		 * sv_fhandle and, if it is found, rehash it under the new
		 * server's sv_fhandle.
		 */
		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
		    mi->mi_vfsp);
		if (rp != NULL) {
			if (rp->r_flags & RHASHED)
				rp_rmhash_locked(rp);
			rw_exit(&rtable[index].r_lock);
			rp->r_server = svp;
			rp->r_fh = svp->sv_fhandle;
			(void) nfs_free_data_reclaim(rp);
			index = rtablehash(&rp->r_fh);
			rp->r_hashq = &rtable[index];
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			vn_exists(RTOV(rp));
			rp_addhash(rp);
			rw_exit(&rp->r_hashq->r_lock);
			/* Drop the reference returned by rfind(). */
			VN_RELE(RTOV(rp));
		} else
			rw_exit(&rtable[index].r_lock);
	}

done:
	if (oncethru)
		kmem_free(srvnames, srvnames_len);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags &= ~MI_BINDINPROG;
	/* svp is NULL here only when the search was abandoned above. */
	if (svp != NULL) {
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
#ifdef DEBUG
	nfscl->nfscl_stat.failover.value.ui64++;
#endif
	}
	/* Wake the threads sleeping on mi_failover_cv. */
	cv_broadcast(&mi->mi_failover_cv);
	/* mi_lock is not released explicitly after CALLB_CPR_EXIT(). */
	CALLB_CPR_EXIT(&cprinfo);
	/* Matches the VFS_HOLD() taken in failover_newserver(). */
	VFS_RELE(mi->mi_vfsp);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * NFS client failover support
 *
 * failover_wait() will put the thread to sleep until MI_BINDINPROG
 * is cleared, meaning that failover is complete.  Called with
 * mi_lock mutex held.
 *
 * Returns 0 once MI_BINDINPROG is clear, or EINTR if cv_wait_sig()
 * returns zero.
 */
static int
failover_wait(mntinfo_t *mi)
{
	k_sigset_t smask;

	/*
	 * If someone else is hunting for a living server,
	 * sleep until it's done.  After our sleep, we may
	 * be bound to the right server and get off cheaply.
	 */
	while (mi->mi_flags & MI_BINDINPROG) {
		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM. (Preserving the existing masks).
		 * Mask out SIGINT if mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI_INT);
		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
			/*
			 * restore original signal mask
			 */
			sigunintr(&smask);
			return (EINTR);
		}
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);
	}
	return (0);
}

/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *    pointed to by the fi->fhp pointer if it is non-NULL.
 */

static int
failover_remap(failinfo_t *fi)
{
	vnode_t *vp, *nvp, *rootvp;
	rnode_t *rp, *nrp;
	mntinfo_t *mi;
	int error;
#ifdef DEBUG
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif
	/*
	 * Sanity check
	 */
	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
		return (EINVAL);
	vp = fi->vp;
	rp = VTOR(vp);
	mi = VTOMI(vp);

	if (!(vp->v_flag & VROOT)) {
		/*
		 * Given the root fh, use the path stored in
		 * the rnode to find the fh for the new server.
		 */
		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (error)
			return (error);

		error = failover_lookup(rp->r_path, rootvp,
		    fi->lookupproc, fi->xattrdirproc, &nvp);

		VN_RELE(rootvp);

		if (error)
			return (error);

		/*
		 * If we found the same rnode, we're done now
		 */
		if (nvp == vp) {
			/*
			 * Failed and the new server may physically be same
			 * OR may share a same disk subsystem. In this case
			 * file handle for a particular file path is not going
			 * to change, given the same filehandle lookup will
			 * always locate the same rnode as the existing one.
			 * All we might need to do is to update the r_server
			 * with the current servinfo.
*/ if (!VALID_FH(fi)) { rp->r_server = mi->mi_curr_serv; } VN_RELE(nvp); return (0); } /* * Try to make it so that no one else will find this * vnode because it is just a temporary to hold the * new file handle until that file handle can be * copied to the original vnode/rnode. */ nrp = VTOR(nvp); mutex_enter(&mi->mi_remap_lock); /* * Some other thread could have raced in here and could * have done the remap for this particular rnode before * this thread here. Check for rp->r_server and * mi->mi_curr_serv and return if they are same. */ if (VALID_FH(fi)) { mutex_exit(&mi->mi_remap_lock); VN_RELE(nvp); return (0); } if (nrp->r_flags & RHASHED) rp_rmhash(nrp); /* * As a heuristic check on the validity of the new * file, check that the size and type match against * that we remember from the old version. */ if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) { mutex_exit(&mi->mi_remap_lock); zcmn_err(mi->mi_zone->zone_id, CE_WARN, "NFS replicas %s and %s: file %s not same.", rp->r_server->sv_hostname, nrp->r_server->sv_hostname, rp->r_path); VN_RELE(nvp); return (EINVAL); } /* * snarf the filehandle from the new rnode * then release it, again while updating the * hash queues for the rnode. */ if (rp->r_flags & RHASHED) rp_rmhash(rp); rp->r_server = mi->mi_curr_serv; rp->r_fh = nrp->r_fh; rp->r_hashq = nrp->r_hashq; /* * Copy the attributes from the new rnode to the old * rnode. This will help to reduce unnecessary page * cache flushes. */ rp->r_attr = nrp->r_attr; rp->r_attrtime = nrp->r_attrtime; rp->r_mtime = nrp->r_mtime; (void) nfs_free_data_reclaim(rp); nfs_setswaplike(vp, &rp->r_attr); rw_enter(&rp->r_hashq->r_lock, RW_WRITER); rp_addhash(rp); rw_exit(&rp->r_hashq->r_lock); mutex_exit(&mi->mi_remap_lock); VN_RELE(nvp); } /* * Update successful failover remap count */ mutex_enter(&mi->mi_lock); mi->mi_remap++; mutex_exit(&mi->mi_lock); #ifdef DEBUG nfscl->nfscl_stat.remap.value.ui64++; #endif /* * If we have a copied filehandle to update, do it now. 
*/ if (fi->fhp != NULL && fi->copyproc != NULL) (*fi->copyproc)(fi->fhp, vp); return (0); } /* * NFS client failover support * * We want a simple pathname lookup routine to parse the pieces * of path in rp->r_path. We know that the path was a created * as rnodes were made, so we know we have only to deal with * paths that look like: * dir1/dir2/dir3/file * Any evidence of anything like .., symlinks, and ENOTDIR * are hard errors, because they mean something in this filesystem * is different from the one we came from, or has changed under * us in some way. If this is true, we want the failure. * * Extended attributes: if the filesystem is mounted with extended * attributes enabled (-o xattr), the attribute directory will be * represented in the r_path as the magic name XATTR_RPATH. So if * we see that name in the pathname, is must be because this node * is an extended attribute. Therefore, look it up that way. */ static int failover_lookup(char *path, vnode_t *root, int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int, vnode_t *, cred_t *, int), int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int), vnode_t **new) { vnode_t *dvp, *nvp; int error = EINVAL; char *s, *p, *tmppath; size_t len; mntinfo_t *mi; bool_t xattr; /* Make local copy of path */ len = strlen(path) + 1; tmppath = kmem_alloc(len, KM_SLEEP); (void) strcpy(tmppath, path); s = tmppath; dvp = root; VN_HOLD(dvp); mi = VTOMI(root); xattr = mi->mi_flags & MI_EXTATTR; do { p = strchr(s, '/'); if (p != NULL) *p = '\0'; if (xattr && strcmp(s, XATTR_RPATH) == 0) { error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(), RFSCALL_SOFT); } else { error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL, CRED(), RFSCALL_SOFT); } if (p != NULL) *p++ = '/'; if (error) { VN_RELE(dvp); kmem_free(tmppath, len); return (error); } s = p; VN_RELE(dvp); dvp = nvp; } while (p != NULL); if (nvp != NULL && new != NULL) *new = nvp; kmem_free(tmppath, len); return (0); } /* * NFS client failover support * * 
sv_free() frees the malloc'd portion of a "servinfo_t". */ void sv_free(servinfo_t *svp) { servinfo_t *next; struct knetconfig *knconf; while (svp != NULL) { next = svp->sv_next; if (svp->sv_secdata) sec_clnt_freeinfo(svp->sv_secdata); if (svp->sv_hostname && svp->sv_hostnamelen > 0) kmem_free(svp->sv_hostname, svp->sv_hostnamelen); knconf = svp->sv_knconf; if (knconf != NULL) { if (knconf->knc_protofmly != NULL) kmem_free(knconf->knc_protofmly, KNC_STRSIZE); if (knconf->knc_proto != NULL) kmem_free(knconf->knc_proto, KNC_STRSIZE); kmem_free(knconf, sizeof (*knconf)); } knconf = svp->sv_origknconf; if (knconf != NULL) { if (knconf->knc_protofmly != NULL) kmem_free(knconf->knc_protofmly, KNC_STRSIZE); if (knconf->knc_proto != NULL) kmem_free(knconf->knc_proto, KNC_STRSIZE); kmem_free(knconf, sizeof (*knconf)); } if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0) kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen); mutex_destroy(&svp->sv_lock); kmem_free(svp, sizeof (*svp)); svp = next; } } /* * Only can return non-zero if intr != 0. */ int nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr) { mutex_enter(&l->lock); /* * If this is a nested enter, then allow it. There * must be as many exits as enters through. */ if (l->owner == curthread) { /* lock is held for writing by current thread */ ASSERT(rw == RW_READER || rw == RW_WRITER); l->count--; } else if (rw == RW_READER) { /* * While there is a writer active or writers waiting, * then wait for them to finish up and move on. Then, * increment the count to indicate that a reader is * active. 
*/ while (l->count < 0 || l->waiters > 0) { if (intr) { klwp_t *lwp = ttolwp(curthread); if (lwp != NULL) lwp->lwp_nostop++; if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) { if (lwp != NULL) lwp->lwp_nostop--; mutex_exit(&l->lock); return (EINTR); } if (lwp != NULL) lwp->lwp_nostop--; } else cv_wait(&l->cv_rd, &l->lock); } ASSERT(l->count < INT_MAX); #ifdef DEBUG if ((l->count % 10000) == 9999) cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on" "rwlock @ %p\n", l->count, (void *)&l); #endif l->count++; } else { ASSERT(rw == RW_WRITER); /* * While there are readers active or a writer * active, then wait for all of the readers * to finish or for the writer to finish. * Then, set the owner field to curthread and * decrement count to indicate that a writer * is active. */ while (l->count != 0) { l->waiters++; if (intr) { klwp_t *lwp = ttolwp(curthread); if (lwp != NULL) lwp->lwp_nostop++; if (cv_wait_sig(&l->cv, &l->lock) == 0) { if (lwp != NULL) lwp->lwp_nostop--; l->waiters--; /* * If there are readers active and no * writers waiting then wake up all of * the waiting readers (if any). */ if (l->count > 0 && l->waiters == 0) cv_broadcast(&l->cv_rd); mutex_exit(&l->lock); return (EINTR); } if (lwp != NULL) lwp->lwp_nostop--; } else cv_wait(&l->cv, &l->lock); l->waiters--; } ASSERT(l->owner == NULL); l->owner = curthread; l->count--; } mutex_exit(&l->lock); return (0); } /* * If the lock is available, obtain it and return non-zero. If there is * already a conflicting lock, return 0 immediately. */ int nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw) { mutex_enter(&l->lock); /* * If this is a nested enter, then allow it. There * must be as many exits as enters through. */ if (l->owner == curthread) { /* lock is held for writing by current thread */ ASSERT(rw == RW_READER || rw == RW_WRITER); l->count--; } else if (rw == RW_READER) { /* * If there is a writer active or writers waiting, deny the * lock. Otherwise, bump the count of readers. 
*/ if (l->count < 0 || l->waiters > 0) { mutex_exit(&l->lock); return (0); } l->count++; } else { ASSERT(rw == RW_WRITER); /* * If there are readers active or a writer active, deny the * lock. Otherwise, set the owner field to curthread and * decrement count to indicate that a writer is active. */ if (l->count != 0) { mutex_exit(&l->lock); return (0); } ASSERT(l->owner == NULL); l->owner = curthread; l->count--; } mutex_exit(&l->lock); return (1); } void nfs_rw_exit(nfs_rwlock_t *l) { mutex_enter(&l->lock); if (l->owner != NULL) { ASSERT(l->owner == curthread); /* * To release a writer lock increment count to indicate that * there is one less writer active. If this was the last of * possibly nested writer locks, then clear the owner field as * well to indicate that there is no writer active. */ ASSERT(l->count < 0); l->count++; if (l->count == 0) { l->owner = NULL; /* * If there are no writers waiting then wakeup all of * the waiting readers (if any). */ if (l->waiters == 0) cv_broadcast(&l->cv_rd); } } else { /* * To release a reader lock just decrement count to indicate * that there is one less reader active. */ ASSERT(l->count > 0); l->count--; } /* * If there are no readers active nor a writer active and there is a * writer waiting we need to wake up it. 
*/ if (l->count == 0 && l->waiters > 0) cv_signal(&l->cv); mutex_exit(&l->lock); } int nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw) { if (rw == RW_READER) return (l->count > 0); ASSERT(rw == RW_WRITER); return (l->count < 0); } /* ARGSUSED */ void nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg) { l->count = 0; l->waiters = 0; l->owner = NULL; mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l->cv, NULL, CV_DEFAULT, NULL); cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL); } void nfs_rw_destroy(nfs_rwlock_t *l) { mutex_destroy(&l->lock); cv_destroy(&l->cv); cv_destroy(&l->cv_rd); } int nfs3_rddir_compar(const void *x, const void *y) { rddir_cache *a = (rddir_cache *)x; rddir_cache *b = (rddir_cache *)y; if (a->nfs3_cookie == b->nfs3_cookie) { if (a->buflen == b->buflen) return (0); if (a->buflen < b->buflen) return (-1); return (1); } if (a->nfs3_cookie < b->nfs3_cookie) return (-1); return (1); } int nfs_rddir_compar(const void *x, const void *y) { rddir_cache *a = (rddir_cache *)x; rddir_cache *b = (rddir_cache *)y; if (a->nfs_cookie == b->nfs_cookie) { if (a->buflen == b->buflen) return (0); if (a->buflen < b->buflen) return (-1); return (1); } if (a->nfs_cookie < b->nfs_cookie) return (-1); return (1); } static char * nfs_getsrvnames(mntinfo_t *mi, size_t *len) { servinfo_t *s; char *srvnames; char *namep; size_t length; /* * Calculate the length of the string required to hold all * of the server names plus either a comma or a null * character following each individual one. */ length = 0; for (s = mi->mi_servers; s != NULL; s = s->sv_next) length += s->sv_hostnamelen; srvnames = kmem_alloc(length, KM_SLEEP); namep = srvnames; for (s = mi->mi_servers; s != NULL; s = s->sv_next) { (void) strcpy(namep, s->sv_hostname); namep += s->sv_hostnamelen - 1; *namep++ = ','; } *--namep = '\0'; *len = length; return (srvnames); } /* * These two functions are temporary and designed for the upgrade-workaround * only. 
They cannot be used for general zone-crossing NFS client support, and * will be removed shortly. * * When the workaround is enabled, all NFS traffic is forced into the global * zone. These functions are called when the code needs to refer to the state * of the underlying network connection. They're not called when the function * needs to refer to the state of the process that invoked the system call. * (E.g., when checking whether the zone is shutting down during the mount() * call.) */ struct zone * nfs_zone(void) { return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone); } zoneid_t nfs_zoneid(void) { return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid()); } /* * nfs_mount_label_policy: * Determine whether the mount is allowed according to MAC check, * by comparing (where appropriate) label of the remote server * against the label of the zone being mounted into. * * Returns: * 0 : access allowed * -1 : read-only access allowed (i.e., read-down) * >0 : error code, such as EACCES */ int nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr, struct knetconfig *knconf, cred_t *cr) { int addr_type; void *ipaddr; bslabel_t *server_sl, *mntlabel; zone_t *mntzone = NULL; ts_label_t *zlabel; tsol_tpc_t *tp; ts_label_t *tsl = NULL; int retv; /* * Get the zone's label. Each zone on a labeled system has a label. */ mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); zlabel = mntzone->zone_slabel; ASSERT(zlabel != NULL); label_hold(zlabel); if (strcmp(knconf->knc_protofmly, NC_INET) == 0) { addr_type = IPV4_VERSION; ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr; } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) { addr_type = IPV6_VERSION; ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr; } else { retv = 0; goto out; } retv = EACCES; /* assume the worst */ /* * Next, get the assigned label of the remote server. 
*/ tp = find_tpc(ipaddr, addr_type, B_FALSE); if (tp == NULL) goto out; /* error getting host entry */ if (tp->tpc_tp.tp_doi != zlabel->tsl_doi) goto rel_tpc; /* invalid domain */ if ((tp->tpc_tp.host_type != SUN_CIPSO) && (tp->tpc_tp.host_type != UNLABELED)) goto rel_tpc; /* invalid hosttype */ if (tp->tpc_tp.host_type == SUN_CIPSO) { tsl = getflabel_cipso(vfsp); if (tsl == NULL) goto rel_tpc; /* error getting server lbl */ server_sl = label2bslabel(tsl); } else { /* UNLABELED */ server_sl = &tp->tpc_tp.tp_def_label; } mntlabel = label2bslabel(zlabel); /* * Now compare labels to complete the MAC check. If the labels * are equal or if the requestor is in the global zone and has * NET_MAC_AWARE, then allow read-write access. (Except for * mounts into the global zone itself; restrict these to * read-only.) * * If the requestor is in some other zone, but his label * dominates the server, then allow read-down. * * Otherwise, access is denied. */ if (blequal(mntlabel, server_sl) || (crgetzoneid(cr) == GLOBAL_ZONEID && getpflags(NET_MAC_AWARE, cr) != 0)) { if ((mntzone == global_zone) || !blequal(mntlabel, server_sl)) retv = -1; /* read-only */ else retv = 0; /* access OK */ } else if (bldominates(mntlabel, server_sl)) { retv = -1; /* read-only */ } else { retv = EACCES; } if (tsl != NULL) label_rele(tsl); rel_tpc: TPC_RELE(tp); out: if (mntzone) zone_rele(mntzone); label_rele(zlabel); return (retv); } boolean_t nfs_has_ctty(void) { boolean_t rv; mutex_enter(&curproc->p_splock); rv = (curproc->p_sessp->s_vp != NULL); mutex_exit(&curproc->p_splock); return (rv); } /* * See if xattr directory to see if it has any generic user attributes */ int do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr) { struct uio uio; struct iovec iov; char *dbuf; struct dirent64 *dp; size_t dlen = 8 * 1024; size_t dbuflen; int eof = 0; int error; *valp = 0; dbuf = kmem_alloc(dlen, KM_SLEEP); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_segflg = UIO_SYSSPACE; uio.uio_fmode = 0; 
uio.uio_extflg = UIO_COPY_CACHED; uio.uio_loffset = 0; uio.uio_resid = dlen; iov.iov_base = dbuf; iov.iov_len = dlen; (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0); VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); dbuflen = dlen - uio.uio_resid; if (error || dbuflen == 0) { kmem_free(dbuf, dlen); return (error); } dp = (dirent64_t *)dbuf; while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) { if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name, VIEW_READWRITE) == 0 || strcmp(dp->d_name, VIEW_READONLY) == 0) { dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen); continue; } *valp = 1; break; } kmem_free(dbuf, dlen); return (0); }