xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs_subr.c (revision 92034044e95e6f6e8fb6a3dddf68ddc2561c6870)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28  */
29 
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/systm.h>
33 #include <sys/cred.h>
34 #include <sys/proc.h>
35 #include <sys/user.h>
36 #include <sys/time.h>
37 #include <sys/buf.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/socket.h>
41 #include <sys/uio.h>
42 #include <sys/tiuser.h>
43 #include <sys/swap.h>
44 #include <sys/errno.h>
45 #include <sys/debug.h>
46 #include <sys/kmem.h>
47 #include <sys/kstat.h>
48 #include <sys/cmn_err.h>
49 #include <sys/vtrace.h>
50 #include <sys/session.h>
51 #include <sys/dnlc.h>
52 #include <sys/bitmap.h>
53 #include <sys/acl.h>
54 #include <sys/ddi.h>
55 #include <sys/pathname.h>
56 #include <sys/flock.h>
57 #include <sys/dirent.h>
58 #include <sys/flock.h>
59 #include <sys/callb.h>
60 #include <sys/atomic.h>
61 #include <sys/list.h>
62 #include <sys/tsol/tnet.h>
63 #include <sys/priv.h>
64 #include <sys/sdt.h>
65 #include <sys/attr.h>
66 
67 #include <inet/ip6.h>
68 
69 #include <rpc/types.h>
70 #include <rpc/xdr.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73 
74 #include <nfs/nfs.h>
75 #include <nfs/nfs4.h>
76 #include <nfs/nfs_clnt.h>
77 #include <nfs/rnode.h>
78 #include <nfs/nfs_acl.h>
79 
80 #include <sys/tsol/label.h>
81 
/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue must be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information that can be reused
 * about the file.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */

/*
 * rnode hash queues.  NOTE(review): presumably an array of rtablesize
 * buckets indexed through rtablemask; the initialization is not visible
 * in this part of the file -- confirm there.
 */
static rhashq_t *rtable;

/* Head of the rnode freelist and the single mutex that protects it. */
static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;
141 
/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 *
 * NOTE(review): acachesize/acachemask look like the bucket count and
 * index mask for the acache hash table, mirroring rtablesize/rtablemask;
 * the code that sets them is outside this view -- confirm there.
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;
163 
164 /*
165  * Client side utilities
166  */
167 
/*
 * client side statistics
 *
 * Template of named kstat counters.  The entry names line up with the
 * nfscl_stat members updated in this file (calls, clgets, cltoomany and,
 * on DEBUG kernels only, clalloc).  The last four counters exist only
 * when DEBUG is defined, so code touching them must be under
 * #ifdef DEBUG as well.
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};
183 
/*
 * The following statistics describe the behavior of the system as a whole
 * and do not correspond to any one particular zone.
 *
 * The initializer entries must stay in the same order as the members of
 * struct clstat_debug, since they are matched positionally.
 */
#ifdef DEBUG
static struct clstat_debug {
	kstat_named_t	nrnode;			/* number of allocated rnodes */
	kstat_named_t	access;			/* size of access cache */
	kstat_named_t	dirent;			/* size of readdir cache */
	kstat_named_t	dirents;		/* size of readdir buf cache */
	kstat_named_t	reclaim;		/* number of reclaims */
	kstat_named_t	clreclaim;		/* number of cl reclaims */
	kstat_named_t	f_reclaim;		/* number of free reclaims */
	kstat_named_t	a_reclaim;		/* number of active reclaims */
	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
	kstat_named_t	rpath;			/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	/* exported name is "r_path" although the member is called rpath */
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */
213 
/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 *
 * nfs_clnt_list_lock is held while the list is walked (see clreclaim()).
 * nfsclnt_zone_key is the key handed to zone_getspecific() to find the
 * calling zone's struct nfs_clnt.
 */
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;

/* kmem cache from which struct chtab (client handle) entries are taken */
static struct kmem_cache *chtab_cache;
223 
/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system.
 *
 * Defaults to 0 (not set).
 */
int nfs_disable_rddir_cache = 0;
231 
/*
 * Forward declarations.  clget() and clfree() have external linkage;
 * everything else declared here is private to this file.
 */
int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int	failover_remap(failinfo_t *);
static int	failover_lookup(char *, vnode_t *,
		    int (*)(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *, int),
		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
		    vnode_t **);
static void	nfs_free_r_path(rnode_t *);
static void	nfs_set_vroot(vnode_t *);
static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
279 
/*
 * from rpcsec module (common/rpcsec)
 *
 * sec_clnt_geth() stores an AUTH handle through its last argument and
 * returns an error number; sec_clnt_freeh() releases such a handle
 * (see clget_impl() and clfree_impl() for the uses in this file).
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);

/*
 * used in mount policy
 */
extern ts_label_t *getflabel_cipso(vfs_t *);
291 
/*
 * EIO or EINTR are not recoverable errors.
 *
 * Evaluates to 1 for every other error value and to 0 for EIO/EINTR.
 * Both the argument and the whole expansion are parenthesized so the
 * macro is safe with arguments such as "e & mask" or "c ? a : b" and
 * inside larger expressions.  Note that "error" is evaluated twice, so
 * do not pass an expression with side effects.
 */
#define	IS_RECOVERABLE_ERROR(error)	\
	(!(((error) == EINTR) || ((error) == EIO)))
296 
/*
 * Console messages used while a server is slow or unreachable.
 *
 * The two builds take different arguments: the DEBUG format strings
 * consume an int (%d) followed by a string (%s), while the non-DEBUG
 * ones consume only the string.  Callers must pass arguments matching
 * the variant being compiled.
 */
#ifdef DEBUG
#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
#else
#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
#endif
304 /*
305  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
306  */
307 static int
clget_impl(clinfo_t * ci,servinfo_t * svp,cred_t * cr,CLIENT ** newcl,struct chtab ** chp,struct nfs_clnt * nfscl)308 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
309     struct chtab **chp, struct nfs_clnt *nfscl)
310 {
311 	struct chhead *ch, *newch;
312 	struct chhead **plistp;
313 	struct chtab *cp;
314 	int error;
315 	k_sigset_t smask;
316 
317 	if (newcl == NULL || chp == NULL || ci == NULL)
318 		return (EINVAL);
319 
320 	*newcl = NULL;
321 	*chp = NULL;
322 
323 	/*
324 	 * Find an unused handle or create one
325 	 */
326 	newch = NULL;
327 	nfscl->nfscl_stat.clgets.value.ui64++;
328 top:
329 	/*
330 	 * Find the correct entry in the cache to check for free
331 	 * client handles.  The search is based on the RPC program
332 	 * number, program version number, dev_t for the transport
333 	 * device, and the protocol family.
334 	 */
335 	mutex_enter(&nfscl->nfscl_chtable_lock);
336 	plistp = &nfscl->nfscl_chtable;
337 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
338 		if (ch->ch_prog == ci->cl_prog &&
339 		    ch->ch_vers == ci->cl_vers &&
340 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
341 		    (strcmp(ch->ch_protofmly,
342 		    svp->sv_knconf->knc_protofmly) == 0))
343 			break;
344 		plistp = &ch->ch_next;
345 	}
346 
347 	/*
348 	 * If we didn't find a cache entry for this quadruple, then
349 	 * create one.  If we don't have one already preallocated,
350 	 * then drop the cache lock, create one, and then start over.
351 	 * If we did have a preallocated entry, then just add it to
352 	 * the front of the list.
353 	 */
354 	if (ch == NULL) {
355 		if (newch == NULL) {
356 			mutex_exit(&nfscl->nfscl_chtable_lock);
357 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
358 			newch->ch_timesused = 0;
359 			newch->ch_prog = ci->cl_prog;
360 			newch->ch_vers = ci->cl_vers;
361 			newch->ch_dev = svp->sv_knconf->knc_rdev;
362 			newch->ch_protofmly = kmem_alloc(
363 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
364 			    KM_SLEEP);
365 			(void) strcpy(newch->ch_protofmly,
366 			    svp->sv_knconf->knc_protofmly);
367 			newch->ch_list = NULL;
368 			goto top;
369 		}
370 		ch = newch;
371 		newch = NULL;
372 		ch->ch_next = nfscl->nfscl_chtable;
373 		nfscl->nfscl_chtable = ch;
374 	/*
375 	 * We found a cache entry, but if it isn't on the front of the
376 	 * list, then move it to the front of the list to try to take
377 	 * advantage of locality of operations.
378 	 */
379 	} else if (ch != nfscl->nfscl_chtable) {
380 		*plistp = ch->ch_next;
381 		ch->ch_next = nfscl->nfscl_chtable;
382 		nfscl->nfscl_chtable = ch;
383 	}
384 
385 	/*
386 	 * If there was a free client handle cached, then remove it
387 	 * from the list, init it, and use it.
388 	 */
389 	if (ch->ch_list != NULL) {
390 		cp = ch->ch_list;
391 		ch->ch_list = cp->ch_list;
392 		mutex_exit(&nfscl->nfscl_chtable_lock);
393 		if (newch != NULL) {
394 			kmem_free(newch->ch_protofmly,
395 			    strlen(newch->ch_protofmly) + 1);
396 			kmem_free(newch, sizeof (*newch));
397 		}
398 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
399 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
400 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
401 		    &cp->ch_client->cl_auth);
402 		if (error || cp->ch_client->cl_auth == NULL) {
403 			CLNT_DESTROY(cp->ch_client);
404 			kmem_cache_free(chtab_cache, cp);
405 			return ((error != 0) ? error : EINTR);
406 		}
407 		ch->ch_timesused++;
408 		*newcl = cp->ch_client;
409 		*chp = cp;
410 		return (0);
411 	}
412 
413 	/*
414 	 * There weren't any free client handles which fit, so allocate
415 	 * a new one and use that.
416 	 */
417 #ifdef DEBUG
418 	atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
419 #endif
420 	mutex_exit(&nfscl->nfscl_chtable_lock);
421 
422 	nfscl->nfscl_stat.cltoomany.value.ui64++;
423 	if (newch != NULL) {
424 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
425 		kmem_free(newch, sizeof (*newch));
426 	}
427 
428 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
429 	cp->ch_head = ch;
430 
431 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
432 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
433 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
434 	sigunintr(&smask);
435 
436 	if (error != 0) {
437 		kmem_cache_free(chtab_cache, cp);
438 #ifdef DEBUG
439 		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
440 #endif
441 		/*
442 		 * Warning is unnecessary if error is EINTR.
443 		 */
444 		if (error != EINTR) {
445 			nfs_cmn_err(error, CE_WARN,
446 			    "clget: couldn't create handle: %m\n");
447 		}
448 		return (error);
449 	}
450 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
451 	auth_destroy(cp->ch_client->cl_auth);
452 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
453 	    &cp->ch_client->cl_auth);
454 	if (error || cp->ch_client->cl_auth == NULL) {
455 		CLNT_DESTROY(cp->ch_client);
456 		kmem_cache_free(chtab_cache, cp);
457 #ifdef DEBUG
458 		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
459 #endif
460 		return ((error != 0) ? error : EINTR);
461 	}
462 	ch->ch_timesused++;
463 	*newcl = cp->ch_client;
464 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
465 	*chp = cp;
466 	return (0);
467 }
468 
469 int
clget(clinfo_t * ci,servinfo_t * svp,cred_t * cr,CLIENT ** newcl,struct chtab ** chp)470 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
471     struct chtab **chp)
472 {
473 	struct nfs_clnt *nfscl;
474 
475 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
476 	ASSERT(nfscl != NULL);
477 
478 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
479 }
480 
481 static int
acl_clget(mntinfo_t * mi,servinfo_t * svp,cred_t * cr,CLIENT ** newcl,struct chtab ** chp,struct nfs_clnt * nfscl)482 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
483     struct chtab **chp, struct nfs_clnt *nfscl)
484 {
485 	clinfo_t ci;
486 	int error;
487 
488 	/*
489 	 * Set read buffer size to rsize
490 	 * and add room for RPC headers.
491 	 */
492 	ci.cl_readsize = mi->mi_tsize;
493 	if (ci.cl_readsize != 0)
494 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
495 
496 	/*
497 	 * If soft mount and server is down just try once.
498 	 * meaning: do not retransmit.
499 	 */
500 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
501 		ci.cl_retrans = 0;
502 	else
503 		ci.cl_retrans = mi->mi_retrans;
504 
505 	ci.cl_prog = NFS_ACL_PROGRAM;
506 	ci.cl_vers = mi->mi_vers;
507 	ci.cl_flags = mi->mi_flags;
508 
509 	/*
510 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
511 	 * security flavor, the client tries to establish a security context
512 	 * by contacting the server. If the connection is timed out or reset,
513 	 * e.g. server reboot, we will try again.
514 	 */
515 	do {
516 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
517 
518 		if (error == 0)
519 			break;
520 
521 		/*
522 		 * For forced unmount or zone shutdown, bail out, no retry.
523 		 */
524 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
525 			error = EIO;
526 			break;
527 		}
528 
529 		/* do not retry for softmount */
530 		if (!(mi->mi_flags & MI_HARD))
531 			break;
532 
533 		/* let the caller deal with the failover case */
534 		if (FAILOVER_MOUNT(mi))
535 			break;
536 
537 	} while (error == ETIMEDOUT || error == ECONNRESET);
538 
539 	return (error);
540 }
541 
542 static int
nfs_clget(mntinfo_t * mi,servinfo_t * svp,cred_t * cr,CLIENT ** newcl,struct chtab ** chp,struct nfs_clnt * nfscl)543 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
544     struct chtab **chp, struct nfs_clnt *nfscl)
545 {
546 	clinfo_t ci;
547 	int error;
548 
549 	/*
550 	 * Set read buffer size to rsize
551 	 * and add room for RPC headers.
552 	 */
553 	ci.cl_readsize = mi->mi_tsize;
554 	if (ci.cl_readsize != 0)
555 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
556 
557 	/*
558 	 * If soft mount and server is down just try once.
559 	 * meaning: do not retransmit.
560 	 */
561 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
562 		ci.cl_retrans = 0;
563 	else
564 		ci.cl_retrans = mi->mi_retrans;
565 
566 	ci.cl_prog = mi->mi_prog;
567 	ci.cl_vers = mi->mi_vers;
568 	ci.cl_flags = mi->mi_flags;
569 
570 	/*
571 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
572 	 * security flavor, the client tries to establish a security context
573 	 * by contacting the server. If the connection is timed out or reset,
574 	 * e.g. server reboot, we will try again.
575 	 */
576 	do {
577 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
578 
579 		if (error == 0)
580 			break;
581 
582 		/*
583 		 * For forced unmount or zone shutdown, bail out, no retry.
584 		 */
585 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
586 			error = EIO;
587 			break;
588 		}
589 
590 		/* do not retry for softmount */
591 		if (!(mi->mi_flags & MI_HARD))
592 			break;
593 
594 		/* let the caller deal with the failover case */
595 		if (FAILOVER_MOUNT(mi))
596 			break;
597 
598 	} while (error == ETIMEDOUT || error == ECONNRESET);
599 
600 	return (error);
601 }
602 
603 static void
clfree_impl(CLIENT * cl,struct chtab * cp,struct nfs_clnt * nfscl)604 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
605 {
606 	if (cl->cl_auth != NULL) {
607 		sec_clnt_freeh(cl->cl_auth);
608 		cl->cl_auth = NULL;
609 	}
610 
611 	/*
612 	 * Timestamp this cache entry so that we know when it was last
613 	 * used.
614 	 */
615 	cp->ch_freed = gethrestime_sec();
616 
617 	/*
618 	 * Add the free client handle to the front of the list.
619 	 * This way, the list will be sorted in youngest to oldest
620 	 * order.
621 	 */
622 	mutex_enter(&nfscl->nfscl_chtable_lock);
623 	cp->ch_list = cp->ch_head->ch_list;
624 	cp->ch_head->ch_list = cp;
625 	mutex_exit(&nfscl->nfscl_chtable_lock);
626 }
627 
628 void
clfree(CLIENT * cl,struct chtab * cp)629 clfree(CLIENT *cl, struct chtab *cp)
630 {
631 	struct nfs_clnt *nfscl;
632 
633 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
634 	ASSERT(nfscl != NULL);
635 
636 	clfree_impl(cl, cp, nfscl);
637 }
638 
639 #define	CL_HOLDTIME	60	/* time to hold client handles */
640 
641 static void
clreclaim_zone(struct nfs_clnt * nfscl,uint_t cl_holdtime)642 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
643 {
644 	struct chhead *ch;
645 	struct chtab *cp;	/* list of objects that can be reclaimed */
646 	struct chtab *cpe;
647 	struct chtab *cpl;
648 	struct chtab **cpp;
649 #ifdef DEBUG
650 	int n = 0;
651 #endif
652 
653 	/*
654 	 * Need to reclaim some memory, so step through the cache
655 	 * looking through the lists for entries which can be freed.
656 	 */
657 	cp = NULL;
658 
659 	mutex_enter(&nfscl->nfscl_chtable_lock);
660 
661 	/*
662 	 * Here we step through each non-NULL quadruple and start to
663 	 * construct the reclaim list pointed to by cp.  Note that
664 	 * cp will contain all eligible chtab entries.  When this traversal
665 	 * completes, chtab entries from the last quadruple will be at the
666 	 * front of cp and entries from previously inspected quadruples have
667 	 * been appended to the rear of cp.
668 	 */
669 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
670 		if (ch->ch_list == NULL)
671 			continue;
672 		/*
673 		 * Search each list for entries older then
674 		 * cl_holdtime seconds.  The lists are maintained
675 		 * in youngest to oldest order so that when the
676 		 * first entry is found which is old enough, then
677 		 * all of the rest of the entries on the list will
678 		 * be old enough as well.
679 		 */
680 		cpl = ch->ch_list;
681 		cpp = &ch->ch_list;
682 		while (cpl != NULL &&
683 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
684 			cpp = &cpl->ch_list;
685 			cpl = cpl->ch_list;
686 		}
687 		if (cpl != NULL) {
688 			*cpp = NULL;
689 			if (cp != NULL) {
690 				cpe = cpl;
691 				while (cpe->ch_list != NULL)
692 					cpe = cpe->ch_list;
693 				cpe->ch_list = cp;
694 			}
695 			cp = cpl;
696 		}
697 	}
698 
699 	mutex_exit(&nfscl->nfscl_chtable_lock);
700 
701 	/*
702 	 * If cp is empty, then there is nothing to reclaim here.
703 	 */
704 	if (cp == NULL)
705 		return;
706 
707 	/*
708 	 * Step through the list of entries to free, destroying each client
709 	 * handle and kmem_free'ing the memory for each entry.
710 	 */
711 	while (cp != NULL) {
712 #ifdef DEBUG
713 		n++;
714 #endif
715 		CLNT_DESTROY(cp->ch_client);
716 		cpl = cp->ch_list;
717 		kmem_cache_free(chtab_cache, cp);
718 		cp = cpl;
719 	}
720 
721 #ifdef DEBUG
722 	/*
723 	 * Update clalloc so that nfsstat shows the current number
724 	 * of allocated client handles.
725 	 */
726 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
727 #endif
728 }
729 
730 /* ARGSUSED */
731 static void
clreclaim(void * all)732 clreclaim(void *all)
733 {
734 	struct nfs_clnt *nfscl;
735 
736 #ifdef DEBUG
737 	clstat_debug.clreclaim.value.ui64++;
738 #endif
739 	/*
740 	 * The system is low on memory; go through and try to reclaim some from
741 	 * every zone on the system.
742 	 */
743 	mutex_enter(&nfs_clnt_list_lock);
744 	nfscl = list_head(&nfs_clnt_list);
745 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
746 		clreclaim_zone(nfscl, CL_HOLDTIME);
747 	mutex_exit(&nfs_clnt_list_lock);
748 }
749 
/*
 * Minimum time-out values indexed by call type.
 * These units are in "eighths" of a second to avoid multiplies.
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};
757 
/*
 * Back off for retransmission timeout.  MAXTIMO is expressed in clock
 * ticks (hz ticks per second), i.e. 20 seconds.
 *
 * backoff(tim) doubles tim, capped at MAXTIMO, and leaves a value that
 * is already at or above MAXTIMO unchanged.  Both backoff() and
 * dobackoff() evaluate their argument more than once, so do not pass an
 * expression with side effects.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
/* note: evaluated left to right as (hz / 3) * 8 in integer arithmetic */
#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
768 
769 /*
770  * Function called when rfscall notices that we have been
771  * re-transmitting, or when we get a response without retransmissions.
772  * Return 1 if the transfer size was adjusted down - 0 if no change.
773  */
774 static int
nfs_feedback(int flag,int which,mntinfo_t * mi)775 nfs_feedback(int flag, int which, mntinfo_t *mi)
776 {
777 	int kind;
778 	int r = 0;
779 
780 	mutex_enter(&mi->mi_lock);
781 	if (flag == FEEDBACK_REXMIT1) {
782 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
783 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
784 			goto done;
785 		if (mi->mi_curread > MIN_NFS_TSIZE) {
786 			mi->mi_curread /= 2;
787 			if (mi->mi_curread < MIN_NFS_TSIZE)
788 				mi->mi_curread = MIN_NFS_TSIZE;
789 			r = 1;
790 		}
791 
792 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
793 			mi->mi_curwrite /= 2;
794 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
795 				mi->mi_curwrite = MIN_NFS_TSIZE;
796 			r = 1;
797 		}
798 	} else if (flag == FEEDBACK_OK) {
799 		kind = mi->mi_timer_type[which];
800 		if (kind == 0 ||
801 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
802 			goto done;
803 		if (kind == 1) {
804 			if (mi->mi_curread >= mi->mi_tsize)
805 				goto done;
806 			mi->mi_curread +=  MIN_NFS_TSIZE;
807 			if (mi->mi_curread > mi->mi_tsize/2)
808 				mi->mi_curread = mi->mi_tsize;
809 		} else if (kind == 2) {
810 			if (mi->mi_curwrite >= mi->mi_stsize)
811 				goto done;
812 			mi->mi_curwrite += MIN_NFS_TSIZE;
813 			if (mi->mi_curwrite > mi->mi_stsize/2)
814 				mi->mi_curwrite = mi->mi_stsize;
815 		}
816 	}
817 done:
818 	mutex_exit(&mi->mi_lock);
819 	return (r);
820 }
821 
822 #ifdef DEBUG
823 static int rfs2call_hits = 0;
824 static int rfs2call_misses = 0;
825 #endif
826 
827 int
rfs2call(mntinfo_t * mi,rpcproc_t which,xdrproc_t xdrargs,caddr_t argsp,xdrproc_t xdrres,caddr_t resp,cred_t * cr,int * douprintf,enum nfsstat * statusp,int flags,failinfo_t * fi)828 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
829     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
830     enum nfsstat *statusp, int flags, failinfo_t *fi)
831 {
832 	int rpcerror;
833 	enum clnt_stat rpc_status;
834 
835 	ASSERT(statusp != NULL);
836 
837 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
838 	    cr, douprintf, &rpc_status, flags, fi);
839 	if (!rpcerror) {
840 		/*
841 		 * See crnetadjust() for comments.
842 		 */
843 		if (*statusp == NFSERR_ACCES &&
844 		    (cr = crnetadjust(cr)) != NULL) {
845 #ifdef DEBUG
846 			rfs2call_hits++;
847 #endif
848 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
849 			    resp, cr, douprintf, NULL, flags, fi);
850 			crfree(cr);
851 #ifdef DEBUG
852 			if (*statusp == NFSERR_ACCES)
853 				rfs2call_misses++;
854 #endif
855 		}
856 	} else if (rpc_status == RPC_PROCUNAVAIL) {
857 		*statusp = NFSERR_OPNOTSUPP;
858 		rpcerror = 0;
859 	}
860 
861 	return (rpcerror);
862 }
863 
/*
 * Default delay, in clock ticks, before a call that failed with
 * NFS3ERR_JUKEBOX is tried again (10 seconds).  The expansion is
 * parenthesized so it keeps its value inside a larger expression.
 */
#define	NFS3_JUKEBOX_DELAY	(10 * hz)

/*
 * Delay actually passed to delay() by rfs3call().  It starts out as 0;
 * NOTE(review): presumably set to NFS3_JUKEBOX_DELAY by initialization
 * code outside this part of the file -- confirm there.
 */
static clock_t nfs3_jukebox_delay = 0;

#ifdef DEBUG
/*
 * rfs3call_hits counts calls retried with the credential returned by
 * crnetadjust(); rfs3call_misses counts those retries that still ended
 * in NFS3ERR_ACCES.
 */
static int rfs3call_hits = 0;
static int rfs3call_misses = 0;
#endif
872 
873 int
rfs3call(mntinfo_t * mi,rpcproc_t which,xdrproc_t xdrargs,caddr_t argsp,xdrproc_t xdrres,caddr_t resp,cred_t * cr,int * douprintf,nfsstat3 * statusp,int flags,failinfo_t * fi)874 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
875     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
876     nfsstat3 *statusp, int flags, failinfo_t *fi)
877 {
878 	int rpcerror;
879 	int user_informed;
880 
881 	user_informed = 0;
882 	do {
883 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
884 		    cr, douprintf, NULL, flags, fi);
885 		if (!rpcerror) {
886 			cred_t *crr;
887 			if (*statusp == NFS3ERR_JUKEBOX) {
888 				if (ttoproc(curthread) == &p0) {
889 					rpcerror = EAGAIN;
890 					break;
891 				}
892 				if (!user_informed) {
893 					user_informed = 1;
894 					uprintf(
895 		"file temporarily unavailable on the server, retrying...\n");
896 				}
897 				delay(nfs3_jukebox_delay);
898 			}
899 			/*
900 			 * See crnetadjust() for comments.
901 			 */
902 			else if (*statusp == NFS3ERR_ACCES &&
903 			    (crr = crnetadjust(cr)) != NULL) {
904 #ifdef DEBUG
905 				rfs3call_hits++;
906 #endif
907 				rpcerror = rfscall(mi, which, xdrargs, argsp,
908 				    xdrres, resp, crr, douprintf,
909 				    NULL, flags, fi);
910 
911 				crfree(crr);
912 #ifdef DEBUG
913 				if (*statusp == NFS3ERR_ACCES)
914 					rfs3call_misses++;
915 #endif
916 			}
917 		}
918 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
919 
920 	return (rpcerror);
921 }
922 
/*
 * VALID_FH(fi) is true when the rnode behind fi->vp was last associated
 * with the server the mount is currently using (r_server equals
 * mi_curr_serv).
 *
 * INC_READERS/DEC_READERS adjust mi_readers; DEC_READERS wakes every
 * thread waiting on mi_failover_cv when the count drops to zero.  The
 * callers in this file hold mi->mi_lock around both.  Each macro expands
 * to a brace-enclosed block and is meant to be used as a statement of
 * its own.
 *
 * Macro arguments are parenthesized so that any pointer expression can
 * be passed.
 */
#define	VALID_FH(fi)	\
	(VTOR((fi)->vp)->r_server == VTOMI((fi)->vp)->mi_curr_serv)
#define	INC_READERS(mi)		{ \
	(mi)->mi_readers++; \
}
#define	DEC_READERS(mi)		{ \
	(mi)->mi_readers--; \
	if ((mi)->mi_readers == 0) \
		cv_broadcast(&(mi)->mi_failover_cv); \
}
932 
933 static int
rfscall(mntinfo_t * mi,rpcproc_t which,xdrproc_t xdrargs,caddr_t argsp,xdrproc_t xdrres,caddr_t resp,cred_t * icr,int * douprintf,enum clnt_stat * rpc_status,int flags,failinfo_t * fi)934 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
935     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
936     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
937 {
938 	CLIENT *client;
939 	struct chtab *ch;
940 	cred_t *cr = icr;
941 	enum clnt_stat status;
942 	struct rpc_err rpcerr, rpcerr_tmp;
943 	struct timeval wait;
944 	int timeo;		/* in units of hz */
945 	int my_rsize, my_wsize;
946 	bool_t tryagain;
947 	bool_t cred_cloned = FALSE;
948 	k_sigset_t smask;
949 	servinfo_t *svp;
950 	struct nfs_clnt *nfscl;
951 	zoneid_t zoneid = getzoneid();
952 	char *msg;
953 #ifdef DEBUG
954 	char *bufp;
955 #endif
956 
957 
958 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
959 	    "rfscall_start:which %d mi %p", which, mi);
960 
961 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
962 	ASSERT(nfscl != NULL);
963 
964 	nfscl->nfscl_stat.calls.value.ui64++;
965 	mi->mi_reqs[which].value.ui64++;
966 
967 	rpcerr.re_status = RPC_SUCCESS;
968 
969 	/*
970 	 * In case of forced unmount or zone shutdown, return EIO.
971 	 */
972 
973 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
974 		rpcerr.re_status = RPC_FAILED;
975 		rpcerr.re_errno = EIO;
976 		return (rpcerr.re_errno);
977 	}
978 
979 	/*
980 	 * Remember the transfer sizes in case
981 	 * nfs_feedback changes them underneath us.
982 	 */
983 	my_rsize = mi->mi_curread;
984 	my_wsize = mi->mi_curwrite;
985 
986 	/*
987 	 * NFS client failover support
988 	 *
989 	 * If this rnode is not in sync with the current server (VALID_FH),
990 	 * we'd like to do a remap to get in sync.  We can be interrupted
991 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
992 	 * use the best info we have to try the RPC.  Part of that is
993 	 * unconditionally updating the filehandle copy kept for V3.
994 	 *
995 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
996 	 * rw_enter(); we're trying to keep the current server from being
997 	 * changed on us until we're done with the remapping and have a
998 	 * matching client handle.  We don't want to sending a filehandle
999 	 * to the wrong host.
1000 	 */
1001 failoverretry:
1002 	if (FAILOVER_MOUNT(mi)) {
1003 		mutex_enter(&mi->mi_lock);
1004 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1005 			if (failover_wait(mi)) {
1006 				mutex_exit(&mi->mi_lock);
1007 				return (EINTR);
1008 			}
1009 		}
1010 		INC_READERS(mi);
1011 		mutex_exit(&mi->mi_lock);
1012 		if (fi) {
1013 			if (!VALID_FH(fi) &&
1014 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1015 				int remaperr;
1016 
1017 				svp = mi->mi_curr_serv;
1018 				remaperr = failover_remap(fi);
1019 				if (remaperr != 0) {
1020 #ifdef DEBUG
1021 					if (remaperr != EINTR)
1022 						nfs_cmn_err(remaperr, CE_WARN,
1023 					    "rfscall couldn't failover: %m");
1024 #endif
1025 					mutex_enter(&mi->mi_lock);
1026 					DEC_READERS(mi);
1027 					mutex_exit(&mi->mi_lock);
1028 					/*
1029 					 * If failover_remap returns ETIMEDOUT
1030 					 * and the filesystem is hard mounted
1031 					 * we have to retry the call with a new
1032 					 * server.
1033 					 */
1034 					if ((mi->mi_flags & MI_HARD) &&
1035 					    IS_RECOVERABLE_ERROR(remaperr)) {
1036 						if (svp == mi->mi_curr_serv)
1037 							failover_newserver(mi);
1038 						rpcerr.re_status = RPC_SUCCESS;
1039 						goto failoverretry;
1040 					}
1041 					rpcerr.re_errno = remaperr;
1042 					return (remaperr);
1043 				}
1044 			}
1045 			if (fi->fhp && fi->copyproc)
1046 				(*fi->copyproc)(fi->fhp, fi->vp);
1047 		}
1048 	}
1049 
1050 	/* For TSOL, use a new cred which has net_mac_aware flag */
1051 	if (!cred_cloned && is_system_labeled()) {
1052 		cred_cloned = TRUE;
1053 		cr = crdup(icr);
1054 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1055 	}
1056 
1057 	/*
1058 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1059 	 * are guaranteed to reprocess the retry as a new request.
1060 	 */
1061 	svp = mi->mi_curr_serv;
1062 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1063 
1064 	if (FAILOVER_MOUNT(mi)) {
1065 		mutex_enter(&mi->mi_lock);
1066 		DEC_READERS(mi);
1067 		mutex_exit(&mi->mi_lock);
1068 
1069 		if ((rpcerr.re_errno == ETIMEDOUT ||
1070 		    rpcerr.re_errno == ECONNRESET) &&
1071 		    failover_safe(fi)) {
1072 			if (svp == mi->mi_curr_serv)
1073 				failover_newserver(mi);
1074 			goto failoverretry;
1075 		}
1076 	}
1077 	if (rpcerr.re_errno != 0)
1078 		return (rpcerr.re_errno);
1079 
1080 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1081 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1082 		timeo = (mi->mi_timeo * hz) / 10;
1083 	} else {
1084 		mutex_enter(&mi->mi_lock);
1085 		timeo = CLNT_SETTIMERS(client,
1086 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1087 		    &(mi->mi_timers[NFS_CALLTYPES]),
1088 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1089 		    (void (*)())NULL, (caddr_t)mi, 0);
1090 		mutex_exit(&mi->mi_lock);
1091 	}
1092 
1093 	/*
1094 	 * If hard mounted fs, retry call forever unless hard error occurs.
1095 	 */
1096 	do {
1097 		tryagain = FALSE;
1098 
1099 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1100 			status = RPC_FAILED;
1101 			rpcerr.re_status = RPC_FAILED;
1102 			rpcerr.re_errno = EIO;
1103 			break;
1104 		}
1105 
1106 		TICK_TO_TIMEVAL(timeo, &wait);
1107 
1108 		/*
1109 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1110 		 * and SIGTERM. (Preserving the existing masks).
1111 		 * Mask out SIGINT if mount option nointr is specified.
1112 		 */
1113 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1114 		if (!(mi->mi_flags & MI_INT))
1115 			client->cl_nosignal = TRUE;
1116 
1117 		/*
1118 		 * If there is a current signal, then don't bother
1119 		 * even trying to send out the request because we
1120 		 * won't be able to block waiting for the response.
1121 		 * Simply assume RPC_INTR and get on with it.
1122 		 */
1123 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1124 			status = RPC_INTR;
1125 		else {
1126 			status = CLNT_CALL(client, which, xdrargs, argsp,
1127 			    xdrres, resp, wait);
1128 		}
1129 
1130 		if (!(mi->mi_flags & MI_INT))
1131 			client->cl_nosignal = FALSE;
1132 		/*
1133 		 * restore original signal mask
1134 		 */
1135 		sigunintr(&smask);
1136 
1137 		switch (status) {
1138 		case RPC_SUCCESS:
1139 			if ((mi->mi_flags & MI_DYNAMIC) &&
1140 			    mi->mi_timer_type[which] != 0 &&
1141 			    (mi->mi_curread != my_rsize ||
1142 			    mi->mi_curwrite != my_wsize))
1143 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1144 			break;
1145 
1146 		case RPC_INTR:
1147 			/*
1148 			 * There is no way to recover from this error,
1149 			 * even if mount option nointr is specified.
1150 			 * SIGKILL, for example, cannot be blocked.
1151 			 */
1152 			rpcerr.re_status = RPC_INTR;
1153 			rpcerr.re_errno = EINTR;
1154 			break;
1155 
1156 		case RPC_UDERROR:
1157 			/*
1158 			 * If the NFS server is local (vold) and
1159 			 * it goes away then we get RPC_UDERROR.
1160 			 * This is a retryable error, so we would
1161 			 * loop, so check to see if the specific
1162 			 * error was ECONNRESET, indicating that
1163 			 * target did not exist at all.  If so,
1164 			 * return with RPC_PROGUNAVAIL and
1165 			 * ECONNRESET to indicate why.
1166 			 */
1167 			CLNT_GETERR(client, &rpcerr);
1168 			if (rpcerr.re_errno == ECONNRESET) {
1169 				rpcerr.re_status = RPC_PROGUNAVAIL;
1170 				rpcerr.re_errno = ECONNRESET;
1171 				break;
1172 			}
1173 			/*FALLTHROUGH*/
1174 
1175 		default:		/* probably RPC_TIMEDOUT */
1176 			if (IS_UNRECOVERABLE_RPC(status))
1177 				break;
1178 
1179 			/*
1180 			 * increment server not responding count
1181 			 */
1182 			mutex_enter(&mi->mi_lock);
1183 			mi->mi_noresponse++;
1184 			mutex_exit(&mi->mi_lock);
1185 #ifdef DEBUG
1186 			nfscl->nfscl_stat.noresponse.value.ui64++;
1187 #endif
1188 
1189 			if (!(mi->mi_flags & MI_HARD)) {
1190 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1191 				    (mi->mi_ss_call_type[which] == 0))
1192 					break;
1193 			}
1194 
1195 			/*
1196 			 * The call is in progress (over COTS).
1197 			 * Try the CLNT_CALL again, but don't
1198 			 * print a noisy error message.
1199 			 */
1200 			if (status == RPC_INPROGRESS) {
1201 				tryagain = TRUE;
1202 				break;
1203 			}
1204 
1205 			if (flags & RFSCALL_SOFT)
1206 				break;
1207 
1208 			/*
1209 			 * On zone shutdown, just move on.
1210 			 */
1211 			if (zone_status_get(curproc->p_zone) >=
1212 			    ZONE_IS_SHUTTING_DOWN) {
1213 				rpcerr.re_status = RPC_FAILED;
1214 				rpcerr.re_errno = EIO;
1215 				break;
1216 			}
1217 
1218 			/*
1219 			 * NFS client failover support
1220 			 *
1221 			 * If the current server just failed us, we'll
1222 			 * start the process of finding a new server.
1223 			 * After that, we can just retry.
1224 			 */
1225 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1226 				if (svp == mi->mi_curr_serv)
1227 					failover_newserver(mi);
1228 				clfree_impl(client, ch, nfscl);
1229 				goto failoverretry;
1230 			}
1231 
1232 			tryagain = TRUE;
1233 			timeo = backoff(timeo);
1234 
1235 			CLNT_GETERR(client, &rpcerr_tmp);
1236 			if ((status == RPC_CANTSEND) &&
1237 			    (rpcerr_tmp.re_errno == ENOBUFS))
1238 				msg = SRV_QFULL_MSG;
1239 			else
1240 				msg = SRV_NOTRESP_MSG;
1241 
1242 			mutex_enter(&mi->mi_lock);
1243 			if (!(mi->mi_flags & MI_PRINTED)) {
1244 				mi->mi_flags |= MI_PRINTED;
1245 				mutex_exit(&mi->mi_lock);
1246 #ifdef DEBUG
1247 				zprintf(zoneid, msg, mi->mi_vers,
1248 				    svp->sv_hostname);
1249 #else
1250 				zprintf(zoneid, msg, svp->sv_hostname);
1251 #endif
1252 			} else
1253 				mutex_exit(&mi->mi_lock);
1254 			if (*douprintf && nfs_has_ctty()) {
1255 				*douprintf = 0;
1256 				if (!(mi->mi_flags & MI_NOPRINT))
1257 #ifdef DEBUG
1258 					uprintf(msg, mi->mi_vers,
1259 					    svp->sv_hostname);
1260 #else
1261 					uprintf(msg, svp->sv_hostname);
1262 #endif
1263 			}
1264 
1265 			/*
1266 			 * If doing dynamic adjustment of transfer
1267 			 * size and if it's a read or write call
1268 			 * and if the transfer size changed while
1269 			 * retransmitting or if the feedback routine
1270 			 * changed the transfer size,
1271 			 * then exit rfscall so that the transfer
1272 			 * size can be adjusted at the vnops level.
1273 			 */
1274 			if ((mi->mi_flags & MI_DYNAMIC) &&
1275 			    mi->mi_timer_type[which] != 0 &&
1276 			    (mi->mi_curread != my_rsize ||
1277 			    mi->mi_curwrite != my_wsize ||
1278 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1279 				/*
1280 				 * On read or write calls, return
1281 				 * back to the vnode ops level if
1282 				 * the transfer size changed.
1283 				 */
1284 				clfree_impl(client, ch, nfscl);
1285 				if (cred_cloned)
1286 					crfree(cr);
1287 				return (ENFS_TRYAGAIN);
1288 			}
1289 		}
1290 	} while (tryagain);
1291 
1292 	if (status != RPC_SUCCESS) {
1293 		/*
1294 		 * Let soft mounts use the timed out message.
1295 		 */
1296 		if (status == RPC_INPROGRESS)
1297 			status = RPC_TIMEDOUT;
1298 		nfscl->nfscl_stat.badcalls.value.ui64++;
1299 		if (status != RPC_INTR) {
1300 			mutex_enter(&mi->mi_lock);
1301 			mi->mi_flags |= MI_DOWN;
1302 			mutex_exit(&mi->mi_lock);
1303 			CLNT_GETERR(client, &rpcerr);
1304 #ifdef DEBUG
1305 			bufp = clnt_sperror(client, svp->sv_hostname);
1306 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1307 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1308 			if (nfs_has_ctty()) {
1309 				if (!(mi->mi_flags & MI_NOPRINT)) {
1310 					uprintf("NFS%d %s failed for %s\n",
1311 					    mi->mi_vers, mi->mi_rfsnames[which],
1312 					    bufp);
1313 				}
1314 			}
1315 			kmem_free(bufp, MAXPATHLEN);
1316 #else
1317 			zprintf(zoneid,
1318 			    "NFS %s failed for server %s: error %d (%s)\n",
1319 			    mi->mi_rfsnames[which], svp->sv_hostname,
1320 			    status, clnt_sperrno(status));
1321 			if (nfs_has_ctty()) {
1322 				if (!(mi->mi_flags & MI_NOPRINT)) {
1323 					uprintf(
1324 				"NFS %s failed for server %s: error %d (%s)\n",
1325 					    mi->mi_rfsnames[which],
1326 					    svp->sv_hostname, status,
1327 					    clnt_sperrno(status));
1328 				}
1329 			}
1330 #endif
1331 			/*
1332 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1333 			 * re_errno is set appropriately depending on
1334 			 * the authentication error
1335 			 */
1336 			if (status == RPC_VERSMISMATCH ||
1337 			    status == RPC_PROGVERSMISMATCH)
1338 				rpcerr.re_errno = EIO;
1339 		}
1340 	} else {
1341 		/*
1342 		 * Test the value of mi_down and mi_printed without
1343 		 * holding the mi_lock mutex.  If they are both zero,
1344 		 * then it is okay to skip the down and printed
1345 		 * processing.  This saves on a mutex_enter and
1346 		 * mutex_exit pair for a normal, successful RPC.
1347 		 * This was just complete overhead.
1348 		 */
1349 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1350 			mutex_enter(&mi->mi_lock);
1351 			mi->mi_flags &= ~MI_DOWN;
1352 			if (mi->mi_flags & MI_PRINTED) {
1353 				mi->mi_flags &= ~MI_PRINTED;
1354 				mutex_exit(&mi->mi_lock);
1355 #ifdef DEBUG
1356 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1357 				zprintf(zoneid, "NFS%d server %s ok\n",
1358 				    mi->mi_vers, svp->sv_hostname);
1359 #else
1360 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1361 				zprintf(zoneid, "NFS server %s ok\n",
1362 				    svp->sv_hostname);
1363 #endif
1364 			} else
1365 				mutex_exit(&mi->mi_lock);
1366 		}
1367 
1368 		if (*douprintf == 0) {
1369 			if (!(mi->mi_flags & MI_NOPRINT))
1370 #ifdef DEBUG
1371 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1372 					uprintf("NFS%d server %s ok\n",
1373 					    mi->mi_vers, svp->sv_hostname);
1374 #else
1375 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1376 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1377 #endif
1378 			*douprintf = 1;
1379 		}
1380 	}
1381 
1382 	clfree_impl(client, ch, nfscl);
1383 	if (cred_cloned)
1384 		crfree(cr);
1385 
1386 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1387 
1388 	if (rpc_status != NULL)
1389 		*rpc_status = rpcerr.re_status;
1390 
1391 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1392 	    rpcerr.re_errno);
1393 
1394 	return (rpcerr.re_errno);
1395 }
1396 
1397 #ifdef DEBUG
1398 static int acl2call_hits = 0;
1399 static int acl2call_misses = 0;
1400 #endif
1401 
1402 int
acl2call(mntinfo_t * mi,rpcproc_t which,xdrproc_t xdrargs,caddr_t argsp,xdrproc_t xdrres,caddr_t resp,cred_t * cr,int * douprintf,enum nfsstat * statusp,int flags,failinfo_t * fi)1403 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1404     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1405     enum nfsstat *statusp, int flags, failinfo_t *fi)
1406 {
1407 	int rpcerror;
1408 
1409 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1410 	    cr, douprintf, flags, fi);
1411 	if (!rpcerror) {
1412 		/*
1413 		 * See comments with crnetadjust().
1414 		 */
1415 		if (*statusp == NFSERR_ACCES &&
1416 		    (cr = crnetadjust(cr)) != NULL) {
1417 #ifdef DEBUG
1418 			acl2call_hits++;
1419 #endif
1420 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1421 			    resp, cr, douprintf, flags, fi);
1422 			crfree(cr);
1423 #ifdef DEBUG
1424 			if (*statusp == NFSERR_ACCES)
1425 				acl2call_misses++;
1426 #endif
1427 		}
1428 	}
1429 
1430 	return (rpcerror);
1431 }
1432 
1433 #ifdef DEBUG
1434 static int acl3call_hits = 0;
1435 static int acl3call_misses = 0;
1436 #endif
1437 
1438 int
acl3call(mntinfo_t * mi,rpcproc_t which,xdrproc_t xdrargs,caddr_t argsp,xdrproc_t xdrres,caddr_t resp,cred_t * cr,int * douprintf,nfsstat3 * statusp,int flags,failinfo_t * fi)1439 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1440     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1441     nfsstat3 *statusp, int flags, failinfo_t *fi)
1442 {
1443 	int rpcerror;
1444 	int user_informed;
1445 
1446 	user_informed = 0;
1447 
1448 	do {
1449 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1450 		    cr, douprintf, flags, fi);
1451 		if (!rpcerror) {
1452 			cred_t *crr;
1453 			if (*statusp == NFS3ERR_JUKEBOX) {
1454 				if (!user_informed) {
1455 					user_informed = 1;
1456 					uprintf(
1457 		"file temporarily unavailable on the server, retrying...\n");
1458 				}
1459 				delay(nfs3_jukebox_delay);
1460 			}
1461 			/*
1462 			 * See crnetadjust() for comments.
1463 			 */
1464 			else if (*statusp == NFS3ERR_ACCES &&
1465 			    (crr = crnetadjust(cr)) != NULL) {
1466 #ifdef DEBUG
1467 				acl3call_hits++;
1468 #endif
1469 				rpcerror = aclcall(mi, which, xdrargs, argsp,
1470 				    xdrres, resp, crr, douprintf, flags, fi);
1471 
1472 				crfree(crr);
1473 #ifdef DEBUG
1474 				if (*statusp == NFS3ERR_ACCES)
1475 					acl3call_misses++;
1476 #endif
1477 			}
1478 		}
1479 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1480 
1481 	return (rpcerror);
1482 }
1483 
1484 static int
aclcall(mntinfo_t * mi,rpcproc_t which,xdrproc_t xdrargs,caddr_t argsp,xdrproc_t xdrres,caddr_t resp,cred_t * icr,int * douprintf,int flags,failinfo_t * fi)1485 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1486     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1487     int flags, failinfo_t *fi)
1488 {
1489 	CLIENT *client;
1490 	struct chtab *ch;
1491 	cred_t *cr = icr;
1492 	bool_t cred_cloned = FALSE;
1493 	enum clnt_stat status;
1494 	struct rpc_err rpcerr;
1495 	struct timeval wait;
1496 	int timeo;		/* in units of hz */
1497 #if 0 /* notyet */
1498 	int my_rsize, my_wsize;
1499 #endif
1500 	bool_t tryagain;
1501 	k_sigset_t smask;
1502 	servinfo_t *svp;
1503 	struct nfs_clnt *nfscl;
1504 	zoneid_t zoneid = getzoneid();
1505 #ifdef DEBUG
1506 	char *bufp;
1507 #endif
1508 
1509 #if 0 /* notyet */
1510 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1511 	    "rfscall_start:which %d mi %p", which, mi);
1512 #endif
1513 
1514 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1515 	ASSERT(nfscl != NULL);
1516 
1517 	nfscl->nfscl_stat.calls.value.ui64++;
1518 	mi->mi_aclreqs[which].value.ui64++;
1519 
1520 	rpcerr.re_status = RPC_SUCCESS;
1521 
1522 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1523 		rpcerr.re_status = RPC_FAILED;
1524 		rpcerr.re_errno = EIO;
1525 		return (rpcerr.re_errno);
1526 	}
1527 
1528 #if 0 /* notyet */
1529 	/*
1530 	 * Remember the transfer sizes in case
1531 	 * nfs_feedback changes them underneath us.
1532 	 */
1533 	my_rsize = mi->mi_curread;
1534 	my_wsize = mi->mi_curwrite;
1535 #endif
1536 
1537 	/*
1538 	 * NFS client failover support
1539 	 *
1540 	 * If this rnode is not in sync with the current server (VALID_FH),
1541 	 * we'd like to do a remap to get in sync.  We can be interrupted
1542 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1543 	 * use the best info we have to try the RPC.  Part of that is
1544 	 * unconditionally updating the filehandle copy kept for V3.
1545 	 *
1546 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1547 	 * rw_enter(); we're trying to keep the current server from being
1548 	 * changed on us until we're done with the remapping and have a
1549 	 * matching client handle.  We don't want to sending a filehandle
1550 	 * to the wrong host.
1551 	 */
1552 failoverretry:
1553 	if (FAILOVER_MOUNT(mi)) {
1554 		mutex_enter(&mi->mi_lock);
1555 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1556 			if (failover_wait(mi)) {
1557 				mutex_exit(&mi->mi_lock);
1558 				return (EINTR);
1559 			}
1560 		}
1561 		INC_READERS(mi);
1562 		mutex_exit(&mi->mi_lock);
1563 		if (fi) {
1564 			if (!VALID_FH(fi) &&
1565 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1566 				int remaperr;
1567 
1568 				svp = mi->mi_curr_serv;
1569 				remaperr = failover_remap(fi);
1570 				if (remaperr != 0) {
1571 #ifdef DEBUG
1572 					if (remaperr != EINTR)
1573 						nfs_cmn_err(remaperr, CE_WARN,
1574 					    "aclcall couldn't failover: %m");
1575 #endif
1576 					mutex_enter(&mi->mi_lock);
1577 					DEC_READERS(mi);
1578 					mutex_exit(&mi->mi_lock);
1579 
1580 					/*
1581 					 * If failover_remap returns ETIMEDOUT
1582 					 * and the filesystem is hard mounted
1583 					 * we have to retry the call with a new
1584 					 * server.
1585 					 */
1586 					if ((mi->mi_flags & MI_HARD) &&
1587 					    IS_RECOVERABLE_ERROR(remaperr)) {
1588 						if (svp == mi->mi_curr_serv)
1589 							failover_newserver(mi);
1590 						rpcerr.re_status = RPC_SUCCESS;
1591 						goto failoverretry;
1592 					}
1593 					return (remaperr);
1594 				}
1595 			}
1596 			if (fi->fhp && fi->copyproc)
1597 				(*fi->copyproc)(fi->fhp, fi->vp);
1598 		}
1599 	}
1600 
1601 	/* For TSOL, use a new cred which has net_mac_aware flag */
1602 	if (!cred_cloned && is_system_labeled()) {
1603 		cred_cloned = TRUE;
1604 		cr = crdup(icr);
1605 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1606 	}
1607 
1608 	/*
1609 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1610 	 * are guaranteed to reprocess the retry as a new request.
1611 	 */
1612 	svp = mi->mi_curr_serv;
1613 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1614 	if (FAILOVER_MOUNT(mi)) {
1615 		mutex_enter(&mi->mi_lock);
1616 		DEC_READERS(mi);
1617 		mutex_exit(&mi->mi_lock);
1618 
1619 		if ((rpcerr.re_errno == ETIMEDOUT ||
1620 		    rpcerr.re_errno == ECONNRESET) &&
1621 		    failover_safe(fi)) {
1622 			if (svp == mi->mi_curr_serv)
1623 				failover_newserver(mi);
1624 			goto failoverretry;
1625 		}
1626 	}
1627 	if (rpcerr.re_errno != 0) {
1628 		if (cred_cloned)
1629 			crfree(cr);
1630 		return (rpcerr.re_errno);
1631 	}
1632 
1633 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1634 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1635 		timeo = (mi->mi_timeo * hz) / 10;
1636 	} else {
1637 		mutex_enter(&mi->mi_lock);
1638 		timeo = CLNT_SETTIMERS(client,
1639 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1640 		    &(mi->mi_timers[NFS_CALLTYPES]),
1641 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1642 		    (void (*)()) 0, (caddr_t)mi, 0);
1643 		mutex_exit(&mi->mi_lock);
1644 	}
1645 
1646 	/*
1647 	 * If hard mounted fs, retry call forever unless hard error occurs.
1648 	 */
1649 	do {
1650 		tryagain = FALSE;
1651 
1652 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1653 			status = RPC_FAILED;
1654 			rpcerr.re_status = RPC_FAILED;
1655 			rpcerr.re_errno = EIO;
1656 			break;
1657 		}
1658 
1659 		TICK_TO_TIMEVAL(timeo, &wait);
1660 
1661 		/*
1662 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1663 		 * and SIGTERM. (Preserving the existing masks).
1664 		 * Mask out SIGINT if mount option nointr is specified.
1665 		 */
1666 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1667 		if (!(mi->mi_flags & MI_INT))
1668 			client->cl_nosignal = TRUE;
1669 
1670 		/*
1671 		 * If there is a current signal, then don't bother
1672 		 * even trying to send out the request because we
1673 		 * won't be able to block waiting for the response.
1674 		 * Simply assume RPC_INTR and get on with it.
1675 		 */
1676 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1677 			status = RPC_INTR;
1678 		else {
1679 			status = CLNT_CALL(client, which, xdrargs, argsp,
1680 			    xdrres, resp, wait);
1681 		}
1682 
1683 		if (!(mi->mi_flags & MI_INT))
1684 			client->cl_nosignal = FALSE;
1685 		/*
1686 		 * restore original signal mask
1687 		 */
1688 		sigunintr(&smask);
1689 
1690 		switch (status) {
1691 		case RPC_SUCCESS:
1692 #if 0 /* notyet */
1693 			if ((mi->mi_flags & MI_DYNAMIC) &&
1694 			    mi->mi_timer_type[which] != 0 &&
1695 			    (mi->mi_curread != my_rsize ||
1696 			    mi->mi_curwrite != my_wsize))
1697 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1698 #endif
1699 			break;
1700 
1701 		/*
1702 		 * Unfortunately, there are servers in the world which
1703 		 * are not coded correctly.  They are not prepared to
1704 		 * handle RPC requests to the NFS port which are not
1705 		 * NFS requests.  Thus, they may try to process the
1706 		 * NFS_ACL request as if it were an NFS request.  This
1707 		 * does not work.  Generally, an error will be generated
1708 		 * on the client because it will not be able to decode
1709 		 * the response from the server.  However, it seems
1710 		 * possible that the server may not be able to decode
1711 		 * the arguments.  Thus, the criteria for deciding
1712 		 * whether the server supports NFS_ACL or not is whether
1713 		 * the following RPC errors are returned from CLNT_CALL.
1714 		 */
1715 		case RPC_CANTDECODERES:
1716 		case RPC_PROGUNAVAIL:
1717 		case RPC_CANTDECODEARGS:
1718 		case RPC_PROGVERSMISMATCH:
1719 			mutex_enter(&mi->mi_lock);
1720 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1721 			mutex_exit(&mi->mi_lock);
1722 			break;
1723 
1724 		/*
1725 		 * If the server supports NFS_ACL but not the new ops
1726 		 * for extended attributes, make sure we don't retry.
1727 		 */
1728 		case RPC_PROCUNAVAIL:
1729 			mutex_enter(&mi->mi_lock);
1730 			mi->mi_flags &= ~MI_EXTATTR;
1731 			mutex_exit(&mi->mi_lock);
1732 			break;
1733 
1734 		case RPC_INTR:
1735 			/*
1736 			 * There is no way to recover from this error,
1737 			 * even if mount option nointr is specified.
1738 			 * SIGKILL, for example, cannot be blocked.
1739 			 */
1740 			rpcerr.re_status = RPC_INTR;
1741 			rpcerr.re_errno = EINTR;
1742 			break;
1743 
1744 		case RPC_UDERROR:
1745 			/*
1746 			 * If the NFS server is local (vold) and
1747 			 * it goes away then we get RPC_UDERROR.
1748 			 * This is a retryable error, so we would
1749 			 * loop, so check to see if the specific
1750 			 * error was ECONNRESET, indicating that
1751 			 * target did not exist at all.  If so,
1752 			 * return with RPC_PROGUNAVAIL and
1753 			 * ECONNRESET to indicate why.
1754 			 */
1755 			CLNT_GETERR(client, &rpcerr);
1756 			if (rpcerr.re_errno == ECONNRESET) {
1757 				rpcerr.re_status = RPC_PROGUNAVAIL;
1758 				rpcerr.re_errno = ECONNRESET;
1759 				break;
1760 			}
1761 			/*FALLTHROUGH*/
1762 
1763 		default:		/* probably RPC_TIMEDOUT */
1764 			if (IS_UNRECOVERABLE_RPC(status))
1765 				break;
1766 
1767 			/*
1768 			 * increment server not responding count
1769 			 */
1770 			mutex_enter(&mi->mi_lock);
1771 			mi->mi_noresponse++;
1772 			mutex_exit(&mi->mi_lock);
1773 #ifdef DEBUG
1774 			nfscl->nfscl_stat.noresponse.value.ui64++;
1775 #endif
1776 
1777 			if (!(mi->mi_flags & MI_HARD)) {
1778 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1779 				    (mi->mi_acl_ss_call_type[which] == 0))
1780 					break;
1781 			}
1782 
1783 			/*
1784 			 * The call is in progress (over COTS).
1785 			 * Try the CLNT_CALL again, but don't
1786 			 * print a noisy error message.
1787 			 */
1788 			if (status == RPC_INPROGRESS) {
1789 				tryagain = TRUE;
1790 				break;
1791 			}
1792 
1793 			if (flags & RFSCALL_SOFT)
1794 				break;
1795 
1796 			/*
1797 			 * On zone shutdown, just move on.
1798 			 */
1799 			if (zone_status_get(curproc->p_zone) >=
1800 			    ZONE_IS_SHUTTING_DOWN) {
1801 				rpcerr.re_status = RPC_FAILED;
1802 				rpcerr.re_errno = EIO;
1803 				break;
1804 			}
1805 
1806 			/*
1807 			 * NFS client failover support
1808 			 *
1809 			 * If the current server just failed us, we'll
1810 			 * start the process of finding a new server.
1811 			 * After that, we can just retry.
1812 			 */
1813 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1814 				if (svp == mi->mi_curr_serv)
1815 					failover_newserver(mi);
1816 				clfree_impl(client, ch, nfscl);
1817 				goto failoverretry;
1818 			}
1819 
1820 			tryagain = TRUE;
1821 			timeo = backoff(timeo);
1822 			mutex_enter(&mi->mi_lock);
1823 			if (!(mi->mi_flags & MI_PRINTED)) {
1824 				mi->mi_flags |= MI_PRINTED;
1825 				mutex_exit(&mi->mi_lock);
1826 #ifdef DEBUG
1827 				zprintf(zoneid,
1828 			"NFS_ACL%d server %s not responding still trying\n",
1829 				    mi->mi_vers, svp->sv_hostname);
1830 #else
1831 				zprintf(zoneid,
1832 			    "NFS server %s not responding still trying\n",
1833 				    svp->sv_hostname);
1834 #endif
1835 			} else
1836 				mutex_exit(&mi->mi_lock);
1837 			if (*douprintf && nfs_has_ctty()) {
1838 				*douprintf = 0;
1839 				if (!(mi->mi_flags & MI_NOPRINT))
1840 #ifdef DEBUG
1841 					uprintf(
1842 			"NFS_ACL%d server %s not responding still trying\n",
1843 					    mi->mi_vers, svp->sv_hostname);
1844 #else
1845 					uprintf(
1846 			    "NFS server %s not responding still trying\n",
1847 					    svp->sv_hostname);
1848 #endif
1849 			}
1850 
1851 #if 0 /* notyet */
1852 			/*
1853 			 * If doing dynamic adjustment of transfer
1854 			 * size and if it's a read or write call
1855 			 * and if the transfer size changed while
1856 			 * retransmitting or if the feedback routine
1857 			 * changed the transfer size,
1858 			 * then exit rfscall so that the transfer
1859 			 * size can be adjusted at the vnops level.
1860 			 */
1861 			if ((mi->mi_flags & MI_DYNAMIC) &&
1862 			    mi->mi_acl_timer_type[which] != 0 &&
1863 			    (mi->mi_curread != my_rsize ||
1864 			    mi->mi_curwrite != my_wsize ||
1865 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1866 				/*
1867 				 * On read or write calls, return
1868 				 * back to the vnode ops level if
1869 				 * the transfer size changed.
1870 				 */
1871 				clfree_impl(client, ch, nfscl);
1872 				if (cred_cloned)
1873 					crfree(cr);
1874 				return (ENFS_TRYAGAIN);
1875 			}
1876 #endif
1877 		}
1878 	} while (tryagain);
1879 
1880 	if (status != RPC_SUCCESS) {
1881 		/*
1882 		 * Let soft mounts use the timed out message.
1883 		 */
1884 		if (status == RPC_INPROGRESS)
1885 			status = RPC_TIMEDOUT;
1886 		nfscl->nfscl_stat.badcalls.value.ui64++;
1887 		if (status == RPC_CANTDECODERES ||
1888 		    status == RPC_PROGUNAVAIL ||
1889 		    status == RPC_PROCUNAVAIL ||
1890 		    status == RPC_CANTDECODEARGS ||
1891 		    status == RPC_PROGVERSMISMATCH)
1892 			CLNT_GETERR(client, &rpcerr);
1893 		else if (status != RPC_INTR) {
1894 			mutex_enter(&mi->mi_lock);
1895 			mi->mi_flags |= MI_DOWN;
1896 			mutex_exit(&mi->mi_lock);
1897 			CLNT_GETERR(client, &rpcerr);
1898 #ifdef DEBUG
1899 			bufp = clnt_sperror(client, svp->sv_hostname);
1900 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1901 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1902 			if (nfs_has_ctty()) {
1903 				if (!(mi->mi_flags & MI_NOPRINT)) {
1904 					uprintf("NFS_ACL%d %s failed for %s\n",
1905 					    mi->mi_vers, mi->mi_aclnames[which],
1906 					    bufp);
1907 				}
1908 			}
1909 			kmem_free(bufp, MAXPATHLEN);
1910 #else
1911 			zprintf(zoneid,
1912 			    "NFS %s failed for server %s: error %d (%s)\n",
1913 			    mi->mi_aclnames[which], svp->sv_hostname,
1914 			    status, clnt_sperrno(status));
1915 			if (nfs_has_ctty()) {
1916 				if (!(mi->mi_flags & MI_NOPRINT))
1917 					uprintf(
1918 				"NFS %s failed for server %s: error %d (%s)\n",
1919 					    mi->mi_aclnames[which],
1920 					    svp->sv_hostname, status,
1921 					    clnt_sperrno(status));
1922 			}
1923 #endif
1924 			/*
1925 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1926 			 * re_errno is set appropriately depending on
1927 			 * the authentication error
1928 			 */
1929 			if (status == RPC_VERSMISMATCH ||
1930 			    status == RPC_PROGVERSMISMATCH)
1931 				rpcerr.re_errno = EIO;
1932 		}
1933 	} else {
1934 		/*
1935 		 * Test the value of mi_down and mi_printed without
1936 		 * holding the mi_lock mutex.  If they are both zero,
1937 		 * then it is okay to skip the down and printed
1938 		 * processing.  This saves on a mutex_enter and
1939 		 * mutex_exit pair for a normal, successful RPC.
1940 		 * This was just complete overhead.
1941 		 */
1942 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1943 			mutex_enter(&mi->mi_lock);
1944 			mi->mi_flags &= ~MI_DOWN;
1945 			if (mi->mi_flags & MI_PRINTED) {
1946 				mi->mi_flags &= ~MI_PRINTED;
1947 				mutex_exit(&mi->mi_lock);
1948 #ifdef DEBUG
1949 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1950 				    mi->mi_vers, svp->sv_hostname);
1951 #else
1952 				zprintf(zoneid, "NFS server %s ok\n",
1953 				    svp->sv_hostname);
1954 #endif
1955 			} else
1956 				mutex_exit(&mi->mi_lock);
1957 		}
1958 
1959 		if (*douprintf == 0) {
1960 			if (!(mi->mi_flags & MI_NOPRINT))
1961 #ifdef DEBUG
1962 				uprintf("NFS_ACL%d server %s ok\n",
1963 				    mi->mi_vers, svp->sv_hostname);
1964 #else
1965 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1966 #endif
1967 			*douprintf = 1;
1968 		}
1969 	}
1970 
1971 	clfree_impl(client, ch, nfscl);
1972 	if (cred_cloned)
1973 		crfree(cr);
1974 
1975 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1976 
1977 #if 0 /* notyet */
1978 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1979 	    rpcerr.re_errno);
1980 #endif
1981 
1982 	return (rpcerr.re_errno);
1983 }
1984 
1985 int
vattr_to_sattr(struct vattr * vap,struct nfssattr * sa)1986 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1987 {
1988 	uint_t mask = vap->va_mask;
1989 
1990 	if (!(mask & AT_MODE))
1991 		sa->sa_mode = (uint32_t)-1;
1992 	else
1993 		sa->sa_mode = vap->va_mode;
1994 	if (!(mask & AT_UID))
1995 		sa->sa_uid = (uint32_t)-1;
1996 	else
1997 		sa->sa_uid = (uint32_t)vap->va_uid;
1998 	if (!(mask & AT_GID))
1999 		sa->sa_gid = (uint32_t)-1;
2000 	else
2001 		sa->sa_gid = (uint32_t)vap->va_gid;
2002 	if (!(mask & AT_SIZE))
2003 		sa->sa_size = (uint32_t)-1;
2004 	else
2005 		sa->sa_size = (uint32_t)vap->va_size;
2006 	if (!(mask & AT_ATIME))
2007 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2008 	else {
2009 		/* check time validity */
2010 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2011 			return (EOVERFLOW);
2012 		}
2013 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2014 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2015 	}
2016 	if (!(mask & AT_MTIME))
2017 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2018 	else {
2019 		/* check time validity */
2020 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2021 			return (EOVERFLOW);
2022 		}
2023 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2024 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2025 	}
2026 	return (0);
2027 }
2028 
2029 int
vattr_to_sattr3(struct vattr * vap,sattr3 * sa)2030 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2031 {
2032 	uint_t mask = vap->va_mask;
2033 
2034 	if (!(mask & AT_MODE))
2035 		sa->mode.set_it = FALSE;
2036 	else {
2037 		sa->mode.set_it = TRUE;
2038 		sa->mode.mode = (mode3)vap->va_mode;
2039 	}
2040 	if (!(mask & AT_UID))
2041 		sa->uid.set_it = FALSE;
2042 	else {
2043 		sa->uid.set_it = TRUE;
2044 		sa->uid.uid = (uid3)vap->va_uid;
2045 	}
2046 	if (!(mask & AT_GID))
2047 		sa->gid.set_it = FALSE;
2048 	else {
2049 		sa->gid.set_it = TRUE;
2050 		sa->gid.gid = (gid3)vap->va_gid;
2051 	}
2052 	if (!(mask & AT_SIZE))
2053 		sa->size.set_it = FALSE;
2054 	else {
2055 		sa->size.set_it = TRUE;
2056 		sa->size.size = (size3)vap->va_size;
2057 	}
2058 	if (!(mask & AT_ATIME))
2059 		sa->atime.set_it = DONT_CHANGE;
2060 	else {
2061 		/* check time validity */
2062 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2063 			return (EOVERFLOW);
2064 		}
2065 		sa->atime.set_it = SET_TO_CLIENT_TIME;
2066 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2067 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2068 	}
2069 	if (!(mask & AT_MTIME))
2070 		sa->mtime.set_it = DONT_CHANGE;
2071 	else {
2072 		/* check time validity */
2073 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2074 			return (EOVERFLOW);
2075 		}
2076 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2077 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2078 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2079 	}
2080 	return (0);
2081 }
2082 
2083 void
setdiropargs(struct nfsdiropargs * da,char * nm,vnode_t * dvp)2084 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2085 {
2086 
2087 	da->da_fhandle = VTOFH(dvp);
2088 	da->da_name = nm;
2089 	da->da_flags = 0;
2090 }
2091 
2092 void
setdiropargs3(diropargs3 * da,char * nm,vnode_t * dvp)2093 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2094 {
2095 
2096 	da->dirp = VTOFH3(dvp);
2097 	da->name = nm;
2098 }
2099 
2100 int
setdirgid(vnode_t * dvp,gid_t * gidp,cred_t * cr)2101 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2102 {
2103 	int error;
2104 	rnode_t *rp;
2105 	struct vattr va;
2106 
2107 	va.va_mask = AT_MODE | AT_GID;
2108 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2109 	if (error)
2110 		return (error);
2111 
2112 	/*
2113 	 * To determine the expected group-id of the created file:
2114 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2115 	 *	GRPID option, and the directory's set-gid bit is clear,
2116 	 *	then use the process's gid.
2117 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2118 	 */
2119 	rp = VTOR(dvp);
2120 	mutex_enter(&rp->r_statelock);
2121 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2122 		*gidp = crgetgid(cr);
2123 	else
2124 		*gidp = va.va_gid;
2125 	mutex_exit(&rp->r_statelock);
2126 	return (0);
2127 }
2128 
2129 int
setdirmode(vnode_t * dvp,mode_t * omp,cred_t * cr)2130 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2131 {
2132 	int error;
2133 	struct vattr va;
2134 
2135 	va.va_mask = AT_MODE;
2136 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2137 	if (error)
2138 		return (error);
2139 
2140 	/*
2141 	 * Modify the expected mode (om) so that the set-gid bit matches
2142 	 * that of the parent directory (dvp).
2143 	 */
2144 	if (va.va_mode & VSGID)
2145 		*omp |= VSGID;
2146 	else
2147 		*omp &= ~VSGID;
2148 	return (0);
2149 }
2150 
2151 void
nfs_setswaplike(vnode_t * vp,vattr_t * vap)2152 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2153 {
2154 
2155 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2156 		if (!(vp->v_flag & VSWAPLIKE)) {
2157 			mutex_enter(&vp->v_lock);
2158 			vp->v_flag |= VSWAPLIKE;
2159 			mutex_exit(&vp->v_lock);
2160 		}
2161 	} else {
2162 		if (vp->v_flag & VSWAPLIKE) {
2163 			mutex_enter(&vp->v_lock);
2164 			vp->v_flag &= ~VSWAPLIKE;
2165 			mutex_exit(&vp->v_lock);
2166 		}
2167 	}
2168 }
2169 
2170 /*
2171  * Free the resources associated with an rnode.
2172  */
2173 static void
rinactive(rnode_t * rp,cred_t * cr)2174 rinactive(rnode_t *rp, cred_t *cr)
2175 {
2176 	vnode_t *vp;
2177 	cred_t *cred;
2178 	char *contents;
2179 	int size;
2180 	vsecattr_t *vsp;
2181 	int error;
2182 	nfs3_pathconf_info *info;
2183 
2184 	/*
2185 	 * Before freeing anything, wait until all asynchronous
2186 	 * activity is done on this rnode.  This will allow all
2187 	 * asynchronous read ahead and write behind i/o's to
2188 	 * finish.
2189 	 */
2190 	mutex_enter(&rp->r_statelock);
2191 	while (rp->r_count > 0)
2192 		cv_wait(&rp->r_cv, &rp->r_statelock);
2193 	mutex_exit(&rp->r_statelock);
2194 
2195 	/*
2196 	 * Flush and invalidate all pages associated with the vnode.
2197 	 */
2198 	vp = RTOV(rp);
2199 	if (vn_has_cached_data(vp)) {
2200 		ASSERT(vp->v_type != VCHR);
2201 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2202 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2203 			if (error && (error == ENOSPC || error == EDQUOT)) {
2204 				mutex_enter(&rp->r_statelock);
2205 				if (!rp->r_error)
2206 					rp->r_error = error;
2207 				mutex_exit(&rp->r_statelock);
2208 			}
2209 		}
2210 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2211 	}
2212 
2213 	/*
2214 	 * Free any held credentials and caches which may be associated
2215 	 * with this rnode.
2216 	 */
2217 	mutex_enter(&rp->r_statelock);
2218 	cred = rp->r_cred;
2219 	rp->r_cred = NULL;
2220 	contents = rp->r_symlink.contents;
2221 	size = rp->r_symlink.size;
2222 	rp->r_symlink.contents = NULL;
2223 	vsp = rp->r_secattr;
2224 	rp->r_secattr = NULL;
2225 	info = rp->r_pathconf;
2226 	rp->r_pathconf = NULL;
2227 	mutex_exit(&rp->r_statelock);
2228 
2229 	/*
2230 	 * Free the held credential.
2231 	 */
2232 	if (cred != NULL)
2233 		crfree(cred);
2234 
2235 	/*
2236 	 * Free the access cache entries.
2237 	 */
2238 	(void) nfs_access_purge_rp(rp);
2239 
2240 	/*
2241 	 * Free the readdir cache entries.
2242 	 */
2243 	if (HAVE_RDDIR_CACHE(rp))
2244 		nfs_purge_rddir_cache(vp);
2245 
2246 	/*
2247 	 * Free the symbolic link cache.
2248 	 */
2249 	if (contents != NULL) {
2250 
2251 		kmem_free((void *)contents, size);
2252 	}
2253 
2254 	/*
2255 	 * Free any cached ACL.
2256 	 */
2257 	if (vsp != NULL)
2258 		nfs_acl_free(vsp);
2259 
2260 	/*
2261 	 * Free any cached pathconf information.
2262 	 */
2263 	if (info != NULL)
2264 		kmem_free(info, sizeof (*info));
2265 }
2266 
2267 /*
2268  * Return a vnode for the given NFS Version 2 file handle.
2269  * If no rnode exists for this fhandle, create one and put it
2270  * into the hash queues.  If the rnode for this fhandle
2271  * already exists, return it.
2272  *
2273  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2274  */
2275 vnode_t *
makenfsnode(fhandle_t * fh,struct nfsfattr * attr,struct vfs * vfsp,hrtime_t t,cred_t * cr,char * dnm,char * nm)2276 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2277     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2278 {
2279 	int newnode;
2280 	int index;
2281 	vnode_t *vp;
2282 	nfs_fhandle nfh;
2283 	vattr_t va;
2284 
2285 	nfh.fh_len = NFS_FHSIZE;
2286 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2287 
2288 	index = rtablehash(&nfh);
2289 	rw_enter(&rtable[index].r_lock, RW_READER);
2290 
2291 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2292 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2293 
2294 	if (attr != NULL) {
2295 		if (!newnode) {
2296 			rw_exit(&rtable[index].r_lock);
2297 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2298 		} else {
2299 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2300 				vp->v_type = VBAD;
2301 			else
2302 				vp->v_type = n2v_type(attr);
2303 			/*
2304 			 * A translation here seems to be necessary
2305 			 * because this function can be called
2306 			 * with `attr' that has come from the wire,
2307 			 * and been operated on by vattr_to_nattr().
2308 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2309 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2310 			 * ->makenfsnode().
2311 			 */
2312 			if ((attr->na_rdev & 0xffff0000) == 0)
2313 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2314 			else
2315 				vp->v_rdev = expldev(n2v_rdev(attr));
2316 			nfs_attrcache(vp, attr, t);
2317 			rw_exit(&rtable[index].r_lock);
2318 		}
2319 	} else {
2320 		if (newnode) {
2321 			PURGE_ATTRCACHE(vp);
2322 		}
2323 		rw_exit(&rtable[index].r_lock);
2324 	}
2325 
2326 	return (vp);
2327 }
2328 
2329 /*
2330  * Return a vnode for the given NFS Version 3 file handle.
2331  * If no rnode exists for this fhandle, create one and put it
2332  * into the hash queues.  If the rnode for this fhandle
2333  * already exists, return it.
2334  *
2335  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2336  */
2337 vnode_t *
makenfs3node_va(nfs_fh3 * fh,vattr_t * vap,struct vfs * vfsp,hrtime_t t,cred_t * cr,char * dnm,char * nm)2338 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2339     cred_t *cr, char *dnm, char *nm)
2340 {
2341 	int newnode;
2342 	int index;
2343 	vnode_t *vp;
2344 
2345 	index = rtablehash((nfs_fhandle *)fh);
2346 	rw_enter(&rtable[index].r_lock, RW_READER);
2347 
2348 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2349 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2350 	    dnm, nm);
2351 
2352 	if (vap == NULL) {
2353 		if (newnode) {
2354 			PURGE_ATTRCACHE(vp);
2355 		}
2356 		rw_exit(&rtable[index].r_lock);
2357 		return (vp);
2358 	}
2359 
2360 	if (!newnode) {
2361 		rw_exit(&rtable[index].r_lock);
2362 		nfs_attr_cache(vp, vap, t, cr);
2363 	} else {
2364 		rnode_t *rp = VTOR(vp);
2365 
2366 		vp->v_type = vap->va_type;
2367 		vp->v_rdev = vap->va_rdev;
2368 
2369 		mutex_enter(&rp->r_statelock);
2370 		if (rp->r_mtime <= t)
2371 			nfs_attrcache_va(vp, vap);
2372 		mutex_exit(&rp->r_statelock);
2373 		rw_exit(&rtable[index].r_lock);
2374 	}
2375 
2376 	return (vp);
2377 }
2378 
2379 vnode_t *
makenfs3node(nfs_fh3 * fh,fattr3 * attr,struct vfs * vfsp,hrtime_t t,cred_t * cr,char * dnm,char * nm)2380 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2381     cred_t *cr, char *dnm, char *nm)
2382 {
2383 	int newnode;
2384 	int index;
2385 	vnode_t *vp;
2386 	vattr_t va;
2387 
2388 	index = rtablehash((nfs_fhandle *)fh);
2389 	rw_enter(&rtable[index].r_lock, RW_READER);
2390 
2391 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2392 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2393 	    dnm, nm);
2394 
2395 	if (attr == NULL) {
2396 		if (newnode) {
2397 			PURGE_ATTRCACHE(vp);
2398 		}
2399 		rw_exit(&rtable[index].r_lock);
2400 		return (vp);
2401 	}
2402 
2403 	if (!newnode) {
2404 		rw_exit(&rtable[index].r_lock);
2405 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2406 	} else {
2407 		if (attr->type < NF3REG || attr->type > NF3FIFO)
2408 			vp->v_type = VBAD;
2409 		else
2410 			vp->v_type = nf3_to_vt[attr->type];
2411 		vp->v_rdev = makedevice(attr->rdev.specdata1,
2412 		    attr->rdev.specdata2);
2413 		nfs3_attrcache(vp, attr, t);
2414 		rw_exit(&rtable[index].r_lock);
2415 	}
2416 
2417 	return (vp);
2418 }
2419 
2420 /*
2421  * Read this comment before making changes to rtablehash()!
2422  * This is a hash function in which seemingly obvious and harmless
2423  * changes can cause escalations costing million dollars!
2424  * Know what you are doing.
2425  *
2426  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2427  * algorithm is currently detailed here:
2428  *
2429  *   http://burtleburtle.net/bob/hash/doobs.html
2430  *
2431  * Of course, the above link may not be valid by the time you are reading
2432  * this, but suffice it to say that the one-at-a-time algorithm works well in
2433  * almost all cases.  If you are changing the algorithm be sure to verify that
2434  * the hash algorithm still provides even distribution in all cases and with
2435  * any server returning filehandles in whatever order (sequential or random).
2436  */
2437 static int
rtablehash(nfs_fhandle * fh)2438 rtablehash(nfs_fhandle *fh)
2439 {
2440 	ulong_t hash, len, i;
2441 	char *key;
2442 
2443 	key = fh->fh_buf;
2444 	len = (ulong_t)fh->fh_len;
2445 	for (hash = 0, i = 0; i < len; i++) {
2446 		hash += key[i];
2447 		hash += (hash << 10);
2448 		hash ^= (hash >> 6);
2449 	}
2450 	hash += (hash << 3);
2451 	hash ^= (hash >> 11);
2452 	hash += (hash << 15);
2453 	return (hash & rtablemask);
2454 }
2455 
2456 static vnode_t *
make_rnode(nfs_fhandle * fh,rhashq_t * rhtp,struct vfs * vfsp,struct vnodeops * vops,int (* putapage)(vnode_t *,page_t *,u_offset_t *,size_t *,int,cred_t *),int (* compar)(const void *,const void *),int * newnode,cred_t * cr,char * dnm,char * nm)2457 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2458     struct vnodeops *vops,
2459     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2460     int (*compar)(const void *, const void *),
2461     int *newnode, cred_t *cr, char *dnm, char *nm)
2462 {
2463 	rnode_t *rp;
2464 	rnode_t *trp;
2465 	vnode_t *vp;
2466 	mntinfo_t *mi;
2467 
2468 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
2469 
2470 	mi = VFTOMI(vfsp);
2471 start:
2472 	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2473 		vp = RTOV(rp);
2474 		nfs_set_vroot(vp);
2475 		*newnode = 0;
2476 		return (vp);
2477 	}
2478 	rw_exit(&rhtp->r_lock);
2479 
2480 	mutex_enter(&rpfreelist_lock);
2481 	if (rpfreelist != NULL && rnew >= nrnode) {
2482 		rp = rpfreelist;
2483 		rp_rmfree(rp);
2484 		mutex_exit(&rpfreelist_lock);
2485 
2486 		vp = RTOV(rp);
2487 
2488 		if (rp->r_flags & RHASHED) {
2489 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2490 			mutex_enter(&vp->v_lock);
2491 			if (vp->v_count > 1) {
2492 				vp->v_count--;
2493 				mutex_exit(&vp->v_lock);
2494 				rw_exit(&rp->r_hashq->r_lock);
2495 				rw_enter(&rhtp->r_lock, RW_READER);
2496 				goto start;
2497 			}
2498 			mutex_exit(&vp->v_lock);
2499 			rp_rmhash_locked(rp);
2500 			rw_exit(&rp->r_hashq->r_lock);
2501 		}
2502 
2503 		rinactive(rp, cr);
2504 
2505 		mutex_enter(&vp->v_lock);
2506 		if (vp->v_count > 1) {
2507 			vp->v_count--;
2508 			mutex_exit(&vp->v_lock);
2509 			rw_enter(&rhtp->r_lock, RW_READER);
2510 			goto start;
2511 		}
2512 		mutex_exit(&vp->v_lock);
2513 		vn_invalid(vp);
2514 		/*
2515 		 * destroy old locks before bzero'ing and
2516 		 * recreating the locks below.
2517 		 */
2518 		nfs_rw_destroy(&rp->r_rwlock);
2519 		nfs_rw_destroy(&rp->r_lkserlock);
2520 		mutex_destroy(&rp->r_statelock);
2521 		cv_destroy(&rp->r_cv);
2522 		cv_destroy(&rp->r_commit.c_cv);
2523 		nfs_free_r_path(rp);
2524 		avl_destroy(&rp->r_dir);
2525 		/*
2526 		 * Make sure that if rnode is recycled then
2527 		 * VFS count is decremented properly before
2528 		 * reuse.
2529 		 */
2530 		VFS_RELE(vp->v_vfsp);
2531 		vn_reinit(vp);
2532 	} else {
2533 		vnode_t *new_vp;
2534 
2535 		mutex_exit(&rpfreelist_lock);
2536 
2537 		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2538 		new_vp = vn_alloc(KM_SLEEP);
2539 
2540 		atomic_inc_ulong((ulong_t *)&rnew);
2541 #ifdef DEBUG
2542 		clstat_debug.nrnode.value.ui64++;
2543 #endif
2544 		vp = new_vp;
2545 	}
2546 
2547 	bzero(rp, sizeof (*rp));
2548 	rp->r_vnode = vp;
2549 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2550 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2551 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2552 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2553 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2554 	rp->r_fh.fh_len = fh->fh_len;
2555 	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2556 	rp->r_server = mi->mi_curr_serv;
2557 	if (FAILOVER_MOUNT(mi)) {
2558 		/*
2559 		 * If replicated servers, stash pathnames
2560 		 */
2561 		if (dnm != NULL && nm != NULL) {
2562 			char *s, *p;
2563 			uint_t len;
2564 
2565 			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2566 			rp->r_path = kmem_alloc(len, KM_SLEEP);
2567 #ifdef DEBUG
2568 			clstat_debug.rpath.value.ui64 += len;
2569 #endif
2570 			s = rp->r_path;
2571 			for (p = dnm; *p; p++)
2572 				*s++ = *p;
2573 			*s++ = '/';
2574 			for (p = nm; *p; p++)
2575 				*s++ = *p;
2576 			*s = '\0';
2577 		} else {
2578 			/* special case for root */
2579 			rp->r_path = kmem_alloc(2, KM_SLEEP);
2580 #ifdef DEBUG
2581 			clstat_debug.rpath.value.ui64 += 2;
2582 #endif
2583 			*rp->r_path = '.';
2584 			*(rp->r_path + 1) = '\0';
2585 		}
2586 	}
2587 	VFS_HOLD(vfsp);
2588 	rp->r_putapage = putapage;
2589 	rp->r_hashq = rhtp;
2590 	rp->r_flags = RREADDIRPLUS;
2591 	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2592 	    offsetof(rddir_cache, tree));
2593 	vn_setops(vp, vops);
2594 	vp->v_data = (caddr_t)rp;
2595 	vp->v_vfsp = vfsp;
2596 	vp->v_type = VNON;
2597 	vp->v_flag |= VMODSORT;
2598 	nfs_set_vroot(vp);
2599 
2600 	/*
2601 	 * There is a race condition if someone else
2602 	 * alloc's the rnode while no locks are held, so we
2603 	 * check again and recover if found.
2604 	 */
2605 	rw_enter(&rhtp->r_lock, RW_WRITER);
2606 	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2607 		vp = RTOV(trp);
2608 		nfs_set_vroot(vp);
2609 		*newnode = 0;
2610 		rw_exit(&rhtp->r_lock);
2611 		rp_addfree(rp, cr);
2612 		rw_enter(&rhtp->r_lock, RW_READER);
2613 		return (vp);
2614 	}
2615 	rp_addhash(rp);
2616 	*newnode = 1;
2617 	return (vp);
2618 }
2619 
2620 /*
2621  * Callback function to check if the page should be marked as
2622  * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2623  */
2624 int
nfs_setmod_check(page_t * pp)2625 nfs_setmod_check(page_t *pp)
2626 {
2627 	if (pp->p_fsdata != C_NOCOMMIT) {
2628 		pp->p_fsdata = C_NOCOMMIT;
2629 		return (1);
2630 	}
2631 	return (0);
2632 }
2633 
2634 static void
nfs_set_vroot(vnode_t * vp)2635 nfs_set_vroot(vnode_t *vp)
2636 {
2637 	rnode_t *rp;
2638 	nfs_fhandle *rootfh;
2639 
2640 	rp = VTOR(vp);
2641 	rootfh = &rp->r_server->sv_fhandle;
2642 	if (rootfh->fh_len == rp->r_fh.fh_len &&
2643 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2644 		if (!(vp->v_flag & VROOT)) {
2645 			mutex_enter(&vp->v_lock);
2646 			vp->v_flag |= VROOT;
2647 			mutex_exit(&vp->v_lock);
2648 		}
2649 	}
2650 }
2651 
2652 static void
nfs_free_r_path(rnode_t * rp)2653 nfs_free_r_path(rnode_t *rp)
2654 {
2655 	char *path;
2656 	size_t len;
2657 
2658 	path = rp->r_path;
2659 	if (path) {
2660 		rp->r_path = NULL;
2661 		len = strlen(path) + 1;
2662 		kmem_free(path, len);
2663 #ifdef DEBUG
2664 		clstat_debug.rpath.value.ui64 -= len;
2665 #endif
2666 	}
2667 }
2668 
2669 /*
2670  * Put an rnode on the free list.
2671  *
2672  * Rnodes which were allocated above and beyond the normal limit
2673  * are immediately freed.
2674  */
2675 void
rp_addfree(rnode_t * rp,cred_t * cr)2676 rp_addfree(rnode_t *rp, cred_t *cr)
2677 {
2678 	vnode_t *vp;
2679 	struct vfs *vfsp;
2680 
2681 	vp = RTOV(rp);
2682 	ASSERT(vp->v_count >= 1);
2683 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2684 
2685 	/*
2686 	 * If we have too many rnodes allocated and there are no
2687 	 * references to this rnode, or if the rnode is no longer
2688 	 * accessible by it does not reside in the hash queues,
2689 	 * or if an i/o error occurred while writing to the file,
2690 	 * then just free it instead of putting it on the rnode
2691 	 * freelist.
2692 	 */
2693 	vfsp = vp->v_vfsp;
2694 	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2695 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2696 		if (rp->r_flags & RHASHED) {
2697 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2698 			mutex_enter(&vp->v_lock);
2699 			if (vp->v_count > 1) {
2700 				vp->v_count--;
2701 				mutex_exit(&vp->v_lock);
2702 				rw_exit(&rp->r_hashq->r_lock);
2703 				return;
2704 			}
2705 			mutex_exit(&vp->v_lock);
2706 			rp_rmhash_locked(rp);
2707 			rw_exit(&rp->r_hashq->r_lock);
2708 		}
2709 
2710 		rinactive(rp, cr);
2711 
2712 		/*
2713 		 * Recheck the vnode reference count.  We need to
2714 		 * make sure that another reference has not been
2715 		 * acquired while we were not holding v_lock.  The
2716 		 * rnode is not in the rnode hash queues, so the
2717 		 * only way for a reference to have been acquired
2718 		 * is for a VOP_PUTPAGE because the rnode was marked
2719 		 * with RDIRTY or for a modified page.  This
2720 		 * reference may have been acquired before our call
2721 		 * to rinactive.  The i/o may have been completed,
2722 		 * thus allowing rinactive to complete, but the
2723 		 * reference to the vnode may not have been released
2724 		 * yet.  In any case, the rnode can not be destroyed
2725 		 * until the other references to this vnode have been
2726 		 * released.  The other references will take care of
2727 		 * either destroying the rnode or placing it on the
2728 		 * rnode freelist.  If there are no other references,
2729 		 * then the rnode may be safely destroyed.
2730 		 */
2731 		mutex_enter(&vp->v_lock);
2732 		if (vp->v_count > 1) {
2733 			vp->v_count--;
2734 			mutex_exit(&vp->v_lock);
2735 			return;
2736 		}
2737 		mutex_exit(&vp->v_lock);
2738 
2739 		destroy_rnode(rp);
2740 		return;
2741 	}
2742 
2743 	/*
2744 	 * Lock the hash queue and then recheck the reference count
2745 	 * to ensure that no other threads have acquired a reference
2746 	 * to indicate that the rnode should not be placed on the
2747 	 * freelist.  If another reference has been acquired, then
2748 	 * just release this one and let the other thread complete
2749 	 * the processing of adding this rnode to the freelist.
2750 	 */
2751 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2752 
2753 	mutex_enter(&vp->v_lock);
2754 	if (vp->v_count > 1) {
2755 		vp->v_count--;
2756 		mutex_exit(&vp->v_lock);
2757 		rw_exit(&rp->r_hashq->r_lock);
2758 		return;
2759 	}
2760 	mutex_exit(&vp->v_lock);
2761 
2762 	/*
2763 	 * If there is no cached data or metadata for this file, then
2764 	 * put the rnode on the front of the freelist so that it will
2765 	 * be reused before other rnodes which may have cached data or
2766 	 * metadata associated with them.
2767 	 */
2768 	mutex_enter(&rpfreelist_lock);
2769 	if (rpfreelist == NULL) {
2770 		rp->r_freef = rp;
2771 		rp->r_freeb = rp;
2772 		rpfreelist = rp;
2773 	} else {
2774 		rp->r_freef = rpfreelist;
2775 		rp->r_freeb = rpfreelist->r_freeb;
2776 		rpfreelist->r_freeb->r_freef = rp;
2777 		rpfreelist->r_freeb = rp;
2778 		if (!vn_has_cached_data(vp) &&
2779 		    !HAVE_RDDIR_CACHE(rp) &&
2780 		    rp->r_symlink.contents == NULL &&
2781 		    rp->r_secattr == NULL &&
2782 		    rp->r_pathconf == NULL)
2783 			rpfreelist = rp;
2784 	}
2785 	mutex_exit(&rpfreelist_lock);
2786 
2787 	rw_exit(&rp->r_hashq->r_lock);
2788 }
2789 
2790 /*
2791  * Remove an rnode from the free list.
2792  *
2793  * The caller must be holding rpfreelist_lock and the rnode
2794  * must be on the freelist.
2795  */
2796 static void
rp_rmfree(rnode_t * rp)2797 rp_rmfree(rnode_t *rp)
2798 {
2799 
2800 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2801 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2802 
2803 	if (rp == rpfreelist) {
2804 		rpfreelist = rp->r_freef;
2805 		if (rp == rpfreelist)
2806 			rpfreelist = NULL;
2807 	}
2808 
2809 	rp->r_freeb->r_freef = rp->r_freef;
2810 	rp->r_freef->r_freeb = rp->r_freeb;
2811 
2812 	rp->r_freef = rp->r_freeb = NULL;
2813 }
2814 
2815 /*
2816  * Put a rnode in the hash table.
2817  *
2818  * The caller must be holding the exclusive hash queue lock.
2819  */
2820 static void
rp_addhash(rnode_t * rp)2821 rp_addhash(rnode_t *rp)
2822 {
2823 	mntinfo_t *mi;
2824 
2825 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2826 	ASSERT(!(rp->r_flags & RHASHED));
2827 
2828 	rp->r_hashf = rp->r_hashq->r_hashf;
2829 	rp->r_hashq->r_hashf = rp;
2830 	rp->r_hashb = (rnode_t *)rp->r_hashq;
2831 	rp->r_hashf->r_hashb = rp;
2832 
2833 	mutex_enter(&rp->r_statelock);
2834 	rp->r_flags |= RHASHED;
2835 	mutex_exit(&rp->r_statelock);
2836 
2837 	mi = VTOMI(RTOV(rp));
2838 	mutex_enter(&mi->mi_rnodes_lock);
2839 	list_insert_tail(&mi->mi_rnodes, rp);
2840 	mutex_exit(&mi->mi_rnodes_lock);
2841 }
2842 
2843 /*
2844  * Remove a rnode from the hash table.
2845  *
2846  * The caller must be holding the hash queue lock.
2847  */
2848 static void
rp_rmhash_locked(rnode_t * rp)2849 rp_rmhash_locked(rnode_t *rp)
2850 {
2851 	mntinfo_t *mi;
2852 
2853 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2854 	ASSERT(rp->r_flags & RHASHED);
2855 
2856 	rp->r_hashb->r_hashf = rp->r_hashf;
2857 	rp->r_hashf->r_hashb = rp->r_hashb;
2858 
2859 	mutex_enter(&rp->r_statelock);
2860 	rp->r_flags &= ~RHASHED;
2861 	mutex_exit(&rp->r_statelock);
2862 
2863 	mi = VTOMI(RTOV(rp));
2864 	mutex_enter(&mi->mi_rnodes_lock);
2865 	if (list_link_active(&rp->r_mi_link))
2866 		list_remove(&mi->mi_rnodes, rp);
2867 	mutex_exit(&mi->mi_rnodes_lock);
2868 }
2869 
2870 /*
2871  * Remove a rnode from the hash table.
2872  *
2873  * The caller must not be holding the hash queue lock.
2874  */
2875 void
rp_rmhash(rnode_t * rp)2876 rp_rmhash(rnode_t *rp)
2877 {
2878 
2879 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2880 	rp_rmhash_locked(rp);
2881 	rw_exit(&rp->r_hashq->r_lock);
2882 }
2883 
2884 /*
2885  * Lookup a rnode by fhandle.
2886  *
2887  * The caller must be holding the hash queue lock, either shared or exclusive.
2888  */
2889 static rnode_t *
rfind(rhashq_t * rhtp,nfs_fhandle * fh,struct vfs * vfsp)2890 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2891 {
2892 	rnode_t *rp;
2893 	vnode_t *vp;
2894 
2895 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2896 
2897 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2898 		vp = RTOV(rp);
2899 		if (vp->v_vfsp == vfsp &&
2900 		    rp->r_fh.fh_len == fh->fh_len &&
2901 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2902 			/*
2903 			 * remove rnode from free list, if necessary.
2904 			 */
2905 			if (rp->r_freef != NULL) {
2906 				mutex_enter(&rpfreelist_lock);
2907 				/*
2908 				 * If the rnode is on the freelist,
2909 				 * then remove it and use that reference
2910 				 * as the new reference.  Otherwise,
2911 				 * need to increment the reference count.
2912 				 */
2913 				if (rp->r_freef != NULL) {
2914 					rp_rmfree(rp);
2915 					mutex_exit(&rpfreelist_lock);
2916 				} else {
2917 					mutex_exit(&rpfreelist_lock);
2918 					VN_HOLD(vp);
2919 				}
2920 			} else
2921 				VN_HOLD(vp);
2922 			return (rp);
2923 		}
2924 	}
2925 	return (NULL);
2926 }
2927 
2928 /*
2929  * Return 1 if there is an active vnode belonging to this vfs in the
2930  * rtable cache.
2931  *
2932  * Several of these checks are done without holding the usual
2933  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2934  * etc. will redo the necessary checks before actually destroying
2935  * any rnodes.
2936  */
2937 int
check_rtable(struct vfs * vfsp)2938 check_rtable(struct vfs *vfsp)
2939 {
2940 	rnode_t *rp;
2941 	vnode_t *vp;
2942 	mntinfo_t *mi;
2943 
2944 	ASSERT(vfsp != NULL);
2945 	mi = VFTOMI(vfsp);
2946 
2947 	mutex_enter(&mi->mi_rnodes_lock);
2948 	for (rp = list_head(&mi->mi_rnodes); rp != NULL;
2949 	    rp = list_next(&mi->mi_rnodes, rp)) {
2950 		vp = RTOV(rp);
2951 
2952 		if (rp->r_freef == NULL ||
2953 		    (vn_has_cached_data(vp) && (rp->r_flags & RDIRTY)) ||
2954 		    rp->r_count > 0) {
2955 			mutex_exit(&mi->mi_rnodes_lock);
2956 			return (1);
2957 		}
2958 	}
2959 	mutex_exit(&mi->mi_rnodes_lock);
2960 
2961 	return (0);
2962 }
2963 
2964 /*
2965  * Destroy inactive vnodes from the hash queues which belong to this
2966  * vfs.  It is essential that we destroy all inactive vnodes during a
2967  * forced unmount as well as during a normal unmount.
2968  */
2969 void
destroy_rtable(struct vfs * vfsp,cred_t * cr)2970 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2971 {
2972 	rnode_t *rp;
2973 	mntinfo_t *mi;
2974 
2975 	ASSERT(vfsp != NULL);
2976 
2977 	mi = VFTOMI(vfsp);
2978 
2979 	mutex_enter(&rpfreelist_lock);
2980 	mutex_enter(&mi->mi_rnodes_lock);
2981 	while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
2982 		/*
2983 		 * If the rnode is no longer on the freelist it is not
2984 		 * ours and it will be handled by some other thread, so
2985 		 * skip it.
2986 		 */
2987 		if (rp->r_freef == NULL)
2988 			continue;
2989 		mutex_exit(&mi->mi_rnodes_lock);
2990 
2991 		rp_rmfree(rp);
2992 		mutex_exit(&rpfreelist_lock);
2993 
2994 		rp_rmhash(rp);
2995 
2996 		/*
2997 		 * This call to rp_addfree will end up destroying the
2998 		 * rnode, but in a safe way with the appropriate set
2999 		 * of checks done.
3000 		 */
3001 		rp_addfree(rp, cr);
3002 
3003 		mutex_enter(&rpfreelist_lock);
3004 		mutex_enter(&mi->mi_rnodes_lock);
3005 	}
3006 	mutex_exit(&mi->mi_rnodes_lock);
3007 	mutex_exit(&rpfreelist_lock);
3008 }
3009 
3010 /*
3011  * This routine destroys all the resources associated with the rnode
3012  * and then the rnode itself.
3013  */
3014 static void
destroy_rnode(rnode_t * rp)3015 destroy_rnode(rnode_t *rp)
3016 {
3017 	vnode_t *vp;
3018 	vfs_t *vfsp;
3019 
3020 	vp = RTOV(rp);
3021 	vfsp = vp->v_vfsp;
3022 
3023 	ASSERT(vp->v_count == 1);
3024 	ASSERT(rp->r_count == 0);
3025 	ASSERT(rp->r_lmpl == NULL);
3026 	ASSERT(rp->r_mapcnt == 0);
3027 	ASSERT(!(rp->r_flags & RHASHED));
3028 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3029 	atomic_dec_ulong((ulong_t *)&rnew);
3030 #ifdef DEBUG
3031 	clstat_debug.nrnode.value.ui64--;
3032 #endif
3033 	nfs_rw_destroy(&rp->r_rwlock);
3034 	nfs_rw_destroy(&rp->r_lkserlock);
3035 	mutex_destroy(&rp->r_statelock);
3036 	cv_destroy(&rp->r_cv);
3037 	cv_destroy(&rp->r_commit.c_cv);
3038 	if (rp->r_flags & RDELMAPLIST)
3039 		list_destroy(&rp->r_indelmap);
3040 	nfs_free_r_path(rp);
3041 	avl_destroy(&rp->r_dir);
3042 	vn_invalid(vp);
3043 	vn_free(vp);
3044 	kmem_cache_free(rnode_cache, rp);
3045 	VFS_RELE(vfsp);
3046 }
3047 
3048 /*
3049  * Flush all vnodes in this (or every) vfs.
3050  * Used by nfs_sync and by nfs_unmount.
3051  */
3052 void
rflush(struct vfs * vfsp,cred_t * cr)3053 rflush(struct vfs *vfsp, cred_t *cr)
3054 {
3055 	int index;
3056 	rnode_t *rp;
3057 	vnode_t *vp, **vplist;
3058 	long num, cnt;
3059 
3060 	/*
3061 	 * Check to see whether there is anything to do.
3062 	 */
3063 	num = rnew;
3064 	if (num == 0)
3065 		return;
3066 
3067 	/*
3068 	 * Allocate a slot for all currently active rnodes on the
3069 	 * supposition that they all may need flushing.
3070 	 */
3071 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3072 	cnt = 0;
3073 
3074 	/*
3075 	 * If the vfs is known we can do fast path by iterating all rnodes that
3076 	 * belongs to this vfs.  This is much faster than the traditional way
3077 	 * of iterating rtable (below) in a case there is a lot of rnodes that
3078 	 * does not belong to our vfs.
3079 	 */
3080 	if (vfsp != NULL) {
3081 		mntinfo_t *mi = VFTOMI(vfsp);
3082 
3083 		mutex_enter(&mi->mi_rnodes_lock);
3084 		for (rp = list_head(&mi->mi_rnodes); rp != NULL;
3085 		    rp = list_next(&mi->mi_rnodes, rp)) {
3086 			vp = RTOV(rp);
3087 			/*
3088 			 * Don't bother sync'ing a vp if it
3089 			 * is part of virtual swap device or
3090 			 * if VFS is read-only
3091 			 */
3092 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3093 				continue;
3094 			/*
3095 			 * If the vnode has pages and is marked as either dirty
3096 			 * or mmap'd, hold and add this vnode to the list of
3097 			 * vnodes to flush.
3098 			 */
3099 			ASSERT(vp->v_vfsp == vfsp);
3100 			if (vn_has_cached_data(vp) &&
3101 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3102 				VN_HOLD(vp);
3103 				vplist[cnt++] = vp;
3104 				if (cnt == num) {
3105 					/*
3106 					 * The vplist is full because there is
3107 					 * too many rnodes.  We are done for
3108 					 * now.
3109 					 */
3110 					break;
3111 				}
3112 			}
3113 		}
3114 		mutex_exit(&mi->mi_rnodes_lock);
3115 
3116 		goto done;
3117 	}
3118 
3119 	ASSERT(vfsp == NULL);
3120 
3121 	/*
3122 	 * Walk the hash queues looking for rnodes with page
3123 	 * lists associated with them.  Make a list of these
3124 	 * files.
3125 	 */
3126 	for (index = 0; index < rtablesize; index++) {
3127 		rw_enter(&rtable[index].r_lock, RW_READER);
3128 		for (rp = rtable[index].r_hashf;
3129 		    rp != (rnode_t *)(&rtable[index]);
3130 		    rp = rp->r_hashf) {
3131 			vp = RTOV(rp);
3132 			/*
3133 			 * Don't bother sync'ing a vp if it
3134 			 * is part of virtual swap device or
3135 			 * if VFS is read-only
3136 			 */
3137 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3138 				continue;
3139 			/*
3140 			 * If the vnode has pages and is marked as either dirty
3141 			 * or mmap'd, hold and add this vnode to the list of
3142 			 * vnodes to flush.
3143 			 */
3144 			if (vn_has_cached_data(vp) &&
3145 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3146 				VN_HOLD(vp);
3147 				vplist[cnt++] = vp;
3148 				if (cnt == num) {
3149 					rw_exit(&rtable[index].r_lock);
3150 					/*
3151 					 * The vplist is full because there is
3152 					 * too many rnodes.  We are done for
3153 					 * now.
3154 					 */
3155 					goto done;
3156 				}
3157 			}
3158 		}
3159 		rw_exit(&rtable[index].r_lock);
3160 	}
3161 
3162 done:
3163 
3164 	/*
3165 	 * Flush and release all of the files on the list.
3166 	 */
3167 	while (cnt-- > 0) {
3168 		vp = vplist[cnt];
3169 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3170 		VN_RELE(vp);
3171 	}
3172 
3173 	/*
3174 	 * Free the space allocated to hold the list.
3175 	 */
3176 	kmem_free(vplist, num * sizeof (*vplist));
3177 }
3178 
3179 /*
3180  * This probably needs to be larger than or equal to
3181  * log2(sizeof (struct rnode)) due to the way that rnodes are
3182  * allocated.
3183  */
3184 #define	ACACHE_SHIFT_BITS	9
3185 
3186 static int
acachehash(rnode_t * rp,cred_t * cr)3187 acachehash(rnode_t *rp, cred_t *cr)
3188 {
3189 
3190 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3191 	    acachemask);
3192 }
3193 
3194 #ifdef DEBUG
3195 static long nfs_access_cache_hits = 0;
3196 static long nfs_access_cache_misses = 0;
3197 #endif
3198 
3199 nfs_access_type_t
nfs_access_check(rnode_t * rp,uint32_t acc,cred_t * cr)3200 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3201 {
3202 	vnode_t *vp;
3203 	acache_t *ap;
3204 	acache_hash_t *hp;
3205 	nfs_access_type_t all;
3206 
3207 	vp = RTOV(rp);
3208 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3209 		return (NFS_ACCESS_UNKNOWN);
3210 
3211 	if (rp->r_acache != NULL) {
3212 		hp = &acache[acachehash(rp, cr)];
3213 		rw_enter(&hp->lock, RW_READER);
3214 		ap = hp->next;
3215 		while (ap != (acache_t *)hp) {
3216 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3217 				if ((ap->known & acc) == acc) {
3218 #ifdef DEBUG
3219 					nfs_access_cache_hits++;
3220 #endif
3221 					if ((ap->allowed & acc) == acc)
3222 						all = NFS_ACCESS_ALLOWED;
3223 					else
3224 						all = NFS_ACCESS_DENIED;
3225 				} else {
3226 #ifdef DEBUG
3227 					nfs_access_cache_misses++;
3228 #endif
3229 					all = NFS_ACCESS_UNKNOWN;
3230 				}
3231 				rw_exit(&hp->lock);
3232 				return (all);
3233 			}
3234 			ap = ap->next;
3235 		}
3236 		rw_exit(&hp->lock);
3237 	}
3238 
3239 #ifdef DEBUG
3240 	nfs_access_cache_misses++;
3241 #endif
3242 	return (NFS_ACCESS_UNKNOWN);
3243 }
3244 
3245 void
nfs_access_cache(rnode_t * rp,uint32_t acc,uint32_t resacc,cred_t * cr)3246 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3247 {
3248 	acache_t *ap;
3249 	acache_t *nap;
3250 	acache_hash_t *hp;
3251 
3252 	hp = &acache[acachehash(rp, cr)];
3253 
3254 	/*
3255 	 * Allocate now assuming that mostly an allocation will be
3256 	 * required.  This allows the allocation to happen without
3257 	 * holding the hash bucket locked.
3258 	 */
3259 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3260 	if (nap != NULL) {
3261 		nap->known = acc;
3262 		nap->allowed = resacc;
3263 		nap->rnode = rp;
3264 		crhold(cr);
3265 		nap->cred = cr;
3266 		nap->hashq = hp;
3267 	}
3268 
3269 	rw_enter(&hp->lock, RW_WRITER);
3270 
3271 	if (rp->r_acache != NULL) {
3272 		ap = hp->next;
3273 		while (ap != (acache_t *)hp) {
3274 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3275 				ap->known |= acc;
3276 				ap->allowed &= ~acc;
3277 				ap->allowed |= resacc;
3278 				rw_exit(&hp->lock);
3279 				if (nap != NULL) {
3280 					crfree(nap->cred);
3281 					kmem_cache_free(acache_cache, nap);
3282 				}
3283 				return;
3284 			}
3285 			ap = ap->next;
3286 		}
3287 	}
3288 
3289 	if (nap != NULL) {
3290 #ifdef DEBUG
3291 		clstat_debug.access.value.ui64++;
3292 #endif
3293 		nap->next = hp->next;
3294 		hp->next = nap;
3295 		nap->next->prev = nap;
3296 		nap->prev = (acache_t *)hp;
3297 
3298 		mutex_enter(&rp->r_statelock);
3299 		nap->list = rp->r_acache;
3300 		rp->r_acache = nap;
3301 		mutex_exit(&rp->r_statelock);
3302 	}
3303 
3304 	rw_exit(&hp->lock);
3305 }
3306 
3307 int
nfs_access_purge_rp(rnode_t * rp)3308 nfs_access_purge_rp(rnode_t *rp)
3309 {
3310 	acache_t *ap;
3311 	acache_t *tmpap;
3312 	acache_t *rplist;
3313 
3314 	/*
3315 	 * If there aren't any cached entries, then there is nothing
3316 	 * to free.
3317 	 */
3318 	if (rp->r_acache == NULL)
3319 		return (0);
3320 
3321 	mutex_enter(&rp->r_statelock);
3322 	rplist = rp->r_acache;
3323 	rp->r_acache = NULL;
3324 	mutex_exit(&rp->r_statelock);
3325 
3326 	/*
3327 	 * Loop through each entry in the list pointed to in the
3328 	 * rnode.  Remove each of these entries from the hash
3329 	 * queue that it is on and remove it from the list in
3330 	 * the rnode.
3331 	 */
3332 	for (ap = rplist; ap != NULL; ap = tmpap) {
3333 		rw_enter(&ap->hashq->lock, RW_WRITER);
3334 		ap->prev->next = ap->next;
3335 		ap->next->prev = ap->prev;
3336 		rw_exit(&ap->hashq->lock);
3337 
3338 		tmpap = ap->list;
3339 		crfree(ap->cred);
3340 		kmem_cache_free(acache_cache, ap);
3341 #ifdef DEBUG
3342 		clstat_debug.access.value.ui64--;
3343 #endif
3344 	}
3345 
3346 	return (1);
3347 }
3348 
3349 static const char prefix[] = ".nfs";
3350 
3351 static kmutex_t newnum_lock;
3352 
3353 int
newnum(void)3354 newnum(void)
3355 {
3356 	static uint_t newnum = 0;
3357 	uint_t id;
3358 
3359 	mutex_enter(&newnum_lock);
3360 	if (newnum == 0)
3361 		newnum = gethrestime_sec() & 0xffff;
3362 	id = newnum++;
3363 	mutex_exit(&newnum_lock);
3364 	return (id);
3365 }
3366 
3367 char *
newname(void)3368 newname(void)
3369 {
3370 	char *news;
3371 	char *s;
3372 	const char *p;
3373 	uint_t id;
3374 
3375 	id = newnum();
3376 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3377 	s = news;
3378 	p = prefix;
3379 	while (*p != '\0')
3380 		*s++ = *p++;
3381 	while (id != 0) {
3382 		*s++ = "0123456789ABCDEF"[id & 0x0f];
3383 		id >>= 4;
3384 	}
3385 	*s = '\0';
3386 	return (news);
3387 }
3388 
3389 /*
3390  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3391  * framework.
3392  */
3393 static int
cl_snapshot(kstat_t * ksp,void * buf,int rw)3394 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3395 {
3396 	ksp->ks_snaptime = gethrtime();
3397 	if (rw == KSTAT_WRITE) {
3398 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3399 #ifdef DEBUG
3400 		/*
3401 		 * Currently only the global zone can write to kstats, but we
3402 		 * add the check just for paranoia.
3403 		 */
3404 		if (INGLOBALZONE(curproc))
3405 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3406 			    sizeof (clstat_debug));
3407 #endif
3408 	} else {
3409 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3410 #ifdef DEBUG
3411 		/*
3412 		 * If we're displaying the "global" debug kstat values, we
3413 		 * display them as-is to all zones since in fact they apply to
3414 		 * the system as a whole.
3415 		 */
3416 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3417 		    sizeof (clstat_debug));
3418 #endif
3419 	}
3420 	return (0);
3421 }
3422 
3423 static void *
clinit_zone(zoneid_t zoneid)3424 clinit_zone(zoneid_t zoneid)
3425 {
3426 	kstat_t *nfs_client_kstat;
3427 	struct nfs_clnt *nfscl;
3428 	uint_t ndata;
3429 
3430 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3431 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3432 	nfscl->nfscl_chtable = NULL;
3433 	nfscl->nfscl_zoneid = zoneid;
3434 
3435 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3436 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3437 #ifdef DEBUG
3438 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3439 #endif
3440 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3441 	    "misc", KSTAT_TYPE_NAMED, ndata,
3442 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3443 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3444 		nfs_client_kstat->ks_snapshot = cl_snapshot;
3445 		kstat_install(nfs_client_kstat);
3446 	}
3447 	mutex_enter(&nfs_clnt_list_lock);
3448 	list_insert_head(&nfs_clnt_list, nfscl);
3449 	mutex_exit(&nfs_clnt_list_lock);
3450 	return (nfscl);
3451 }
3452 
3453 /*ARGSUSED*/
3454 static void
clfini_zone(zoneid_t zoneid,void * arg)3455 clfini_zone(zoneid_t zoneid, void *arg)
3456 {
3457 	struct nfs_clnt *nfscl = arg;
3458 	chhead_t *chp, *next;
3459 
3460 	if (nfscl == NULL)
3461 		return;
3462 	mutex_enter(&nfs_clnt_list_lock);
3463 	list_remove(&nfs_clnt_list, nfscl);
3464 	mutex_exit(&nfs_clnt_list_lock);
3465 	clreclaim_zone(nfscl, 0);
3466 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3467 		ASSERT(chp->ch_list == NULL);
3468 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3469 		next = chp->ch_next;
3470 		kmem_free(chp, sizeof (*chp));
3471 	}
3472 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3473 	mutex_destroy(&nfscl->nfscl_chtable_lock);
3474 	kmem_free(nfscl, sizeof (*nfscl));
3475 }
3476 
3477 /*
3478  * Called by endpnt_destructor to make sure the client handles are
3479  * cleaned up before the RPC endpoints.  This becomes a no-op if
3480  * clfini_zone (above) is called first.  This function is needed
3481  * (rather than relying on clfini_zone to clean up) because the ZSD
3482  * callbacks have no ordering mechanism, so we have no way to ensure
3483  * that clfini_zone is called before endpnt_destructor.
3484  */
3485 void
clcleanup_zone(zoneid_t zoneid)3486 clcleanup_zone(zoneid_t zoneid)
3487 {
3488 	struct nfs_clnt *nfscl;
3489 
3490 	mutex_enter(&nfs_clnt_list_lock);
3491 	nfscl = list_head(&nfs_clnt_list);
3492 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3493 		if (nfscl->nfscl_zoneid == zoneid) {
3494 			clreclaim_zone(nfscl, 0);
3495 			break;
3496 		}
3497 	}
3498 	mutex_exit(&nfs_clnt_list_lock);
3499 }
3500 
3501 int
nfs_subrinit(void)3502 nfs_subrinit(void)
3503 {
3504 	int i;
3505 	ulong_t nrnode_max;
3506 
3507 	/*
3508 	 * Allocate and initialize the rnode hash queues
3509 	 */
3510 	if (nrnode <= 0)
3511 		nrnode = ncsize;
3512 	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3513 	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3514 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3515 		    "!setting nrnode to max value of %ld", nrnode_max);
3516 		nrnode = nrnode_max;
3517 	}
3518 
3519 	rtablesize = 1 << highbit(nrnode / hashlen);
3520 	rtablemask = rtablesize - 1;
3521 	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3522 	for (i = 0; i < rtablesize; i++) {
3523 		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3524 		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3525 		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3526 	}
3527 	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3528 	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3529 
3530 	/*
3531 	 * Allocate and initialize the access cache
3532 	 */
3533 
3534 	/*
3535 	 * Initial guess is one access cache entry per rnode unless
3536 	 * nacache is set to a non-zero value and then it is used to
3537 	 * indicate a guess at the number of access cache entries.
3538 	 */
3539 	if (nacache > 0)
3540 		acachesize = 1 << highbit(nacache / hashlen);
3541 	else
3542 		acachesize = rtablesize;
3543 	acachemask = acachesize - 1;
3544 	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3545 	for (i = 0; i < acachesize; i++) {
3546 		acache[i].next = (acache_t *)&acache[i];
3547 		acache[i].prev = (acache_t *)&acache[i];
3548 		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3549 	}
3550 	acache_cache = kmem_cache_create("nfs_access_cache",
3551 	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3552 	/*
3553 	 * Allocate and initialize the client handle cache
3554 	 */
3555 	chtab_cache = kmem_cache_create("client_handle_cache",
3556 	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3557 	/*
3558 	 * Initialize the list of per-zone client handles (and associated data).
3559 	 * This needs to be done before we call zone_key_create().
3560 	 */
3561 	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3562 	    offsetof(struct nfs_clnt, nfscl_node));
3563 	/*
3564 	 * Initialize the zone_key for per-zone client handle lists.
3565 	 */
3566 	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3567 	/*
3568 	 * Initialize the various mutexes and reader/writer locks
3569 	 */
3570 	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3571 	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3572 	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3573 
3574 	/*
3575 	 * Assign unique major number for all nfs mounts
3576 	 */
3577 	if ((nfs_major = getudev()) == -1) {
3578 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
3579 		    "nfs: init: can't get unique device number");
3580 		nfs_major = 0;
3581 	}
3582 	nfs_minor = 0;
3583 
3584 	if (nfs3_jukebox_delay == 0)
3585 		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3586 
3587 	return (0);
3588 }
3589 
3590 void
nfs_subrfini(void)3591 nfs_subrfini(void)
3592 {
3593 	int i;
3594 
3595 	/*
3596 	 * Deallocate the rnode hash queues
3597 	 */
3598 	kmem_cache_destroy(rnode_cache);
3599 
3600 	for (i = 0; i < rtablesize; i++)
3601 		rw_destroy(&rtable[i].r_lock);
3602 	kmem_free(rtable, rtablesize * sizeof (*rtable));
3603 
3604 	/*
3605 	 * Deallocated the access cache
3606 	 */
3607 	kmem_cache_destroy(acache_cache);
3608 
3609 	for (i = 0; i < acachesize; i++)
3610 		rw_destroy(&acache[i].lock);
3611 	kmem_free(acache, acachesize * sizeof (*acache));
3612 
3613 	/*
3614 	 * Deallocate the client handle cache
3615 	 */
3616 	kmem_cache_destroy(chtab_cache);
3617 
3618 	/*
3619 	 * Destroy the various mutexes and reader/writer locks
3620 	 */
3621 	mutex_destroy(&rpfreelist_lock);
3622 	mutex_destroy(&newnum_lock);
3623 	mutex_destroy(&nfs_minor_lock);
3624 	(void) zone_key_delete(nfsclnt_zone_key);
3625 }
3626 
3627 enum nfsstat
puterrno(int error)3628 puterrno(int error)
3629 {
3630 
3631 	switch (error) {
3632 	case EOPNOTSUPP:
3633 		return (NFSERR_OPNOTSUPP);
3634 	case ENAMETOOLONG:
3635 		return (NFSERR_NAMETOOLONG);
3636 	case ENOTEMPTY:
3637 		return (NFSERR_NOTEMPTY);
3638 	case EDQUOT:
3639 		return (NFSERR_DQUOT);
3640 	case ESTALE:
3641 		return (NFSERR_STALE);
3642 	case EREMOTE:
3643 		return (NFSERR_REMOTE);
3644 	case ENOSYS:
3645 		return (NFSERR_OPNOTSUPP);
3646 	case EOVERFLOW:
3647 		return (NFSERR_INVAL);
3648 	default:
3649 		return ((enum nfsstat)error);
3650 	}
3651 	/* NOTREACHED */
3652 }
3653 
3654 int
geterrno(enum nfsstat status)3655 geterrno(enum nfsstat status)
3656 {
3657 
3658 	switch (status) {
3659 	case NFSERR_OPNOTSUPP:
3660 		return (EOPNOTSUPP);
3661 	case NFSERR_NAMETOOLONG:
3662 		return (ENAMETOOLONG);
3663 	case NFSERR_NOTEMPTY:
3664 		return (ENOTEMPTY);
3665 	case NFSERR_DQUOT:
3666 		return (EDQUOT);
3667 	case NFSERR_STALE:
3668 		return (ESTALE);
3669 	case NFSERR_REMOTE:
3670 		return (EREMOTE);
3671 	case NFSERR_WFLUSH:
3672 		return (EIO);
3673 	default:
3674 		return ((int)status);
3675 	}
3676 	/* NOTREACHED */
3677 }
3678 
3679 enum nfsstat3
puterrno3(int error)3680 puterrno3(int error)
3681 {
3682 
3683 #ifdef DEBUG
3684 	switch (error) {
3685 	case 0:
3686 		return (NFS3_OK);
3687 	case EPERM:
3688 		return (NFS3ERR_PERM);
3689 	case ENOENT:
3690 		return (NFS3ERR_NOENT);
3691 	case EIO:
3692 		return (NFS3ERR_IO);
3693 	case ENXIO:
3694 		return (NFS3ERR_NXIO);
3695 	case EACCES:
3696 		return (NFS3ERR_ACCES);
3697 	case EEXIST:
3698 		return (NFS3ERR_EXIST);
3699 	case EXDEV:
3700 		return (NFS3ERR_XDEV);
3701 	case ENODEV:
3702 		return (NFS3ERR_NODEV);
3703 	case ENOTDIR:
3704 		return (NFS3ERR_NOTDIR);
3705 	case EISDIR:
3706 		return (NFS3ERR_ISDIR);
3707 	case EINVAL:
3708 		return (NFS3ERR_INVAL);
3709 	case EFBIG:
3710 		return (NFS3ERR_FBIG);
3711 	case ENOSPC:
3712 		return (NFS3ERR_NOSPC);
3713 	case EROFS:
3714 		return (NFS3ERR_ROFS);
3715 	case EMLINK:
3716 		return (NFS3ERR_MLINK);
3717 	case ENAMETOOLONG:
3718 		return (NFS3ERR_NAMETOOLONG);
3719 	case ENOTEMPTY:
3720 		return (NFS3ERR_NOTEMPTY);
3721 	case EDQUOT:
3722 		return (NFS3ERR_DQUOT);
3723 	case ESTALE:
3724 		return (NFS3ERR_STALE);
3725 	case EREMOTE:
3726 		return (NFS3ERR_REMOTE);
3727 	case ENOSYS:
3728 	case EOPNOTSUPP:
3729 		return (NFS3ERR_NOTSUPP);
3730 	case EOVERFLOW:
3731 		return (NFS3ERR_INVAL);
3732 	default:
3733 		zcmn_err(getzoneid(), CE_WARN,
3734 		    "puterrno3: got error %d", error);
3735 		return ((enum nfsstat3)error);
3736 	}
3737 #else
3738 	switch (error) {
3739 	case ENAMETOOLONG:
3740 		return (NFS3ERR_NAMETOOLONG);
3741 	case ENOTEMPTY:
3742 		return (NFS3ERR_NOTEMPTY);
3743 	case EDQUOT:
3744 		return (NFS3ERR_DQUOT);
3745 	case ESTALE:
3746 		return (NFS3ERR_STALE);
3747 	case ENOSYS:
3748 	case EOPNOTSUPP:
3749 		return (NFS3ERR_NOTSUPP);
3750 	case EREMOTE:
3751 		return (NFS3ERR_REMOTE);
3752 	case EOVERFLOW:
3753 		return (NFS3ERR_INVAL);
3754 	default:
3755 		return ((enum nfsstat3)error);
3756 	}
3757 #endif
3758 }
3759 
3760 int
geterrno3(enum nfsstat3 status)3761 geterrno3(enum nfsstat3 status)
3762 {
3763 
3764 #ifdef DEBUG
3765 	switch (status) {
3766 	case NFS3_OK:
3767 		return (0);
3768 	case NFS3ERR_PERM:
3769 		return (EPERM);
3770 	case NFS3ERR_NOENT:
3771 		return (ENOENT);
3772 	case NFS3ERR_IO:
3773 		return (EIO);
3774 	case NFS3ERR_NXIO:
3775 		return (ENXIO);
3776 	case NFS3ERR_ACCES:
3777 		return (EACCES);
3778 	case NFS3ERR_EXIST:
3779 		return (EEXIST);
3780 	case NFS3ERR_XDEV:
3781 		return (EXDEV);
3782 	case NFS3ERR_NODEV:
3783 		return (ENODEV);
3784 	case NFS3ERR_NOTDIR:
3785 		return (ENOTDIR);
3786 	case NFS3ERR_ISDIR:
3787 		return (EISDIR);
3788 	case NFS3ERR_INVAL:
3789 		return (EINVAL);
3790 	case NFS3ERR_FBIG:
3791 		return (EFBIG);
3792 	case NFS3ERR_NOSPC:
3793 		return (ENOSPC);
3794 	case NFS3ERR_ROFS:
3795 		return (EROFS);
3796 	case NFS3ERR_MLINK:
3797 		return (EMLINK);
3798 	case NFS3ERR_NAMETOOLONG:
3799 		return (ENAMETOOLONG);
3800 	case NFS3ERR_NOTEMPTY:
3801 		return (ENOTEMPTY);
3802 	case NFS3ERR_DQUOT:
3803 		return (EDQUOT);
3804 	case NFS3ERR_STALE:
3805 		return (ESTALE);
3806 	case NFS3ERR_REMOTE:
3807 		return (EREMOTE);
3808 	case NFS3ERR_BADHANDLE:
3809 		return (ESTALE);
3810 	case NFS3ERR_NOT_SYNC:
3811 		return (EINVAL);
3812 	case NFS3ERR_BAD_COOKIE:
3813 		return (ENOENT);
3814 	case NFS3ERR_NOTSUPP:
3815 		return (EOPNOTSUPP);
3816 	case NFS3ERR_TOOSMALL:
3817 		return (EINVAL);
3818 	case NFS3ERR_SERVERFAULT:
3819 		return (EIO);
3820 	case NFS3ERR_BADTYPE:
3821 		return (EINVAL);
3822 	case NFS3ERR_JUKEBOX:
3823 		return (ENXIO);
3824 	default:
3825 		zcmn_err(getzoneid(), CE_WARN,
3826 		    "geterrno3: got status %d", status);
3827 		return ((int)status);
3828 	}
3829 #else
3830 	switch (status) {
3831 	case NFS3ERR_NAMETOOLONG:
3832 		return (ENAMETOOLONG);
3833 	case NFS3ERR_NOTEMPTY:
3834 		return (ENOTEMPTY);
3835 	case NFS3ERR_DQUOT:
3836 		return (EDQUOT);
3837 	case NFS3ERR_STALE:
3838 	case NFS3ERR_BADHANDLE:
3839 		return (ESTALE);
3840 	case NFS3ERR_NOTSUPP:
3841 		return (EOPNOTSUPP);
3842 	case NFS3ERR_REMOTE:
3843 		return (EREMOTE);
3844 	case NFS3ERR_NOT_SYNC:
3845 	case NFS3ERR_TOOSMALL:
3846 	case NFS3ERR_BADTYPE:
3847 		return (EINVAL);
3848 	case NFS3ERR_BAD_COOKIE:
3849 		return (ENOENT);
3850 	case NFS3ERR_SERVERFAULT:
3851 		return (EIO);
3852 	case NFS3ERR_JUKEBOX:
3853 		return (ENXIO);
3854 	default:
3855 		return ((int)status);
3856 	}
3857 #endif
3858 }
3859 
3860 rddir_cache *
rddir_cache_alloc(int flags)3861 rddir_cache_alloc(int flags)
3862 {
3863 	rddir_cache *rc;
3864 
3865 	rc = kmem_alloc(sizeof (*rc), flags);
3866 	if (rc != NULL) {
3867 		rc->entries = NULL;
3868 		rc->flags = RDDIR;
3869 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3870 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3871 		rc->count = 1;
3872 #ifdef DEBUG
3873 		atomic_inc_64(&clstat_debug.dirent.value.ui64);
3874 #endif
3875 	}
3876 	return (rc);
3877 }
3878 
3879 static void
rddir_cache_free(rddir_cache * rc)3880 rddir_cache_free(rddir_cache *rc)
3881 {
3882 
3883 #ifdef DEBUG
3884 	atomic_dec_64(&clstat_debug.dirent.value.ui64);
3885 #endif
3886 	if (rc->entries != NULL) {
3887 #ifdef DEBUG
3888 		rddir_cache_buf_free(rc->entries, rc->buflen);
3889 #else
3890 		kmem_free(rc->entries, rc->buflen);
3891 #endif
3892 	}
3893 	cv_destroy(&rc->cv);
3894 	mutex_destroy(&rc->lock);
3895 	kmem_free(rc, sizeof (*rc));
3896 }
3897 
3898 void
rddir_cache_hold(rddir_cache * rc)3899 rddir_cache_hold(rddir_cache *rc)
3900 {
3901 
3902 	mutex_enter(&rc->lock);
3903 	rc->count++;
3904 	mutex_exit(&rc->lock);
3905 }
3906 
3907 void
rddir_cache_rele(rddir_cache * rc)3908 rddir_cache_rele(rddir_cache *rc)
3909 {
3910 
3911 	mutex_enter(&rc->lock);
3912 	ASSERT(rc->count > 0);
3913 	if (--rc->count == 0) {
3914 		mutex_exit(&rc->lock);
3915 		rddir_cache_free(rc);
3916 	} else
3917 		mutex_exit(&rc->lock);
3918 }
3919 
3920 #ifdef DEBUG
3921 char *
rddir_cache_buf_alloc(size_t size,int flags)3922 rddir_cache_buf_alloc(size_t size, int flags)
3923 {
3924 	char *rc;
3925 
3926 	rc = kmem_alloc(size, flags);
3927 	if (rc != NULL)
3928 		atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3929 	return (rc);
3930 }
3931 
3932 void
rddir_cache_buf_free(void * addr,size_t size)3933 rddir_cache_buf_free(void *addr, size_t size)
3934 {
3935 
3936 	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3937 	kmem_free(addr, size);
3938 }
3939 #endif
3940 
3941 static int
nfs_free_data_reclaim(rnode_t * rp)3942 nfs_free_data_reclaim(rnode_t *rp)
3943 {
3944 	char *contents;
3945 	int size;
3946 	vsecattr_t *vsp;
3947 	nfs3_pathconf_info *info;
3948 	int freed;
3949 	cred_t *cred;
3950 
3951 	/*
3952 	 * Free any held credentials and caches which
3953 	 * may be associated with this rnode.
3954 	 */
3955 	mutex_enter(&rp->r_statelock);
3956 	cred = rp->r_cred;
3957 	rp->r_cred = NULL;
3958 	contents = rp->r_symlink.contents;
3959 	size = rp->r_symlink.size;
3960 	rp->r_symlink.contents = NULL;
3961 	vsp = rp->r_secattr;
3962 	rp->r_secattr = NULL;
3963 	info = rp->r_pathconf;
3964 	rp->r_pathconf = NULL;
3965 	mutex_exit(&rp->r_statelock);
3966 
3967 	if (cred != NULL)
3968 		crfree(cred);
3969 
3970 	/*
3971 	 * Free the access cache entries.
3972 	 */
3973 	freed = nfs_access_purge_rp(rp);
3974 
3975 	if (!HAVE_RDDIR_CACHE(rp) &&
3976 	    contents == NULL &&
3977 	    vsp == NULL &&
3978 	    info == NULL)
3979 		return (freed);
3980 
3981 	/*
3982 	 * Free the readdir cache entries
3983 	 */
3984 	if (HAVE_RDDIR_CACHE(rp))
3985 		nfs_purge_rddir_cache(RTOV(rp));
3986 
3987 	/*
3988 	 * Free the symbolic link cache.
3989 	 */
3990 	if (contents != NULL) {
3991 
3992 		kmem_free((void *)contents, size);
3993 	}
3994 
3995 	/*
3996 	 * Free any cached ACL.
3997 	 */
3998 	if (vsp != NULL)
3999 		nfs_acl_free(vsp);
4000 
4001 	/*
4002 	 * Free any cached pathconf information.
4003 	 */
4004 	if (info != NULL)
4005 		kmem_free(info, sizeof (*info));
4006 
4007 	return (1);
4008 }
4009 
4010 static int
nfs_active_data_reclaim(rnode_t * rp)4011 nfs_active_data_reclaim(rnode_t *rp)
4012 {
4013 	char *contents;
4014 	int size;
4015 	vsecattr_t *vsp;
4016 	nfs3_pathconf_info *info;
4017 	int freed;
4018 
4019 	/*
4020 	 * Free any held credentials and caches which
4021 	 * may be associated with this rnode.
4022 	 */
4023 	if (!mutex_tryenter(&rp->r_statelock))
4024 		return (0);
4025 	contents = rp->r_symlink.contents;
4026 	size = rp->r_symlink.size;
4027 	rp->r_symlink.contents = NULL;
4028 	vsp = rp->r_secattr;
4029 	rp->r_secattr = NULL;
4030 	info = rp->r_pathconf;
4031 	rp->r_pathconf = NULL;
4032 	mutex_exit(&rp->r_statelock);
4033 
4034 	/*
4035 	 * Free the access cache entries.
4036 	 */
4037 	freed = nfs_access_purge_rp(rp);
4038 
4039 	if (!HAVE_RDDIR_CACHE(rp) &&
4040 	    contents == NULL &&
4041 	    vsp == NULL &&
4042 	    info == NULL)
4043 		return (freed);
4044 
4045 	/*
4046 	 * Free the readdir cache entries
4047 	 */
4048 	if (HAVE_RDDIR_CACHE(rp))
4049 		nfs_purge_rddir_cache(RTOV(rp));
4050 
4051 	/*
4052 	 * Free the symbolic link cache.
4053 	 */
4054 	if (contents != NULL) {
4055 
4056 		kmem_free((void *)contents, size);
4057 	}
4058 
4059 	/*
4060 	 * Free any cached ACL.
4061 	 */
4062 	if (vsp != NULL)
4063 		nfs_acl_free(vsp);
4064 
4065 	/*
4066 	 * Free any cached pathconf information.
4067 	 */
4068 	if (info != NULL)
4069 		kmem_free(info, sizeof (*info));
4070 
4071 	return (1);
4072 }
4073 
4074 static int
nfs_free_reclaim(void)4075 nfs_free_reclaim(void)
4076 {
4077 	int freed;
4078 	rnode_t *rp;
4079 
4080 #ifdef DEBUG
4081 	clstat_debug.f_reclaim.value.ui64++;
4082 #endif
4083 	freed = 0;
4084 	mutex_enter(&rpfreelist_lock);
4085 	rp = rpfreelist;
4086 	if (rp != NULL) {
4087 		do {
4088 			if (nfs_free_data_reclaim(rp))
4089 				freed = 1;
4090 		} while ((rp = rp->r_freef) != rpfreelist);
4091 	}
4092 	mutex_exit(&rpfreelist_lock);
4093 	return (freed);
4094 }
4095 
4096 static int
nfs_active_reclaim(void)4097 nfs_active_reclaim(void)
4098 {
4099 	int freed;
4100 	int index;
4101 	rnode_t *rp;
4102 
4103 #ifdef DEBUG
4104 	clstat_debug.a_reclaim.value.ui64++;
4105 #endif
4106 	freed = 0;
4107 	for (index = 0; index < rtablesize; index++) {
4108 		rw_enter(&rtable[index].r_lock, RW_READER);
4109 		for (rp = rtable[index].r_hashf;
4110 		    rp != (rnode_t *)(&rtable[index]);
4111 		    rp = rp->r_hashf) {
4112 			if (nfs_active_data_reclaim(rp))
4113 				freed = 1;
4114 		}
4115 		rw_exit(&rtable[index].r_lock);
4116 	}
4117 	return (freed);
4118 }
4119 
4120 static int
nfs_rnode_reclaim(void)4121 nfs_rnode_reclaim(void)
4122 {
4123 	int freed;
4124 	rnode_t *rp;
4125 	vnode_t *vp;
4126 
4127 #ifdef DEBUG
4128 	clstat_debug.r_reclaim.value.ui64++;
4129 #endif
4130 	freed = 0;
4131 	mutex_enter(&rpfreelist_lock);
4132 	while ((rp = rpfreelist) != NULL) {
4133 		rp_rmfree(rp);
4134 		mutex_exit(&rpfreelist_lock);
4135 		if (rp->r_flags & RHASHED) {
4136 			vp = RTOV(rp);
4137 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4138 			mutex_enter(&vp->v_lock);
4139 			if (vp->v_count > 1) {
4140 				vp->v_count--;
4141 				mutex_exit(&vp->v_lock);
4142 				rw_exit(&rp->r_hashq->r_lock);
4143 				mutex_enter(&rpfreelist_lock);
4144 				continue;
4145 			}
4146 			mutex_exit(&vp->v_lock);
4147 			rp_rmhash_locked(rp);
4148 			rw_exit(&rp->r_hashq->r_lock);
4149 		}
4150 		/*
4151 		 * This call to rp_addfree will end up destroying the
4152 		 * rnode, but in a safe way with the appropriate set
4153 		 * of checks done.
4154 		 */
4155 		rp_addfree(rp, CRED());
4156 		mutex_enter(&rpfreelist_lock);
4157 	}
4158 	mutex_exit(&rpfreelist_lock);
4159 	return (freed);
4160 }
4161 
/*
 * Reclaim callback handed to kmem_cache_create() for rnode_cache in
 * nfs_subrinit().  Three steps are tried in order, stopping at the
 * first one that reports success: cached data of rnodes on the
 * freelist, cached data of hashed rnodes, and finally the rnodes on
 * the freelist themselves.
 */
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat_debug.reclaim.value.ui64++;
#endif
	if (nfs_free_reclaim() || nfs_active_reclaim())
		return;

	(void) nfs_rnode_reclaim();
}
4178 
4179 /*
4180  * NFS client failover support
4181  *
4182  * Routines to copy filehandles
4183  */
4184 void
nfscopyfh(caddr_t fhp,vnode_t * vp)4185 nfscopyfh(caddr_t fhp, vnode_t *vp)
4186 {
4187 	fhandle_t *dest = (fhandle_t *)fhp;
4188 
4189 	if (dest != NULL)
4190 		*dest = *VTOFH(vp);
4191 }
4192 
4193 void
nfs3copyfh(caddr_t fhp,vnode_t * vp)4194 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4195 {
4196 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4197 
4198 	if (dest != NULL)
4199 		*dest = *VTOFH3(vp);
4200 }
4201 
4202 /*
4203  * NFS client failover support
4204  *
4205  * failover_safe() will test various conditions to ensure that
4206  * failover is permitted for this vnode.  It will be denied
4207  * if:
4208  *	1) the operation in progress does not support failover (NULL fi)
4209  *	2) there are no available replicas (NULL mi_servers->sv_next)
4210  *	3) any locks are outstanding on this file
4211  */
4212 static int
failover_safe(failinfo_t * fi)4213 failover_safe(failinfo_t *fi)
4214 {
4215 
4216 	/*
4217 	 * Does this op permit failover?
4218 	 */
4219 	if (fi == NULL || fi->vp == NULL)
4220 		return (0);
4221 
4222 	/*
4223 	 * Are there any alternates to failover to?
4224 	 */
4225 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4226 		return (0);
4227 
4228 	/*
4229 	 * Disable check; we've forced local locking
4230 	 *
4231 	 * if (flk_has_remote_locks(fi->vp))
4232 	 *	return (0);
4233 	 */
4234 
4235 	/*
4236 	 * If we have no partial path, we can't do anything
4237 	 */
4238 	if (VTOR(fi->vp)->r_path == NULL)
4239 		return (0);
4240 
4241 	return (1);
4242 }
4243 
4244 #include <sys/thread.h>
4245 
4246 /*
4247  * NFS client failover support
4248  *
4249  * failover_newserver() will start a search for a new server,
4250  * preferably by starting an async thread to do the work.  If
4251  * someone is already doing this (recognizable by MI_BINDINPROG
4252  * being set), it will simply return and the calling thread
4253  * will queue on the mi_failover_cv condition variable.
4254  */
4255 static void
failover_newserver(mntinfo_t * mi)4256 failover_newserver(mntinfo_t *mi)
4257 {
4258 	/*
4259 	 * Check if someone else is doing this already
4260 	 */
4261 	mutex_enter(&mi->mi_lock);
4262 	if (mi->mi_flags & MI_BINDINPROG) {
4263 		mutex_exit(&mi->mi_lock);
4264 		return;
4265 	}
4266 	mi->mi_flags |= MI_BINDINPROG;
4267 
4268 	/*
4269 	 * Need to hold the vfs struct so that it can't be released
4270 	 * while the failover thread is selecting a new server.
4271 	 */
4272 	VFS_HOLD(mi->mi_vfsp);
4273 
4274 	/*
4275 	 * Start a thread to do the real searching.
4276 	 */
4277 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4278 
4279 	mutex_exit(&mi->mi_lock);
4280 }
4281 
4282 /*
4283  * NFS client failover support
4284  *
4285  * failover_thread() will find a new server to replace the one
4286  * currently in use, wake up other threads waiting on this mount
4287  * point, and die.  It will start at the head of the server list
4288  * and poll servers until it finds one with an NFS server which is
4289  * registered and responds to a NULL procedure ping.
4290  *
4291  * XXX failover_thread is unsafe within the scope of the
4292  * present model defined for cpr to suspend the system.
4293  * Specifically, over-the-wire calls made by the thread
4294  * are unsafe. The thread needs to be reevaluated in case of
4295  * future updates to the cpr suspend model.
4296  */
4297 static void
failover_thread(mntinfo_t * mi)4298 failover_thread(mntinfo_t *mi)
4299 {
4300 	servinfo_t *svp = NULL;
4301 	CLIENT *cl;
4302 	enum clnt_stat status;
4303 	struct timeval tv;
4304 	int error;
4305 	int oncethru = 0;
4306 	callb_cpr_t cprinfo;
4307 	rnode_t *rp;
4308 	int index;
4309 	char *srvnames;
4310 	size_t srvnames_len;
4311 	struct nfs_clnt *nfscl = NULL;
4312 	zoneid_t zoneid = getzoneid();
4313 
4314 #ifdef DEBUG
4315 	/*
4316 	 * This is currently only needed to access counters which exist on
4317 	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4318 	 * on non-DEBUG kernels.
4319 	 */
4320 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4321 	ASSERT(nfscl != NULL);
4322 #endif
4323 
4324 	/*
4325 	 * Its safe to piggyback on the mi_lock since failover_newserver()
4326 	 * code guarantees that there will be only one failover thread
4327 	 * per mountinfo at any instance.
4328 	 */
4329 	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4330 	    "failover_thread");
4331 
4332 	mutex_enter(&mi->mi_lock);
4333 	while (mi->mi_readers) {
4334 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4335 		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4336 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4337 	}
4338 	mutex_exit(&mi->mi_lock);
4339 
4340 	tv.tv_sec = 2;
4341 	tv.tv_usec = 0;
4342 
4343 	/*
4344 	 * Ping the null NFS procedure of every server in
4345 	 * the list until one responds.  We always start
4346 	 * at the head of the list and always skip the one
4347 	 * that is current, since it's caused us a problem.
4348 	 */
4349 	while (svp == NULL) {
4350 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4351 			if (!oncethru && svp == mi->mi_curr_serv)
4352 				continue;
4353 
4354 			/*
4355 			 * If the file system was forcibly umounted
4356 			 * while trying to do a failover, then just
4357 			 * give up on the failover.  It won't matter
4358 			 * what the server is.
4359 			 */
4360 			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4361 				svp = NULL;
4362 				goto done;
4363 			}
4364 
4365 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4366 			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4367 			if (error)
4368 				continue;
4369 
4370 			if (!(mi->mi_flags & MI_INT))
4371 				cl->cl_nosignal = TRUE;
4372 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4373 			    xdr_void, NULL, tv);
4374 			if (!(mi->mi_flags & MI_INT))
4375 				cl->cl_nosignal = FALSE;
4376 			AUTH_DESTROY(cl->cl_auth);
4377 			CLNT_DESTROY(cl);
4378 			if (status == RPC_SUCCESS) {
4379 				if (svp == mi->mi_curr_serv) {
4380 #ifdef DEBUG
4381 					zcmn_err(zoneid, CE_NOTE,
4382 			"NFS%d: failing over: selecting original server %s",
4383 					    mi->mi_vers, svp->sv_hostname);
4384 #else
4385 					zcmn_err(zoneid, CE_NOTE,
4386 			"NFS: failing over: selecting original server %s",
4387 					    svp->sv_hostname);
4388 #endif
4389 				} else {
4390 #ifdef DEBUG
4391 					zcmn_err(zoneid, CE_NOTE,
4392 				    "NFS%d: failing over from %s to %s",
4393 					    mi->mi_vers,
4394 					    mi->mi_curr_serv->sv_hostname,
4395 					    svp->sv_hostname);
4396 #else
4397 					zcmn_err(zoneid, CE_NOTE,
4398 				    "NFS: failing over from %s to %s",
4399 					    mi->mi_curr_serv->sv_hostname,
4400 					    svp->sv_hostname);
4401 #endif
4402 				}
4403 				break;
4404 			}
4405 		}
4406 
4407 		if (svp == NULL) {
4408 			if (!oncethru) {
4409 				srvnames = nfs_getsrvnames(mi, &srvnames_len);
4410 #ifdef DEBUG
4411 				zprintf(zoneid,
4412 				    "NFS%d servers %s not responding "
4413 				    "still trying\n", mi->mi_vers, srvnames);
4414 #else
4415 				zprintf(zoneid, "NFS servers %s not responding "
4416 				    "still trying\n", srvnames);
4417 #endif
4418 				oncethru = 1;
4419 			}
4420 			mutex_enter(&mi->mi_lock);
4421 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
4422 			mutex_exit(&mi->mi_lock);
4423 			delay(hz);
4424 			mutex_enter(&mi->mi_lock);
4425 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4426 			mutex_exit(&mi->mi_lock);
4427 		}
4428 	}
4429 
4430 	if (oncethru) {
4431 #ifdef DEBUG
4432 		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4433 #else
4434 		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4435 #endif
4436 	}
4437 
4438 	if (svp != mi->mi_curr_serv) {
4439 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4440 		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4441 		rw_enter(&rtable[index].r_lock, RW_WRITER);
4442 		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4443 		    mi->mi_vfsp);
4444 		if (rp != NULL) {
4445 			if (rp->r_flags & RHASHED)
4446 				rp_rmhash_locked(rp);
4447 			rw_exit(&rtable[index].r_lock);
4448 			rp->r_server = svp;
4449 			rp->r_fh = svp->sv_fhandle;
4450 			(void) nfs_free_data_reclaim(rp);
4451 			index = rtablehash(&rp->r_fh);
4452 			rp->r_hashq = &rtable[index];
4453 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4454 			vn_exists(RTOV(rp));
4455 			rp_addhash(rp);
4456 			rw_exit(&rp->r_hashq->r_lock);
4457 			VN_RELE(RTOV(rp));
4458 		} else
4459 			rw_exit(&rtable[index].r_lock);
4460 	}
4461 
4462 done:
4463 	if (oncethru)
4464 		kmem_free(srvnames, srvnames_len);
4465 	mutex_enter(&mi->mi_lock);
4466 	mi->mi_flags &= ~MI_BINDINPROG;
4467 	if (svp != NULL) {
4468 		mi->mi_curr_serv = svp;
4469 		mi->mi_failover++;
4470 #ifdef DEBUG
4471 	nfscl->nfscl_stat.failover.value.ui64++;
4472 #endif
4473 	}
4474 	cv_broadcast(&mi->mi_failover_cv);
4475 	CALLB_CPR_EXIT(&cprinfo);
4476 	VFS_RELE(mi->mi_vfsp);
4477 	zthread_exit();
4478 	/* NOTREACHED */
4479 }
4480 
4481 /*
4482  * NFS client failover support
4483  *
4484  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4485  * is cleared, meaning that failover is complete.  Called with
4486  * mi_lock mutex held.
4487  */
4488 static int
failover_wait(mntinfo_t * mi)4489 failover_wait(mntinfo_t *mi)
4490 {
4491 	k_sigset_t smask;
4492 
4493 	/*
4494 	 * If someone else is hunting for a living server,
4495 	 * sleep until it's done.  After our sleep, we may
4496 	 * be bound to the right server and get off cheaply.
4497 	 */
4498 	while (mi->mi_flags & MI_BINDINPROG) {
4499 		/*
4500 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4501 		 * and SIGTERM. (Preserving the existing masks).
4502 		 * Mask out SIGINT if mount option nointr is specified.
4503 		 */
4504 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4505 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4506 			/*
4507 			 * restore original signal mask
4508 			 */
4509 			sigunintr(&smask);
4510 			return (EINTR);
4511 		}
4512 		/*
4513 		 * restore original signal mask
4514 		 */
4515 		sigunintr(&smask);
4516 	}
4517 	return (0);
4518 }
4519 
4520 /*
4521  * NFS client failover support
4522  *
4523  * failover_remap() will do a partial pathname lookup and find the
4524  * desired vnode on the current server.  The interim vnode will be
4525  * discarded after we pilfer the new filehandle.
4526  *
4527  * Side effects:
4528  * - This routine will also update the filehandle in the args structure
4529  *    pointed to by the fi->fhp pointer if it is non-NULL.
4530  */
4531 
4532 static int
failover_remap(failinfo_t * fi)4533 failover_remap(failinfo_t *fi)
4534 {
4535 	vnode_t *vp, *nvp, *rootvp;
4536 	rnode_t *rp, *nrp;
4537 	mntinfo_t *mi;
4538 	int error;
4539 #ifdef DEBUG
4540 	struct nfs_clnt *nfscl;
4541 
4542 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4543 	ASSERT(nfscl != NULL);
4544 #endif
4545 	/*
4546 	 * Sanity check
4547 	 */
4548 	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4549 		return (EINVAL);
4550 	vp = fi->vp;
4551 	rp = VTOR(vp);
4552 	mi = VTOMI(vp);
4553 
4554 	if (!(vp->v_flag & VROOT)) {
4555 		/*
4556 		 * Given the root fh, use the path stored in
4557 		 * the rnode to find the fh for the new server.
4558 		 */
4559 		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4560 		if (error)
4561 			return (error);
4562 
4563 		error = failover_lookup(rp->r_path, rootvp,
4564 		    fi->lookupproc, fi->xattrdirproc, &nvp);
4565 
4566 		VN_RELE(rootvp);
4567 
4568 		if (error)
4569 			return (error);
4570 
4571 		/*
4572 		 * If we found the same rnode, we're done now
4573 		 */
4574 		if (nvp == vp) {
4575 			/*
4576 			 * Failed and the new server may physically be same
4577 			 * OR may share a same disk subsystem. In this case
4578 			 * file handle for a particular file path is not going
4579 			 * to change, given the same filehandle lookup will
4580 			 * always locate the same rnode as the existing one.
4581 			 * All we might need to do is to update the r_server
4582 			 * with the current servinfo.
4583 			 */
4584 			if (!VALID_FH(fi)) {
4585 				rp->r_server = mi->mi_curr_serv;
4586 			}
4587 			VN_RELE(nvp);
4588 			return (0);
4589 		}
4590 
4591 		/*
4592 		 * Try to make it so that no one else will find this
4593 		 * vnode because it is just a temporary to hold the
4594 		 * new file handle until that file handle can be
4595 		 * copied to the original vnode/rnode.
4596 		 */
4597 		nrp = VTOR(nvp);
4598 		mutex_enter(&mi->mi_remap_lock);
4599 		/*
4600 		 * Some other thread could have raced in here and could
4601 		 * have done the remap for this particular rnode before
4602 		 * this thread here. Check for rp->r_server and
4603 		 * mi->mi_curr_serv and return if they are same.
4604 		 */
4605 		if (VALID_FH(fi)) {
4606 			mutex_exit(&mi->mi_remap_lock);
4607 			VN_RELE(nvp);
4608 			return (0);
4609 		}
4610 
4611 		if (nrp->r_flags & RHASHED)
4612 			rp_rmhash(nrp);
4613 
4614 		/*
4615 		 * As a heuristic check on the validity of the new
4616 		 * file, check that the size and type match against
4617 		 * that we remember from the old version.
4618 		 */
4619 		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4620 			mutex_exit(&mi->mi_remap_lock);
4621 			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4622 			    "NFS replicas %s and %s: file %s not same.",
4623 			    rp->r_server->sv_hostname,
4624 			    nrp->r_server->sv_hostname, rp->r_path);
4625 			VN_RELE(nvp);
4626 			return (EINVAL);
4627 		}
4628 
4629 		/*
4630 		 * snarf the filehandle from the new rnode
4631 		 * then release it, again while updating the
4632 		 * hash queues for the rnode.
4633 		 */
4634 		if (rp->r_flags & RHASHED)
4635 			rp_rmhash(rp);
4636 		rp->r_server = mi->mi_curr_serv;
4637 		rp->r_fh = nrp->r_fh;
4638 		rp->r_hashq = nrp->r_hashq;
4639 		/*
4640 		 * Copy the attributes from the new rnode to the old
4641 		 * rnode.  This will help to reduce unnecessary page
4642 		 * cache flushes.
4643 		 */
4644 		rp->r_attr = nrp->r_attr;
4645 		rp->r_attrtime = nrp->r_attrtime;
4646 		rp->r_mtime = nrp->r_mtime;
4647 		(void) nfs_free_data_reclaim(rp);
4648 		nfs_setswaplike(vp, &rp->r_attr);
4649 		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4650 		rp_addhash(rp);
4651 		rw_exit(&rp->r_hashq->r_lock);
4652 		mutex_exit(&mi->mi_remap_lock);
4653 		VN_RELE(nvp);
4654 	}
4655 
4656 	/*
4657 	 * Update successful failover remap count
4658 	 */
4659 	mutex_enter(&mi->mi_lock);
4660 	mi->mi_remap++;
4661 	mutex_exit(&mi->mi_lock);
4662 #ifdef DEBUG
4663 	nfscl->nfscl_stat.remap.value.ui64++;
4664 #endif
4665 
4666 	/*
4667 	 * If we have a copied filehandle to update, do it now.
4668 	 */
4669 	if (fi->fhp != NULL && fi->copyproc != NULL)
4670 		(*fi->copyproc)(fi->fhp, vp);
4671 
4672 	return (0);
4673 }
4674 
4675 /*
4676  * NFS client failover support
4677  *
4678  * We want a simple pathname lookup routine to parse the pieces
4679  * of path in rp->r_path.  We know that the path was a created
4680  * as rnodes were made, so we know we have only to deal with
4681  * paths that look like:
4682  *	dir1/dir2/dir3/file
4683  * Any evidence of anything like .., symlinks, and ENOTDIR
4684  * are hard errors, because they mean something in this filesystem
4685  * is different from the one we came from, or has changed under
4686  * us in some way.  If this is true, we want the failure.
4687  *
4688  * Extended attributes: if the filesystem is mounted with extended
4689  * attributes enabled (-o xattr), the attribute directory will be
4690  * represented in the r_path as the magic name XATTR_RPATH. So if
4691  * we see that name in the pathname, is must be because this node
4692  * is an extended attribute.  Therefore, look it up that way.
4693  */
4694 static int
failover_lookup(char * path,vnode_t * root,int (* lookupproc)(vnode_t *,char *,vnode_t **,struct pathname *,int,vnode_t *,cred_t *,int),int (* xattrdirproc)(vnode_t *,vnode_t **,bool_t,cred_t *,int),vnode_t ** new)4695 failover_lookup(char *path, vnode_t *root,
4696     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4697     vnode_t *, cred_t *, int),
4698     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4699     vnode_t **new)
4700 {
4701 	vnode_t *dvp, *nvp;
4702 	int error = EINVAL;
4703 	char *s, *p, *tmppath;
4704 	size_t len;
4705 	mntinfo_t *mi;
4706 	bool_t xattr;
4707 
4708 	/* Make local copy of path */
4709 	len = strlen(path) + 1;
4710 	tmppath = kmem_alloc(len, KM_SLEEP);
4711 	(void) strcpy(tmppath, path);
4712 	s = tmppath;
4713 
4714 	dvp = root;
4715 	VN_HOLD(dvp);
4716 	mi = VTOMI(root);
4717 	xattr = mi->mi_flags & MI_EXTATTR;
4718 
4719 	do {
4720 		p = strchr(s, '/');
4721 		if (p != NULL)
4722 			*p = '\0';
4723 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4724 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4725 			    RFSCALL_SOFT);
4726 		} else {
4727 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4728 			    CRED(), RFSCALL_SOFT);
4729 		}
4730 		if (p != NULL)
4731 			*p++ = '/';
4732 		if (error) {
4733 			VN_RELE(dvp);
4734 			kmem_free(tmppath, len);
4735 			return (error);
4736 		}
4737 		s = p;
4738 		VN_RELE(dvp);
4739 		dvp = nvp;
4740 	} while (p != NULL);
4741 
4742 	if (nvp != NULL && new != NULL)
4743 		*new = nvp;
4744 	kmem_free(tmppath, len);
4745 	return (0);
4746 }
4747 
4748 /*
4749  * NFS client failover support
4750  *
4751  * sv_free() frees the malloc'd portion of a "servinfo_t".
4752  */
4753 void
sv_free(servinfo_t * svp)4754 sv_free(servinfo_t *svp)
4755 {
4756 	servinfo_t *next;
4757 	struct knetconfig *knconf;
4758 
4759 	while (svp != NULL) {
4760 		next = svp->sv_next;
4761 		if (svp->sv_secdata)
4762 			sec_clnt_freeinfo(svp->sv_secdata);
4763 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4764 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4765 		knconf = svp->sv_knconf;
4766 		if (knconf != NULL) {
4767 			if (knconf->knc_protofmly != NULL)
4768 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4769 			if (knconf->knc_proto != NULL)
4770 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4771 			kmem_free(knconf, sizeof (*knconf));
4772 		}
4773 		knconf = svp->sv_origknconf;
4774 		if (knconf != NULL) {
4775 			if (knconf->knc_protofmly != NULL)
4776 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4777 			if (knconf->knc_proto != NULL)
4778 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4779 			kmem_free(knconf, sizeof (*knconf));
4780 		}
4781 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4782 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4783 		mutex_destroy(&svp->sv_lock);
4784 		kmem_free(svp, sizeof (*svp));
4785 		svp = next;
4786 	}
4787 }
4788 
4789 /*
4790  * Only can return non-zero if intr != 0.
4791  */
4792 int
nfs_rw_enter_sig(nfs_rwlock_t * l,krw_t rw,int intr)4793 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4794 {
4795 
4796 	mutex_enter(&l->lock);
4797 
4798 	/*
4799 	 * If this is a nested enter, then allow it.  There
4800 	 * must be as many exits as enters through.
4801 	 */
4802 	if (l->owner == curthread) {
4803 		/* lock is held for writing by current thread */
4804 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4805 		l->count--;
4806 	} else if (rw == RW_READER) {
4807 		/*
4808 		 * While there is a writer active or writers waiting,
4809 		 * then wait for them to finish up and move on.  Then,
4810 		 * increment the count to indicate that a reader is
4811 		 * active.
4812 		 */
4813 		while (l->count < 0 || l->waiters > 0) {
4814 			if (intr) {
4815 				klwp_t *lwp = ttolwp(curthread);
4816 
4817 				if (lwp != NULL)
4818 					lwp->lwp_nostop++;
4819 				if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4820 					if (lwp != NULL)
4821 						lwp->lwp_nostop--;
4822 					mutex_exit(&l->lock);
4823 					return (EINTR);
4824 				}
4825 				if (lwp != NULL)
4826 					lwp->lwp_nostop--;
4827 			} else
4828 				cv_wait(&l->cv_rd, &l->lock);
4829 		}
4830 		ASSERT(l->count < INT_MAX);
4831 #ifdef	DEBUG
4832 		if ((l->count % 10000) == 9999)
4833 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4834 			    "rwlock @ %p\n", l->count, (void *)&l);
4835 #endif
4836 		l->count++;
4837 	} else {
4838 		ASSERT(rw == RW_WRITER);
4839 		/*
4840 		 * While there are readers active or a writer
4841 		 * active, then wait for all of the readers
4842 		 * to finish or for the writer to finish.
4843 		 * Then, set the owner field to curthread and
4844 		 * decrement count to indicate that a writer
4845 		 * is active.
4846 		 */
4847 		while (l->count != 0) {
4848 			l->waiters++;
4849 			if (intr) {
4850 				klwp_t *lwp = ttolwp(curthread);
4851 
4852 				if (lwp != NULL)
4853 					lwp->lwp_nostop++;
4854 				if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4855 					if (lwp != NULL)
4856 						lwp->lwp_nostop--;
4857 					l->waiters--;
4858 					/*
4859 					 * If there are readers active and no
4860 					 * writers waiting then wake up all of
4861 					 * the waiting readers (if any).
4862 					 */
4863 					if (l->count > 0 && l->waiters == 0)
4864 						cv_broadcast(&l->cv_rd);
4865 					mutex_exit(&l->lock);
4866 					return (EINTR);
4867 				}
4868 				if (lwp != NULL)
4869 					lwp->lwp_nostop--;
4870 			} else
4871 				cv_wait(&l->cv, &l->lock);
4872 			l->waiters--;
4873 		}
4874 		ASSERT(l->owner == NULL);
4875 		l->owner = curthread;
4876 		l->count--;
4877 	}
4878 
4879 	mutex_exit(&l->lock);
4880 
4881 	return (0);
4882 }
4883 
4884 /*
4885  * If the lock is available, obtain it and return non-zero.  If there is
4886  * already a conflicting lock, return 0 immediately.
4887  */
4888 
4889 int
nfs_rw_tryenter(nfs_rwlock_t * l,krw_t rw)4890 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4891 {
4892 	mutex_enter(&l->lock);
4893 
4894 	/*
4895 	 * If this is a nested enter, then allow it.  There
4896 	 * must be as many exits as enters through.
4897 	 */
4898 	if (l->owner == curthread) {
4899 		/* lock is held for writing by current thread */
4900 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4901 		l->count--;
4902 	} else if (rw == RW_READER) {
4903 		/*
4904 		 * If there is a writer active or writers waiting, deny the
4905 		 * lock.  Otherwise, bump the count of readers.
4906 		 */
4907 		if (l->count < 0 || l->waiters > 0) {
4908 			mutex_exit(&l->lock);
4909 			return (0);
4910 		}
4911 		l->count++;
4912 	} else {
4913 		ASSERT(rw == RW_WRITER);
4914 		/*
4915 		 * If there are readers active or a writer active, deny the
4916 		 * lock.  Otherwise, set the owner field to curthread and
4917 		 * decrement count to indicate that a writer is active.
4918 		 */
4919 		if (l->count != 0) {
4920 			mutex_exit(&l->lock);
4921 			return (0);
4922 		}
4923 		ASSERT(l->owner == NULL);
4924 		l->owner = curthread;
4925 		l->count--;
4926 	}
4927 
4928 	mutex_exit(&l->lock);
4929 
4930 	return (1);
4931 }
4932 
4933 void
nfs_rw_exit(nfs_rwlock_t * l)4934 nfs_rw_exit(nfs_rwlock_t *l)
4935 {
4936 
4937 	mutex_enter(&l->lock);
4938 
4939 	if (l->owner != NULL) {
4940 		ASSERT(l->owner == curthread);
4941 
4942 		/*
4943 		 * To release a writer lock increment count to indicate that
4944 		 * there is one less writer active.  If this was the last of
4945 		 * possibly nested writer locks, then clear the owner field as
4946 		 * well to indicate that there is no writer active.
4947 		 */
4948 		ASSERT(l->count < 0);
4949 		l->count++;
4950 		if (l->count == 0) {
4951 			l->owner = NULL;
4952 
4953 			/*
4954 			 * If there are no writers waiting then wakeup all of
4955 			 * the waiting readers (if any).
4956 			 */
4957 			if (l->waiters == 0)
4958 				cv_broadcast(&l->cv_rd);
4959 		}
4960 	} else {
4961 		/*
4962 		 * To release a reader lock just decrement count to indicate
4963 		 * that there is one less reader active.
4964 		 */
4965 		ASSERT(l->count > 0);
4966 		l->count--;
4967 	}
4968 
4969 	/*
4970 	 * If there are no readers active nor a writer active and there is a
4971 	 * writer waiting we need to wake up it.
4972 	 */
4973 	if (l->count == 0 && l->waiters > 0)
4974 		cv_signal(&l->cv);
4975 	mutex_exit(&l->lock);
4976 }
4977 
4978 int
nfs_rw_lock_held(nfs_rwlock_t * l,krw_t rw)4979 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4980 {
4981 
4982 	if (rw == RW_READER)
4983 		return (l->count > 0);
4984 	ASSERT(rw == RW_WRITER);
4985 	return (l->count < 0);
4986 }
4987 
4988 /* ARGSUSED */
4989 void
nfs_rw_init(nfs_rwlock_t * l,char * name,krw_type_t type,void * arg)4990 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4991 {
4992 
4993 	l->count = 0;
4994 	l->waiters = 0;
4995 	l->owner = NULL;
4996 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4997 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4998 	cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
4999 }
5000 
5001 void
nfs_rw_destroy(nfs_rwlock_t * l)5002 nfs_rw_destroy(nfs_rwlock_t *l)
5003 {
5004 
5005 	mutex_destroy(&l->lock);
5006 	cv_destroy(&l->cv);
5007 	cv_destroy(&l->cv_rd);
5008 }
5009 
5010 int
nfs3_rddir_compar(const void * x,const void * y)5011 nfs3_rddir_compar(const void *x, const void *y)
5012 {
5013 	rddir_cache *a = (rddir_cache *)x;
5014 	rddir_cache *b = (rddir_cache *)y;
5015 
5016 	if (a->nfs3_cookie == b->nfs3_cookie) {
5017 		if (a->buflen == b->buflen)
5018 			return (0);
5019 		if (a->buflen < b->buflen)
5020 			return (-1);
5021 		return (1);
5022 	}
5023 
5024 	if (a->nfs3_cookie < b->nfs3_cookie)
5025 		return (-1);
5026 
5027 	return (1);
5028 }
5029 
5030 int
nfs_rddir_compar(const void * x,const void * y)5031 nfs_rddir_compar(const void *x, const void *y)
5032 {
5033 	rddir_cache *a = (rddir_cache *)x;
5034 	rddir_cache *b = (rddir_cache *)y;
5035 
5036 	if (a->nfs_cookie == b->nfs_cookie) {
5037 		if (a->buflen == b->buflen)
5038 			return (0);
5039 		if (a->buflen < b->buflen)
5040 			return (-1);
5041 		return (1);
5042 	}
5043 
5044 	if (a->nfs_cookie < b->nfs_cookie)
5045 		return (-1);
5046 
5047 	return (1);
5048 }
5049 
5050 static char *
nfs_getsrvnames(mntinfo_t * mi,size_t * len)5051 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
5052 {
5053 	servinfo_t *s;
5054 	char *srvnames;
5055 	char *namep;
5056 	size_t length;
5057 
5058 	/*
5059 	 * Calculate the length of the string required to hold all
5060 	 * of the server names plus either a comma or a null
5061 	 * character following each individual one.
5062 	 */
5063 	length = 0;
5064 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5065 		length += s->sv_hostnamelen;
5066 
5067 	srvnames = kmem_alloc(length, KM_SLEEP);
5068 
5069 	namep = srvnames;
5070 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5071 		(void) strcpy(namep, s->sv_hostname);
5072 		namep += s->sv_hostnamelen - 1;
5073 		*namep++ = ',';
5074 	}
5075 	*--namep = '\0';
5076 
5077 	*len = length;
5078 
5079 	return (srvnames);
5080 }
5081 
5082 /*
5083  * These two functions are temporary and designed for the upgrade-workaround
5084  * only.  They cannot be used for general zone-crossing NFS client support, and
5085  * will be removed shortly.
5086  *
5087  * When the workaround is enabled, all NFS traffic is forced into the global
5088  * zone.  These functions are called when the code needs to refer to the state
5089  * of the underlying network connection.  They're not called when the function
5090  * needs to refer to the state of the process that invoked the system call.
5091  * (E.g., when checking whether the zone is shutting down during the mount()
5092  * call.)
5093  */
5094 
5095 struct zone *
nfs_zone(void)5096 nfs_zone(void)
5097 {
5098 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5099 }
5100 
5101 zoneid_t
nfs_zoneid(void)5102 nfs_zoneid(void)
5103 {
5104 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5105 }
5106 
5107 /*
5108  * nfs_mount_label_policy:
5109  *	Determine whether the mount is allowed according to MAC check,
5110  *	by comparing (where appropriate) label of the remote server
5111  *	against the label of the zone being mounted into.
5112  *
5113  *	Returns:
5114  *		 0 :	access allowed
5115  *		-1 :	read-only access allowed (i.e., read-down)
5116  *		>0 :	error code, such as EACCES
5117  */
5118 int
nfs_mount_label_policy(vfs_t * vfsp,struct netbuf * addr,struct knetconfig * knconf,cred_t * cr)5119 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5120     struct knetconfig *knconf, cred_t *cr)
5121 {
5122 	int		addr_type;
5123 	void		*ipaddr;
5124 	bslabel_t	*server_sl, *mntlabel;
5125 	zone_t		*mntzone = NULL;
5126 	ts_label_t	*zlabel;
5127 	tsol_tpc_t	*tp;
5128 	ts_label_t	*tsl = NULL;
5129 	int		retv;
5130 
5131 	/*
5132 	 * Get the zone's label.  Each zone on a labeled system has a label.
5133 	 */
5134 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5135 	zlabel = mntzone->zone_slabel;
5136 	ASSERT(zlabel != NULL);
5137 	label_hold(zlabel);
5138 
5139 	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5140 		addr_type = IPV4_VERSION;
5141 		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5142 	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5143 		addr_type = IPV6_VERSION;
5144 		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5145 	} else {
5146 		retv = 0;
5147 		goto out;
5148 	}
5149 
5150 	retv = EACCES;				/* assume the worst */
5151 
5152 	/*
5153 	 * Next, get the assigned label of the remote server.
5154 	 */
5155 	tp = find_tpc(ipaddr, addr_type, B_FALSE);
5156 	if (tp == NULL)
5157 		goto out;			/* error getting host entry */
5158 
5159 	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5160 		goto rel_tpc;			/* invalid domain */
5161 	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5162 	    (tp->tpc_tp.host_type != UNLABELED))
5163 		goto rel_tpc;			/* invalid hosttype */
5164 
5165 	if (tp->tpc_tp.host_type == SUN_CIPSO) {
5166 		tsl = getflabel_cipso(vfsp);
5167 		if (tsl == NULL)
5168 			goto rel_tpc;		/* error getting server lbl */
5169 
5170 		server_sl = label2bslabel(tsl);
5171 	} else {	/* UNLABELED */
5172 		server_sl = &tp->tpc_tp.tp_def_label;
5173 	}
5174 
5175 	mntlabel = label2bslabel(zlabel);
5176 
5177 	/*
5178 	 * Now compare labels to complete the MAC check.  If the labels
5179 	 * are equal or if the requestor is in the global zone and has
5180 	 * NET_MAC_AWARE, then allow read-write access.   (Except for
5181 	 * mounts into the global zone itself; restrict these to
5182 	 * read-only.)
5183 	 *
5184 	 * If the requestor is in some other zone, but his label
5185 	 * dominates the server, then allow read-down.
5186 	 *
5187 	 * Otherwise, access is denied.
5188 	 */
5189 	if (blequal(mntlabel, server_sl) ||
5190 	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
5191 	    getpflags(NET_MAC_AWARE, cr) != 0)) {
5192 		if ((mntzone == global_zone) ||
5193 		    !blequal(mntlabel, server_sl))
5194 			retv = -1;		/* read-only */
5195 		else
5196 			retv = 0;		/* access OK */
5197 	} else if (bldominates(mntlabel, server_sl)) {
5198 		retv = -1;			/* read-only */
5199 	} else {
5200 		retv = EACCES;
5201 	}
5202 
5203 	if (tsl != NULL)
5204 		label_rele(tsl);
5205 
5206 rel_tpc:
5207 	TPC_RELE(tp);
5208 out:
5209 	if (mntzone)
5210 		zone_rele(mntzone);
5211 	label_rele(zlabel);
5212 	return (retv);
5213 }
5214 
5215 boolean_t
nfs_has_ctty(void)5216 nfs_has_ctty(void)
5217 {
5218 	boolean_t rv;
5219 	mutex_enter(&curproc->p_splock);
5220 	rv = (curproc->p_sessp->s_vp != NULL);
5221 	mutex_exit(&curproc->p_splock);
5222 	return (rv);
5223 }
5224 
5225 /*
5226  * See if xattr directory to see if it has any generic user attributes
5227  */
5228 int
do_xattr_exists_check(vnode_t * vp,ulong_t * valp,cred_t * cr)5229 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5230 {
5231 	struct uio uio;
5232 	struct iovec iov;
5233 	char *dbuf;
5234 	struct dirent64 *dp;
5235 	size_t dlen = 8 * 1024;
5236 	size_t dbuflen;
5237 	int eof = 0;
5238 	int error;
5239 
5240 	*valp = 0;
5241 	dbuf = kmem_alloc(dlen, KM_SLEEP);
5242 	uio.uio_iov = &iov;
5243 	uio.uio_iovcnt = 1;
5244 	uio.uio_segflg = UIO_SYSSPACE;
5245 	uio.uio_fmode = 0;
5246 	uio.uio_extflg = UIO_COPY_CACHED;
5247 	uio.uio_loffset = 0;
5248 	uio.uio_resid = dlen;
5249 	iov.iov_base = dbuf;
5250 	iov.iov_len = dlen;
5251 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5252 	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5253 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5254 
5255 	dbuflen = dlen - uio.uio_resid;
5256 
5257 	if (error || dbuflen == 0) {
5258 		kmem_free(dbuf, dlen);
5259 		return (error);
5260 	}
5261 
5262 	dp = (dirent64_t *)dbuf;
5263 
5264 	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5265 		if (strcmp(dp->d_name, ".") == 0 ||
5266 		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5267 		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5268 		    VIEW_READONLY) == 0) {
5269 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5270 			continue;
5271 		}
5272 
5273 		*valp = 1;
5274 		break;
5275 	}
5276 	kmem_free(dbuf, dlen);
5277 	return (0);
5278 }
5279