xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_subr.c (revision 7014882c6a3672fd0e5d60200af8643ae53c5928)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27  */
28 
29 /*
30  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
31  *	All Rights Reserved
32  */
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cmn_err.h>
38 #include <sys/vtrace.h>
39 #include <sys/session.h>
40 #include <sys/thread.h>
41 #include <sys/dnlc.h>
42 #include <sys/cred.h>
43 #include <sys/priv.h>
44 #include <sys/list.h>
45 #include <sys/sdt.h>
46 #include <sys/policy.h>
47 
48 #include <rpc/types.h>
49 #include <rpc/xdr.h>
50 
51 #include <nfs/nfs.h>
52 
53 #include <nfs/nfs_clnt.h>
54 
55 #include <nfs/nfs4.h>
56 #include <nfs/rnode4.h>
57 #include <nfs/nfs4_clnt.h>
58 
59 /* utf8-checking variables */
60 #define	UTF8_TAIL_MASK		0xc0
61 #define	UTF8_TAIL_SIGNATURE	0x80
62 #define	UTF8_TAIL_SHIFT		6
63 #define	UTF16_SURROGATE_LOW	0xd800
64 #define	UTF16_SURROGATE_HIGH	0xdfff
65 #define	UNICODE_INVAL_1		0xfffe
66 #define	UNICODE_INVAL_2		0xffff
67 
/*
 * Table of the six legal UTF-8 leading-byte forms (RFC 2279).
 * A leading byte matches an entry when (byte & mask) == signature;
 * tail_bytes continuation bytes must follow, and min_val is the
 * smallest code point that form may legally encode (used by
 * utf8_name_verify() to reject overlong encodings).
 */
typedef struct {
	unsigned char mask;		/* and-mask applied to leading byte */
	unsigned char signature;	/* expected value of (byte & mask) */
	unsigned int  min_val;		/* smallest legal code point */
	unsigned char tail_bytes;	/* number of continuation bytes */
} utf8_encoding_table;

static utf8_encoding_table utf8_table[] = {
	{ 0x80,	0x00,	0x00000000,	0 }, /* 1 byte */
	{ 0xe0,	0xc0,	0x00000080,	1 }, /* 2 bytes */
	{ 0xf0,	0xe0,	0x00000800,	2 }, /* 3 bytes */
	{ 0xf8,	0xf0,	0x00010000,	3 }, /* 4 bytes */
	{ 0xfc,	0xf8,	0x00200000, 	4 }, /* 5 bytes */
	{ 0xfe,	0xfc,	0x04000000,	5 }, /* 6 bytes */
	{ 0,	0,	0,		0 }, /* list terminator */
};
84 
85 
/*
 * Client side statistics.
 *
 * clstat4_tmpl is a template of kstat counters; NOTE(review): it is
 * presumably copied into each zone's client data by the zone init code,
 * which is outside this chunk — confirm there.  The trailing counters
 * exist only on DEBUG kernels.
 */
static const struct clstat4 clstat4_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "referrals",	KSTAT_DATA_UINT64 },
	{ "referlinks",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

#ifdef DEBUG
/*
 * Global (not per-zone) DEBUG counters for client resource usage and
 * reclaim activity (e.g. clreclaim is bumped in clreclaim4_zone()).
 */
struct clstat4_debug clstat4_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif
118 
/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 */
static list_t nfs4_clnt_list;
static kmutex_t nfs4_clnt_list_lock;
zone_key_t nfs4clnt_zone_key;

/* kmem cache for struct chtab client handle entries (see clget4()) */
static struct kmem_cache *chtab4_cache;

#ifdef DEBUG
/* DEBUG-only tunables; consumers are outside this chunk */
static int nfs4_rfscall_debug;
static int nfs4_try_failover_any;
/* non-zero enables the invalid-filename warning in utf8_to_fn() */
int nfs4_utf8_debug = 0;
#endif

/*
 * NFSv4 readdir cache implementation
 */
typedef struct rddir4_cache_impl {
	rddir4_cache	rc;		/* readdir cache element */
	kmutex_t	lock;		/* lock protects count */
	uint_t		count;		/* reference count */
	avl_node_t	tree;		/* AVL tree link */
} rddir4_cache_impl;

static int rddir4_cache_compar(const void *, const void *);
static void rddir4_cache_free(rddir4_cache_impl *);
static rddir4_cache *rddir4_cache_alloc(int);
static void rddir4_cache_hold(rddir4_cache *);
static int try_failover(enum clnt_stat);

/* readdir cache effectiveness counters (updated outside this chunk) */
static int nfs4_readdir_cache_hits = 0;
static int nfs4_readdir_cache_waits = 0;
static int nfs4_readdir_cache_misses = 0;
154 
155 /*
156  * Shared nfs4 functions
157  */
158 
159 /*
160  * Copy an nfs_fh4.  The destination storage (to->nfs_fh4_val) must already
161  * be allocated.
162  */
163 
164 void
165 nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
166 {
167 	to->nfs_fh4_len = from->nfs_fh4_len;
168 	bcopy(from->nfs_fh4_val, to->nfs_fh4_val, to->nfs_fh4_len);
169 }
170 
171 /*
172  * nfs4cmpfh - compare 2 filehandles.
173  * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
174  * "less" than the second, +1 if the first is "greater" than the second.
175  */
176 
177 int
178 nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
179 {
180 	const char *c1, *c2;
181 
182 	if (fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len)
183 		return (-1);
184 	if (fh4p1->nfs_fh4_len > fh4p2->nfs_fh4_len)
185 		return (1);
186 	for (c1 = fh4p1->nfs_fh4_val, c2 = fh4p2->nfs_fh4_val;
187 	    c1 < fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;
188 	    c1++, c2++) {
189 		if (*c1 < *c2)
190 			return (-1);
191 		if (*c1 > *c2)
192 			return (1);
193 	}
194 
195 	return (0);
196 }
197 
198 /*
199  * Compare two v4 filehandles.  Return zero if they're the same, non-zero
200  * if they're not.  Like nfs4cmpfh(), but different filehandle
201  * representation, and doesn't provide information about greater than or
202  * less than.
203  */
204 
205 int
206 nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2)
207 {
208 	if (fh1->fh_len == fh2->fh_len)
209 		return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len));
210 
211 	return (1);
212 }
213 
214 int
215 stateid4_cmp(stateid4 *s1, stateid4 *s2)
216 {
217 	if (bcmp(s1, s2, sizeof (stateid4)) == 0)
218 		return (1);
219 	else
220 		return (0);
221 }
222 
/*
 * puterrno4 - map a local errno to the closest NFSv4 status code.
 * Errnos with no explicit mapping are passed through unchanged, cast
 * to nfsstat4.
 */
nfsstat4
puterrno4(int error)
{
	switch (error) {
	case 0:
		return (NFS4_OK);
	case EPERM:
		return (NFS4ERR_PERM);
	case ENOENT:
		return (NFS4ERR_NOENT);
	case EINTR:
		return (NFS4ERR_IO);
	case EIO:
		return (NFS4ERR_IO);
	case ENXIO:
		return (NFS4ERR_NXIO);
	case ENOMEM:
		return (NFS4ERR_RESOURCE);
	case EACCES:
		return (NFS4ERR_ACCESS);
	case EBUSY:
		return (NFS4ERR_IO);
	case EEXIST:
		return (NFS4ERR_EXIST);
	case EXDEV:
		return (NFS4ERR_XDEV);
	case ENODEV:
		return (NFS4ERR_IO);
	case ENOTDIR:
		return (NFS4ERR_NOTDIR);
	case EISDIR:
		return (NFS4ERR_ISDIR);
	case EINVAL:
		return (NFS4ERR_INVAL);
	case EMFILE:
		return (NFS4ERR_RESOURCE);
	case EFBIG:
		return (NFS4ERR_FBIG);
	case ENOSPC:
		return (NFS4ERR_NOSPC);
	case EROFS:
		return (NFS4ERR_ROFS);
	case EMLINK:
		return (NFS4ERR_MLINK);
	case EDEADLK:
		return (NFS4ERR_DEADLOCK);
	case ENOLCK:
		return (NFS4ERR_DENIED);
	case EREMOTE:
		return (NFS4ERR_SERVERFAULT);
	case ENOTSUP:
		return (NFS4ERR_NOTSUPP);
	case EDQUOT:
		return (NFS4ERR_DQUOT);
	case ENAMETOOLONG:
		return (NFS4ERR_NAMETOOLONG);
	case EOVERFLOW:
		return (NFS4ERR_INVAL);
	case ENOSYS:
		return (NFS4ERR_NOTSUPP);
	case ENOTEMPTY:
		return (NFS4ERR_NOTEMPTY);
	case EOPNOTSUPP:
		return (NFS4ERR_NOTSUPP);
	case ESTALE:
		return (NFS4ERR_STALE);
	case EAGAIN:
		/*
		 * If this thread was flagged as would-block, clear the
		 * flag and tell the client to retry later (DELAY);
		 * otherwise report the lock as held (LOCKED).
		 */
		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (NFS4ERR_DELAY);
		}
		return (NFS4ERR_LOCKED);
	default:
		return ((enum nfsstat4)error);
	}
}
299 
/*
 * geterrno4 - translate an NFSv4 status code into a local errno.
 * Unrecognized status values are passed through unchanged (cast to
 * int) and, on DEBUG kernels, logged so new/unknown codes get noticed.
 */
int
geterrno4(enum nfsstat4 status)
{
	switch (status) {
	case NFS4_OK:
		return (0);
	case NFS4ERR_PERM:
		return (EPERM);
	case NFS4ERR_NOENT:
		return (ENOENT);
	case NFS4ERR_IO:
		return (EIO);
	case NFS4ERR_NXIO:
		return (ENXIO);
	case NFS4ERR_ACCESS:
		return (EACCES);
	case NFS4ERR_EXIST:
		return (EEXIST);
	case NFS4ERR_XDEV:
		return (EXDEV);
	case NFS4ERR_NOTDIR:
		return (ENOTDIR);
	case NFS4ERR_ISDIR:
		return (EISDIR);
	case NFS4ERR_INVAL:
		return (EINVAL);
	case NFS4ERR_FBIG:
		return (EFBIG);
	case NFS4ERR_NOSPC:
		return (ENOSPC);
	case NFS4ERR_ROFS:
		return (EROFS);
	case NFS4ERR_MLINK:
		return (EMLINK);
	case NFS4ERR_NAMETOOLONG:
		return (ENAMETOOLONG);
	case NFS4ERR_NOTEMPTY:
		return (ENOTEMPTY);
	case NFS4ERR_DQUOT:
		return (EDQUOT);
	case NFS4ERR_STALE:
		return (ESTALE);
	case NFS4ERR_BADHANDLE:
		return (ESTALE);
	case NFS4ERR_BAD_COOKIE:
		return (EINVAL);
	case NFS4ERR_NOTSUPP:
		return (EOPNOTSUPP);
	case NFS4ERR_TOOSMALL:
		return (EINVAL);
	case NFS4ERR_SERVERFAULT:
		return (EIO);
	case NFS4ERR_BADTYPE:
		return (EINVAL);
	case NFS4ERR_DELAY:
		return (ENXIO);
	case NFS4ERR_SAME:
		return (EPROTO);
	case NFS4ERR_DENIED:
		return (ENOLCK);
	case NFS4ERR_EXPIRED:
		return (EPROTO);
	case NFS4ERR_LOCKED:
		return (EACCES);
	case NFS4ERR_GRACE:
		return (EAGAIN);
	case NFS4ERR_FHEXPIRED:	/* if got here, failed to get a new fh */
		return (ESTALE);
	case NFS4ERR_SHARE_DENIED:
		return (EACCES);
	case NFS4ERR_WRONGSEC:
		return (EPERM);
	case NFS4ERR_CLID_INUSE:
		return (EAGAIN);
	case NFS4ERR_RESOURCE:
		return (EAGAIN);
	case NFS4ERR_MOVED:
		return (EPROTO);
	case NFS4ERR_NOFILEHANDLE:
		return (EIO);
	case NFS4ERR_MINOR_VERS_MISMATCH:
		return (ENOTSUP);
	case NFS4ERR_STALE_CLIENTID:
		return (EIO);
	case NFS4ERR_STALE_STATEID:
		return (EIO);
	case NFS4ERR_OLD_STATEID:
		return (EIO);
	case NFS4ERR_BAD_STATEID:
		return (EIO);
	case NFS4ERR_BAD_SEQID:
		return (EIO);
	case NFS4ERR_NOT_SAME:
		return (EPROTO);
	case NFS4ERR_LOCK_RANGE:
		return (EPROTO);
	case NFS4ERR_SYMLINK:
		return (EPROTO);
	case NFS4ERR_RESTOREFH:
		return (EPROTO);
	case NFS4ERR_LEASE_MOVED:
		return (EPROTO);
	case NFS4ERR_ATTRNOTSUPP:
		return (ENOTSUP);
	case NFS4ERR_NO_GRACE:
		return (EPROTO);
	case NFS4ERR_RECLAIM_BAD:
		return (EPROTO);
	case NFS4ERR_RECLAIM_CONFLICT:
		return (EPROTO);
	case NFS4ERR_BADXDR:
		return (EINVAL);
	case NFS4ERR_LOCKS_HELD:
		return (EIO);
	case NFS4ERR_OPENMODE:
		return (EACCES);
	case NFS4ERR_BADOWNER:
		/*
		 * Client and server are in different DNS domains
		 * and the NFSMAPID_DOMAIN in /etc/default/nfs
		 * doesn't match.  No good answer here.  Return
		 * EACCESS, which translates to "permission denied".
		 */
		return (EACCES);
	case NFS4ERR_BADCHAR:
		return (EINVAL);
	case NFS4ERR_BADNAME:
		return (EINVAL);
	case NFS4ERR_BAD_RANGE:
		return (EIO);
	case NFS4ERR_LOCK_NOTSUPP:
		return (ENOTSUP);
	case NFS4ERR_OP_ILLEGAL:
		return (EINVAL);
	case NFS4ERR_DEADLOCK:
		return (EDEADLK);
	case NFS4ERR_FILE_OPEN:
		return (EACCES);
	case NFS4ERR_ADMIN_REVOKED:
		return (EPROTO);
	case NFS4ERR_CB_PATH_DOWN:
		return (EPROTO);
	default:
#ifdef DEBUG
		zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
		    status);
#endif
		return ((int)status);
	}
}
450 
/*
 * nfs4_log_badowner - report an NFS4ERR_BADOWNER from the server.
 * Logs a console warning once per client <-> server pair and queues an
 * RF_BADOWNER recovery fact once per mount.  A BADOWNER error typically
 * means the client's NFSMAPID_DOMAIN does not match the server's.
 */
void
nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
{
	nfs4_server_t *server;

	/*
	 * Return if already printed/queued a msg
	 * for this mount point.
	 */
	if (mi->mi_flags & MI4_BADOWNER_DEBUG)
		return;
	/*
	 * Happens once per client <-> server pair.
	 */
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT))
		return;

	/* on success, find_nfs4_server() returns with s_lock held */
	server = find_nfs4_server(mi);
	if (server == NULL) {
		nfs_rw_exit(&mi->mi_recovlock);
		return;
	}

	if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
		    "!NFSMAPID_DOMAIN does not match"
		    " the server: %s domain.\n"
		    "Please check configuration",
		    mi->mi_curr_serv->sv_hostname);
		server->s_flags |= N4S_BADOWNER_DEBUG;
	}
	mutex_exit(&server->s_lock);
	nfs4_server_rele(server);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Happens once per mntinfo4_t.
	 * This error is deemed as one of the recovery facts "RF_BADOWNER",
	 * queue this in the mesg queue for this mount_info. This message
	 * is not printed, meaning its absent from id_to_dump_solo_fact()
	 * but its there for inspection if the queue is ever dumped/inspected.
	 */
	mutex_enter(&mi->mi_lock);
	if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
		nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
		    FALSE, NULL, 0, NULL);
		mi->mi_flags |= MI4_BADOWNER_DEBUG;
	}
	mutex_exit(&mi->mi_lock);
}
502 
503 int
504 nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
505 {
506 	int64_t sec;
507 	int32_t nsec;
508 
509 	/*
510 	 * Here check that the nfsv4 time is valid for the system.
511 	 * nfsv4 time value is a signed 64-bit, and the system time
512 	 * may be either int64_t or int32_t (depends on the kernel),
513 	 * so if the kernel is 32-bit, the nfsv4 time value may not fit.
514 	 */
515 #ifndef _LP64
516 	if (! NFS4_TIME_OK(ntime->seconds)) {
517 		return (EOVERFLOW);
518 	}
519 #endif
520 
521 	/* Invalid to specify 1 billion (or more) nsecs */
522 	if (ntime->nseconds >= 1000000000)
523 		return (EINVAL);
524 
525 	if (ntime->seconds < 0) {
526 		sec = ntime->seconds + 1;
527 		nsec = -1000000000 + ntime->nseconds;
528 	} else {
529 		sec = ntime->seconds;
530 		nsec = ntime->nseconds;
531 	}
532 
533 	vatime->tv_sec = sec;
534 	vatime->tv_nsec = nsec;
535 
536 	return (0);
537 }
538 
539 int
540 nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
541 {
542 	int64_t sec;
543 	uint32_t nsec;
544 
545 	/*
546 	 * nfsv4 time value is a signed 64-bit, and the system time
547 	 * may be either int64_t or int32_t (depends on the kernel),
548 	 * so all system time values will fit.
549 	 */
550 	if (vatime->tv_nsec >= 0) {
551 		sec = vatime->tv_sec;
552 		nsec = vatime->tv_nsec;
553 	} else {
554 		sec = vatime->tv_sec - 1;
555 		nsec = 1000000000 + vatime->tv_nsec;
556 	}
557 	ntime->seconds = sec;
558 	ntime->nseconds = nsec;
559 
560 	return (0);
561 }
562 
563 /*
564  * Converts a utf8 string to a valid null terminated filename string.
565  *
566  * XXX - Not actually translating the UTF-8 string as per RFC 2279.
567  *	 For now, just validate that the UTF-8 string off the wire
568  *	 does not have characters that will freak out UFS, and leave
569  *	 it at that.
570  */
char *
utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
{
	/*
	 * u8s:  source string off the wire (not NUL-terminated)
	 * lenp: out-parameter; set by utf8_to_str() to result length + 1
	 * s:    optional caller-supplied result buffer; when NULL, one
	 *       is kmem_alloc'ed in utf8_to_str()
	 * Returns NULL when the string is missing/empty or contains '/'.
	 */
	ASSERT(lenp != NULL);

	if (u8s == NULL || u8s->utf8string_len <= 0 ||
	    u8s->utf8string_val == NULL)
		return (NULL);

	/*
	 * Check for obvious illegal filename chars
	 */
	if (utf8_strchr(u8s, '/') != NULL) {
#ifdef DEBUG
		/* On DEBUG kernels, optionally log the rejected name. */
		if (nfs4_utf8_debug) {
			char *path;
			int len = u8s->utf8string_len;

			path = kmem_alloc(len + 1, KM_SLEEP);
			bcopy(u8s->utf8string_val, path, len);
			path[len] = '\0';

			zcmn_err(getzoneid(), CE_WARN,
			    "Invalid UTF-8 filename: %s", path);

			kmem_free(path, len + 1);
		}
#endif
		return (NULL);
	}

	return (utf8_to_str(u8s, lenp, s));
}
604 
605 /*
606  * Converts a utf8 string to a C string.
607  * kmem_allocs a new string if not supplied
608  */
609 char *
610 utf8_to_str(utf8string *str, uint_t *lenp, char *s)
611 {
612 	char	*sp;
613 	char	*u8p;
614 	int	len;
615 	int	 i;
616 
617 	ASSERT(lenp != NULL);
618 
619 	if (str == NULL)
620 		return (NULL);
621 
622 	u8p = str->utf8string_val;
623 	len = str->utf8string_len;
624 	if (len <= 0 || u8p == NULL) {
625 		if (s)
626 			*s = '\0';
627 		return (NULL);
628 	}
629 
630 	sp = s;
631 	if (sp == NULL)
632 		sp = kmem_alloc(len + 1, KM_SLEEP);
633 
634 	/*
635 	 * At least check for embedded nulls
636 	 */
637 	for (i = 0; i < len; i++) {
638 		sp[i] = u8p[i];
639 		if (u8p[i] == '\0') {
640 #ifdef	DEBUG
641 			zcmn_err(getzoneid(), CE_WARN,
642 			    "Embedded NULL in UTF-8 string");
643 #endif
644 			if (s == NULL)
645 				kmem_free(sp, len + 1);
646 			return (NULL);
647 		}
648 	}
649 	sp[len] = '\0';
650 	*lenp = len + 1;
651 
652 	return (sp);
653 }
654 
655 /*
656  * str_to_utf8 - converts a null-terminated C string to a utf8 string
657  */
658 utf8string *
659 str_to_utf8(char *nm, utf8string *str)
660 {
661 	int len;
662 
663 	if (str == NULL)
664 		return (NULL);
665 
666 	if (nm == NULL || *nm == '\0') {
667 		str->utf8string_len = 0;
668 		str->utf8string_val = NULL;
669 	}
670 
671 	len = strlen(nm);
672 
673 	str->utf8string_val = kmem_alloc(len, KM_SLEEP);
674 	str->utf8string_len = len;
675 	bcopy(nm, str->utf8string_val, len);
676 
677 	return (str);
678 }
679 
680 utf8string *
681 utf8_copy(utf8string *src, utf8string *dest)
682 {
683 	if (src == NULL)
684 		return (NULL);
685 	if (dest == NULL)
686 		return (NULL);
687 
688 	if (src->utf8string_len > 0) {
689 		dest->utf8string_val = kmem_alloc(src->utf8string_len,
690 		    KM_SLEEP);
691 		bcopy(src->utf8string_val, dest->utf8string_val,
692 		    src->utf8string_len);
693 		dest->utf8string_len = src->utf8string_len;
694 	} else {
695 		dest->utf8string_val = NULL;
696 		dest->utf8string_len = 0;
697 	}
698 
699 	return (dest);
700 }
701 
702 int
703 utf8_compare(const utf8string *a, const utf8string *b)
704 {
705 	int mlen, cmp;
706 	int alen, blen;
707 	char *aval, *bval;
708 
709 	if ((a == NULL) && (b == NULL))
710 		return (0);
711 	else if (a == NULL)
712 		return (-1);
713 	else if (b == NULL)
714 		return (1);
715 
716 	alen = a->utf8string_len;
717 	blen = b->utf8string_len;
718 	aval = a->utf8string_val;
719 	bval = b->utf8string_val;
720 
721 	if (((alen == 0) || (aval == NULL)) &&
722 	    ((blen == 0) || (bval == NULL)))
723 		return (0);
724 	else if ((alen == 0) || (aval == NULL))
725 		return (-1);
726 	else if ((blen == 0) || (bval == NULL))
727 		return (1);
728 
729 	mlen = MIN(alen, blen);
730 	cmp = strncmp(aval, bval, mlen);
731 
732 	if ((cmp == 0) && (alen == blen))
733 		return (0);
734 	else if ((cmp == 0) && (alen < blen))
735 		return (-1);
736 	else if (cmp == 0)
737 		return (1);
738 	else if (cmp < 0)
739 		return (-1);
740 	return (1);
741 }
742 
743 /*
744  * utf8_name_verify - verify utf8-correctness of the passed string.
745  *
746  * Byte's checking is performed by applying and-mask to byte and checking
747  * result of this operation (signature).
748  * ~mask used to extract valuable bits from byte that will be put in 'symbol'
749  * that represents encoded unicode character.
750  *
751  * Symbols encoded with UTF8 have following format:
752  * 0xxxxxxx						  - 1 byte symbol
753  * 110xxxxx 10xxxxxx					  - 2 bytes
754  * 1110xxxx 10xxxxxx 10xxxxxx				  - 3 bytes
755  * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx			  - 4 bytes
756  * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	  - 5 bytes
757  * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 6 bytes
758  */
/*
 * Returns NFS4_OK when every character in 'str' is well-formed UTF-8,
 * NFS4ERR_INVAL otherwise.
 */
nfsstat4
utf8_name_verify(utf8string *str)
{
	int len = str->utf8string_len;
	unsigned char *u8p = (unsigned char *) str->utf8string_val;
	int pos = 0;

	/* decode one encoded character per iteration */
	while (pos < len) {
		unsigned char c = u8p[pos++];
		int  i;
		unsigned int symbol;
		utf8_encoding_table * encoding = utf8_table;

		/* check leading byte */
		while (encoding->mask != 0x00) {
			if ((c & encoding->mask) == encoding->signature)
				break;
			++encoding;
		}
		/* hit the table terminator: no legal encoding matched */
		if (encoding->mask == 0x00)
			return (NFS4ERR_INVAL);

		/* the payload bits of the leading byte seed the code point */
		symbol = c & (~encoding->mask);

		/* check tail bytes if leading byte describes so */
		for (i = 0; i < encoding->tail_bytes; ++i) {
			if (pos >= len)
				return (NFS4ERR_INVAL);
			c = u8p[pos++];
			if ((c & UTF8_TAIL_MASK) != UTF8_TAIL_SIGNATURE)
				return (NFS4ERR_INVAL);
			/* fold 6 payload bits of each tail byte into symbol */
			symbol <<= UTF8_TAIL_SHIFT;
			symbol |= (c & (~UTF8_TAIL_MASK));
		}

		/* check UTF-16 surrogate */
		if ((symbol >= UTF16_SURROGATE_LOW) &&
		    (symbol <= UTF16_SURROGATE_HIGH))
			return (NFS4ERR_INVAL);

		/* check wrong Unicode character case */
		if ((symbol == UNICODE_INVAL_1) || (symbol == UNICODE_INVAL_2))
			return (NFS4ERR_INVAL);

		/* check overlonging */
		if (symbol < encoding->min_val)
			return (NFS4ERR_INVAL);
	}

	return (NFS4_OK);
}
810 
811 /*
812  * utf8_dir_verify - checks that the utf8 string is valid
813  */
814 nfsstat4
815 utf8_dir_verify(utf8string *str)
816 {
817 	char *nm;
818 	int len;
819 
820 	if (str == NULL)
821 		return (NFS4ERR_INVAL);
822 
823 	nm = str->utf8string_val;
824 	len = str->utf8string_len;
825 	if (nm == NULL || len == 0) {
826 		return (NFS4ERR_INVAL);
827 	}
828 
829 	if (len == 1 && nm[0] == '.')
830 		return (NFS4ERR_BADNAME);
831 	if (len == 2 && nm[0] == '.' && nm[1] == '.')
832 		return (NFS4ERR_BADNAME);
833 
834 	if (utf8_strchr(str, '/') != NULL)
835 		return (NFS4ERR_BADNAME);
836 
837 	if (utf8_strchr(str, '\0') != NULL)
838 		return (NFS4ERR_BADNAME);
839 
840 	return (utf8_name_verify(str));
841 }
842 
843 /*
844  * from rpcsec module (common/rpcsec)
845  */
846 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
847 extern void sec_clnt_freeh(AUTH *);
848 extern void sec_clnt_freeinfo(struct sec_data *);
849 
850 /*
851  * authget() gets an auth handle based on the security
852  * information from the servinfo in mountinfo.
853  * The auth handle is stored in ch_client->cl_auth.
854  *
855  * First security flavor of choice is to use sv_secdata
856  * which is initiated by the client. If that fails, get
857  * secinfo from the server and then select one from the
858  * server secinfo list .
859  *
860  * For RPCSEC_GSS flavor, upon success, a secure context is
861  * established between client and server.
862  */
863 int
864 authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
865 {
866 	int error, i;
867 
868 	/*
869 	 * SV4_TRYSECINFO indicates to try the secinfo list from
870 	 * sv_secinfo until a successful one is reached. Point
871 	 * sv_currsec to the selected security mechanism for
872 	 * later sessions.
873 	 */
874 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
875 	if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
876 		for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
877 		    i++) {
878 			if (!(error = sec_clnt_geth(ch_client,
879 			    &svp->sv_secinfo->sdata[i],
880 			    cr, &ch_client->cl_auth))) {
881 
882 				svp->sv_currsec = &svp->sv_secinfo->sdata[i];
883 				svp->sv_secinfo->index = i;
884 				/* done */
885 				svp->sv_flags &= ~SV4_TRYSECINFO;
886 				break;
887 			}
888 
889 			/*
890 			 * Allow the caller retry with the security flavor
891 			 * pointed by svp->sv_secinfo->index when
892 			 * ETIMEDOUT/ECONNRESET occurs.
893 			 */
894 			if (error == ETIMEDOUT || error == ECONNRESET) {
895 				svp->sv_secinfo->index = i;
896 				break;
897 			}
898 		}
899 	} else {
900 		/* sv_currsec points to one of the entries in sv_secinfo */
901 		if (svp->sv_currsec) {
902 			error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
903 			    &ch_client->cl_auth);
904 		} else {
905 			/* If it's null, use sv_secdata. */
906 			error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
907 			    &ch_client->cl_auth);
908 		}
909 	}
910 	nfs_rw_exit(&svp->sv_lock);
911 
912 	return (error);
913 }
914 
/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 *
 * Looks up (or creates) the chhead cache list keyed by RPC program,
 * version, transport device, and protocol family, then reuses a cached
 * client handle if one is free, otherwise allocates a new one.  On
 * success *newcl/*chp are set and 0 is returned; the caller releases
 * the handle with clfree4().  On failure an errno is returned (EINTR
 * when authget() produced no auth handle without reporting an error).
 */
int
clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs4_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable4_lock);
	plistp = &nfscl->nfscl_chtable4;
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			mutex_exit(&nfscl->nfscl_chtable4_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable4) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable4_lock);
		/* discard the preallocated chhead another thread beat us to */
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);

		/*
		 * Get an auth handle.
		 */
		error = authget(svp, cp->ch_client, cr);
		if (error || cp->ch_client->cl_auth == NULL) {
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab4_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
#endif
	mutex_exit(&nfscl->nfscl_chtable4_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
	cp->ch_head = ch;

	/* allow signals to interrupt the create on interruptible mounts */
	sigintr(&smask, (int)ci->cl_flags & MI4_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	/* drop the auth handle kcreate installed; authget() supplies ours */
	auth_destroy(cp->ch_client->cl_auth);

	/*
	 * Get an auth handle.
	 */
	error = authget(svp, cp->ch_client, cr);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}
1085 
/*
 * nfs_clget4 - mount-aware wrapper around clget4().  Builds a clinfo_t
 * from the mntinfo4_t and retries on ETIMEDOUT/ECONNRESET for hard
 * mounts, subject to forced-unmount/zone-shutdown and failover
 * constraints.  Returns 0 with *newcl/*chp set, or an errno.
 */
static int
nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs4_clnt *nfscl)
{
	clinfo_t ci;
	bool_t is_recov;
	int firstcall, error = 0;

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If soft mount and server is down just try once.
	 * meaning: do not retransmit.
	 */
	if (!(mi->mi_flags & MI4_HARD) && (mi->mi_flags & MI4_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = mi->mi_prog;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server. If the connection is timed out or reset,
	 * e.g. server reboot, we will try again.
	 */
	is_recov = (curthread == mi->mi_recovthread);
	firstcall = 1;

	do {
		error = clget4(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount and zone shutdown, bail out but
		 * let the recovery thread do one more transmission.
		 */
		if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
		    (!is_recov || !firstcall)) {
			error = EIO;
			break;
		}

		/* do not retry for soft mount */
		if (!(mi->mi_flags & MI4_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT4(mi))
			break;

		firstcall = 0;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}
1154 
/*
 * clfree4 - return a client handle obtained from clget4() to its cache
 * list.  Releases the auth handle (if any), timestamps the entry, and
 * pushes it on the front of its chhead list so lists stay ordered
 * youngest-first (the reclaim code depends on that ordering).
 */
void
clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
{
	if (cl->cl_auth != NULL) {
		sec_clnt_freeh(cl->cl_auth);
		cl->cl_auth = NULL;
	}

	/*
	 * Timestamp this cache entry so that we know when it was last
	 * used.
	 */
	cp->ch_freed = gethrestime_sec();

	/*
	 * Add the free client handle to the front of the list.
	 * This way, the list will be sorted in youngest to oldest
	 * order.
	 */
	mutex_enter(&nfscl->nfscl_chtable4_lock);
	cp->ch_list = cp->ch_head->ch_list;
	cp->ch_head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable4_lock);
}
1179 
1180 #define	CL_HOLDTIME	60	/* time to hold client handles */
1181 
/*
 * Reclaim cached client handles from one zone's cache (nfscl) that have
 * sat unused for at least cl_holdtime seconds: unlink them from the
 * cache under the lock, then destroy and free them outside the lock.
 */
static void
clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp;	/* list of objects that can be reclaimed */
	struct chtab *cpe;
	struct chtab *cpl;
	struct chtab **cpp;
#ifdef DEBUG
	int n = 0;
	clstat4_debug.clreclaim.value.ui64++;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable4_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.  When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older then
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry is found which is old enough, then
		 * all of the rest of the entries on the list will
		 * be old enough as well.
		 */
		cpl = ch->ch_list;
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			/*
			 * Detach the old tail from this head's list and
			 * append any previously collected entries (cp)
			 * behind it, keeping cp as a single chain.
			 */
			*cpp = NULL;
			if (cp != NULL) {
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable4_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		cpl = cp->ch_list;
		kmem_cache_free(chtab4_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}
1271 
1272 /* ARGSUSED */
1273 static void
1274 clreclaim4(void *all)
1275 {
1276 	struct nfs4_clnt *nfscl;
1277 
1278 	/*
1279 	 * The system is low on memory; go through and try to reclaim some from
1280 	 * every zone on the system.
1281 	 */
1282 	mutex_enter(&nfs4_clnt_list_lock);
1283 	nfscl = list_head(&nfs4_clnt_list);
1284 	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl))
1285 		clreclaim4_zone(nfscl, CL_HOLDTIME);
1286 	mutex_exit(&nfs4_clnt_list_lock);
1287 }
1288 
1289 /*
1290  * Minimum time-out values indexed by call type
1291  * These units are in "eights" of a second to avoid multiplies
1292  */
static unsigned int minimum_timeo[] = {
	6, 7, 10	/* NOTE(review): indexed by call type per comment above; confirm mapping at call sites */
};
1296 
1297 #define	SHORTWAIT	(NFS_COTS_TIMEO / 10)
1298 
1299 /*
1300  * Back off for retransmission timeout, MAXTIMO is in hz of a sec
1301  */
1302 #define	MAXTIMO	(20*hz)
1303 #define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
1304 #define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
1305 
1306 static int
1307 nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1308     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
1309     enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
1310 {
1311 	CLIENT *client;
1312 	struct chtab *ch;
1313 	cred_t *cr = icr;
1314 	struct rpc_err rpcerr, rpcerr_tmp;
1315 	enum clnt_stat status;
1316 	int error;
1317 	struct timeval wait;
1318 	int timeo;		/* in units of hz */
1319 	bool_t tryagain, is_recov;
1320 	bool_t cred_cloned = FALSE;
1321 	k_sigset_t smask;
1322 	servinfo4_t *svp;
1323 #ifdef DEBUG
1324 	char *bufp;
1325 #endif
1326 	int firstcall;
1327 
1328 	rpcerr.re_status = RPC_SUCCESS;
1329 
1330 	/*
1331 	 * If we know that we are rebooting then let's
1332 	 * not bother with doing any over the wireness.
1333 	 */
1334 	mutex_enter(&mi->mi_lock);
1335 	if (mi->mi_flags & MI4_SHUTDOWN) {
1336 		mutex_exit(&mi->mi_lock);
1337 		return (EIO);
1338 	}
1339 	mutex_exit(&mi->mi_lock);
1340 
1341 	/* For TSOL, use a new cred which has net_mac_aware flag */
1342 	if (!cred_cloned && is_system_labeled()) {
1343 		cred_cloned = TRUE;
1344 		cr = crdup(icr);
1345 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1346 	}
1347 
1348 	/*
1349 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1350 	 * are guaranteed to reprocess the retry as a new request.
1351 	 */
1352 	svp = mi->mi_curr_serv;
1353 	rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
1354 	if (rpcerr.re_errno != 0)
1355 		return (rpcerr.re_errno);
1356 
1357 	timeo = (mi->mi_timeo * hz) / 10;
1358 
1359 	/*
1360 	 * If hard mounted fs, retry call forever unless hard error
1361 	 * occurs.
1362 	 *
1363 	 * For forced unmount, let the recovery thread through but return
1364 	 * an error for all others.  This is so that user processes can
1365 	 * exit quickly.  The recovery thread bails out after one
1366 	 * transmission so that it can tell if it needs to continue.
1367 	 *
1368 	 * For zone shutdown, behave as above to encourage quick
1369 	 * process exit, but also fail quickly when servers have
1370 	 * timed out before and reduce the timeouts.
1371 	 */
1372 	is_recov = (curthread == mi->mi_recovthread);
1373 	firstcall = 1;
1374 	do {
1375 		tryagain = FALSE;
1376 
1377 		NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
1378 		    "nfs4_rfscall: vfs_flag=0x%x, %s",
1379 		    mi->mi_vfsp->vfs_flag,
1380 		    is_recov ? "recov thread" : "not recov thread"));
1381 
1382 		/*
1383 		 * It's possible while we're retrying the admin
1384 		 * decided to reboot.
1385 		 */
1386 		mutex_enter(&mi->mi_lock);
1387 		if (mi->mi_flags & MI4_SHUTDOWN) {
1388 			mutex_exit(&mi->mi_lock);
1389 			clfree4(client, ch, nfscl);
1390 			if (cred_cloned)
1391 				crfree(cr);
1392 			return (EIO);
1393 		}
1394 		mutex_exit(&mi->mi_lock);
1395 
1396 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1397 		    (!is_recov || !firstcall)) {
1398 			clfree4(client, ch, nfscl);
1399 			if (cred_cloned)
1400 				crfree(cr);
1401 			return (EIO);
1402 		}
1403 
1404 		if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
1405 			mutex_enter(&mi->mi_lock);
1406 			if ((mi->mi_flags & MI4_TIMEDOUT) ||
1407 			    !is_recov || !firstcall) {
1408 				mutex_exit(&mi->mi_lock);
1409 				clfree4(client, ch, nfscl);
1410 				if (cred_cloned)
1411 					crfree(cr);
1412 				return (EIO);
1413 			}
1414 			mutex_exit(&mi->mi_lock);
1415 			timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
1416 		}
1417 
1418 		firstcall = 0;
1419 		TICK_TO_TIMEVAL(timeo, &wait);
1420 
1421 		/*
1422 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1423 		 * and SIGTERM. (Preserving the existing masks).
1424 		 * Mask out SIGINT if mount option nointr is specified.
1425 		 */
1426 		sigintr(&smask, (int)mi->mi_flags & MI4_INT);
1427 		if (!(mi->mi_flags & MI4_INT))
1428 			client->cl_nosignal = TRUE;
1429 
1430 		/*
1431 		 * If there is a current signal, then don't bother
1432 		 * even trying to send out the request because we
1433 		 * won't be able to block waiting for the response.
1434 		 * Simply assume RPC_INTR and get on with it.
1435 		 */
1436 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1437 			status = RPC_INTR;
1438 		else {
1439 			status = CLNT_CALL(client, which, xdrargs, argsp,
1440 			    xdrres, resp, wait);
1441 		}
1442 
1443 		if (!(mi->mi_flags & MI4_INT))
1444 			client->cl_nosignal = FALSE;
1445 		/*
1446 		 * restore original signal mask
1447 		 */
1448 		sigunintr(&smask);
1449 
1450 		switch (status) {
1451 		case RPC_SUCCESS:
1452 			break;
1453 
1454 		case RPC_INTR:
1455 			/*
1456 			 * There is no way to recover from this error,
1457 			 * even if mount option nointr is specified.
1458 			 * SIGKILL, for example, cannot be blocked.
1459 			 */
1460 			rpcerr.re_status = RPC_INTR;
1461 			rpcerr.re_errno = EINTR;
1462 			break;
1463 
1464 		case RPC_UDERROR:
1465 			/*
1466 			 * If the NFS server is local (vold) and
1467 			 * it goes away then we get RPC_UDERROR.
1468 			 * This is a retryable error, so we would
1469 			 * loop, so check to see if the specific
1470 			 * error was ECONNRESET, indicating that
1471 			 * target did not exist at all.  If so,
1472 			 * return with RPC_PROGUNAVAIL and
1473 			 * ECONNRESET to indicate why.
1474 			 */
1475 			CLNT_GETERR(client, &rpcerr);
1476 			if (rpcerr.re_errno == ECONNRESET) {
1477 				rpcerr.re_status = RPC_PROGUNAVAIL;
1478 				rpcerr.re_errno = ECONNRESET;
1479 				break;
1480 			}
1481 			/*FALLTHROUGH*/
1482 
1483 		default:		/* probably RPC_TIMEDOUT */
1484 
1485 			if (IS_UNRECOVERABLE_RPC(status))
1486 				break;
1487 
1488 			/*
1489 			 * increment server not responding count
1490 			 */
1491 			mutex_enter(&mi->mi_lock);
1492 			mi->mi_noresponse++;
1493 			mutex_exit(&mi->mi_lock);
1494 #ifdef DEBUG
1495 			nfscl->nfscl_stat.noresponse.value.ui64++;
1496 #endif
1497 			/*
1498 			 * On zone shutdown, mark server dead and move on.
1499 			 */
1500 			if (zone_status_get(curproc->p_zone) >=
1501 			    ZONE_IS_SHUTTING_DOWN) {
1502 				mutex_enter(&mi->mi_lock);
1503 				mi->mi_flags |= MI4_TIMEDOUT;
1504 				mutex_exit(&mi->mi_lock);
1505 				clfree4(client, ch, nfscl);
1506 				if (cred_cloned)
1507 					crfree(cr);
1508 				return (EIO);
1509 			}
1510 
1511 			/*
1512 			 * NFS client failover support:
1513 			 * return and let the caller take care of
1514 			 * failover.  We only return for failover mounts
1515 			 * because otherwise we want the "not responding"
1516 			 * message, the timer updates, etc.
1517 			 */
1518 			if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
1519 			    (error = try_failover(status)) != 0) {
1520 				clfree4(client, ch, nfscl);
1521 				if (cred_cloned)
1522 					crfree(cr);
1523 				*rpc_statusp = status;
1524 				return (error);
1525 			}
1526 
1527 			if (flags & RFSCALL_SOFT)
1528 				break;
1529 
1530 			tryagain = TRUE;
1531 
1532 			/*
1533 			 * The call is in progress (over COTS).
1534 			 * Try the CLNT_CALL again, but don't
1535 			 * print a noisy error message.
1536 			 */
1537 			if (status == RPC_INPROGRESS)
1538 				break;
1539 
1540 			timeo = backoff(timeo);
1541 			CLNT_GETERR(client, &rpcerr_tmp);
1542 
1543 			mutex_enter(&mi->mi_lock);
1544 			if (!(mi->mi_flags & MI4_PRINTED)) {
1545 				mi->mi_flags |= MI4_PRINTED;
1546 				mutex_exit(&mi->mi_lock);
1547 				if ((status == RPC_CANTSEND) &&
1548 				    (rpcerr_tmp.re_errno == ENOBUFS))
1549 					nfs4_queue_fact(RF_SENDQ_FULL, mi, 0,
1550 					    0, 0, FALSE, NULL, 0, NULL);
1551 				else
1552 					nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
1553 					    0, 0, 0, FALSE, NULL, 0, NULL);
1554 			} else
1555 				mutex_exit(&mi->mi_lock);
1556 
1557 			if (*doqueue && nfs_has_ctty()) {
1558 				*doqueue = 0;
1559 				if (!(mi->mi_flags & MI4_NOPRINT)) {
1560 					if ((status == RPC_CANTSEND) &&
1561 					    (rpcerr_tmp.re_errno == ENOBUFS))
1562 						nfs4_queue_fact(RF_SENDQ_FULL,
1563 						    mi, 0, 0, 0, FALSE, NULL,
1564 						    0, NULL);
1565 					else
1566 						nfs4_queue_fact(
1567 						    RF_SRV_NOT_RESPOND, mi, 0,
1568 						    0, 0, FALSE, NULL, 0, NULL);
1569 				}
1570 			}
1571 		}
1572 	} while (tryagain);
1573 
1574 	DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
1575 	    int, rpcerr.re_errno);
1576 
1577 	if (status != RPC_SUCCESS) {
1578 		zoneid_t zoneid = mi->mi_zone->zone_id;
1579 
1580 		/*
1581 		 * Let soft mounts use the timed out message.
1582 		 */
1583 		if (status == RPC_INPROGRESS)
1584 			status = RPC_TIMEDOUT;
1585 		nfscl->nfscl_stat.badcalls.value.ui64++;
1586 		if (status != RPC_INTR) {
1587 			mutex_enter(&mi->mi_lock);
1588 			mi->mi_flags |= MI4_DOWN;
1589 			mutex_exit(&mi->mi_lock);
1590 			CLNT_GETERR(client, &rpcerr);
1591 #ifdef DEBUG
1592 			bufp = clnt_sperror(client, svp->sv_hostname);
1593 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1594 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1595 			if (nfs_has_ctty()) {
1596 				if (!(mi->mi_flags & MI4_NOPRINT)) {
1597 					uprintf("NFS%d %s failed for %s\n",
1598 					    mi->mi_vers, mi->mi_rfsnames[which],
1599 					    bufp);
1600 				}
1601 			}
1602 			kmem_free(bufp, MAXPATHLEN);
1603 #else
1604 			zprintf(zoneid,
1605 			    "NFS %s failed for server %s: error %d (%s)\n",
1606 			    mi->mi_rfsnames[which], svp->sv_hostname,
1607 			    status, clnt_sperrno(status));
1608 			if (nfs_has_ctty()) {
1609 				if (!(mi->mi_flags & MI4_NOPRINT)) {
1610 					uprintf(
1611 				"NFS %s failed for server %s: error %d (%s)\n",
1612 					    mi->mi_rfsnames[which],
1613 					    svp->sv_hostname, status,
1614 					    clnt_sperrno(status));
1615 				}
1616 			}
1617 #endif
1618 			/*
1619 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1620 			 * re_errno is set appropriately depending on
1621 			 * the authentication error
1622 			 */
1623 			if (status == RPC_VERSMISMATCH ||
1624 			    status == RPC_PROGVERSMISMATCH)
1625 				rpcerr.re_errno = EIO;
1626 		}
1627 	} else {
1628 		/*
1629 		 * Test the value of mi_down and mi_printed without
1630 		 * holding the mi_lock mutex.  If they are both zero,
1631 		 * then it is okay to skip the down and printed
1632 		 * processing.  This saves on a mutex_enter and
1633 		 * mutex_exit pair for a normal, successful RPC.
1634 		 * This was just complete overhead.
1635 		 */
1636 		if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
1637 			mutex_enter(&mi->mi_lock);
1638 			mi->mi_flags &= ~MI4_DOWN;
1639 			if (mi->mi_flags & MI4_PRINTED) {
1640 				mi->mi_flags &= ~MI4_PRINTED;
1641 				mutex_exit(&mi->mi_lock);
1642 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1643 					nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
1644 					    0, FALSE, NULL, 0, NULL);
1645 			} else
1646 				mutex_exit(&mi->mi_lock);
1647 		}
1648 
1649 		if (*doqueue == 0) {
1650 			if (!(mi->mi_flags & MI4_NOPRINT) &&
1651 			    !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1652 				nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
1653 				    FALSE, NULL, 0, NULL);
1654 
1655 			*doqueue = 1;
1656 		}
1657 	}
1658 
1659 	clfree4(client, ch, nfscl);
1660 	if (cred_cloned)
1661 		crfree(cr);
1662 
1663 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1664 
1665 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
1666 	    rpcerr.re_errno);
1667 
1668 	*rpc_statusp = status;
1669 	return (rpcerr.re_errno);
1670 }
1671 
1672 /*
1673  * rfs4call - general wrapper for RPC calls initiated by the client
1674  */
1675 void
1676 rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
1677     cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
1678 {
1679 	int i, error;
1680 	enum clnt_stat rpc_status = NFS4_OK;
1681 	int num_resops;
1682 	struct nfs4_clnt *nfscl;
1683 
1684 	ASSERT(nfs_zone() == mi->mi_zone);
1685 	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1686 	ASSERT(nfscl != NULL);
1687 
1688 	nfscl->nfscl_stat.calls.value.ui64++;
1689 	mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;
1690 
1691 	/* Set up the results struct for XDR usage */
1692 	resp->argsp = argsp;
1693 	resp->array = NULL;
1694 	resp->status = 0;
1695 	resp->decode_len = 0;
1696 
1697 	error = nfs4_rfscall(mi, NFSPROC4_COMPOUND,
1698 	    xdr_COMPOUND4args_clnt, (caddr_t)argsp,
1699 	    xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
1700 	    doqueue, &rpc_status, flags, nfscl);
1701 
1702 	/* Return now if it was an RPC error */
1703 	if (error) {
1704 		ep->error = error;
1705 		ep->stat = resp->status;
1706 		ep->rpc_status = rpc_status;
1707 		return;
1708 	}
1709 
1710 	/* else we'll count the processed operations */
1711 	num_resops = resp->decode_len;
1712 	for (i = 0; i < num_resops; i++) {
1713 		/*
1714 		 * Count the individual operations
1715 		 * processed by the server.
1716 		 */
1717 		if (resp->array[i].resop >= NFSPROC4_NULL &&
1718 		    resp->array[i].resop <= OP_WRITE)
1719 			mi->mi_reqs[resp->array[i].resop].value.ui64++;
1720 	}
1721 
1722 	ep->error = 0;
1723 	ep->stat = resp->status;
1724 	ep->rpc_status = rpc_status;
1725 }
1726 
1727 /*
1728  * nfs4rename_update - updates stored state after a rename.  Currently this
1729  * is the path of the object and anything under it, and the filehandle of
1730  * the renamed object.
1731  */
void
nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
{
	/* Point the renamed object's shared filehandle at the new fh. */
	sfh4_update(VTOR4(renvp)->r_fh, nfh4p);
	/*
	 * Re-parent the stored name under the new directory with the
	 * new name nnm (anything beneath it moves along with it).
	 */
	fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
}
1738 
1739 /*
1740  * Routine to look up the filehandle for the given path and rootvp.
1741  *
1742  * Return values:
1743  * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
1744  *   updated.
1745  * - error: return value (errno value) and/or *statp is set appropriately.
1746  */
1747 #define	RML_ORDINARY	1
1748 #define	RML_NAMED_ATTR	2
1749 #define	RML_ATTRDIR	3
1750 
1751 static void
1752 remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
1753     int filetype, cred_t *cr,
1754     nfs_fh4 *fhp, nfs4_ga_res_t *garp,		/* fh, attrs for object */
1755     nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp,	/* fh, attrs for parent */
1756     nfs4_error_t *ep)
1757 {
1758 	COMPOUND4args_clnt args;
1759 	COMPOUND4res_clnt res;
1760 	nfs_argop4 *argop;
1761 	nfs_resop4 *resop;
1762 	int num_argops;
1763 	lookup4_param_t lookuparg;
1764 	nfs_fh4 *tmpfhp;
1765 	int doqueue = 1;
1766 	char *path;
1767 	mntinfo4_t *mi;
1768 
1769 	ASSERT(fname != NULL);
1770 	ASSERT(rootvp->v_type == VDIR);
1771 
1772 	mi = VTOMI4(rootvp);
1773 	path = fn_path(fname);
1774 	switch (filetype) {
1775 	case RML_NAMED_ATTR:
1776 		lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
1777 		args.ctag = TAG_REMAP_LOOKUP_NA;
1778 		break;
1779 	case RML_ATTRDIR:
1780 		lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
1781 		args.ctag = TAG_REMAP_LOOKUP_AD;
1782 		break;
1783 	case RML_ORDINARY:
1784 		lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1785 		args.ctag = TAG_REMAP_LOOKUP;
1786 		break;
1787 	default:
1788 		ep->error = EINVAL;
1789 		return;
1790 	}
1791 	lookuparg.argsp = &args;
1792 	lookuparg.resp = &res;
1793 	lookuparg.header_len = 1;	/* Putfh */
1794 	lookuparg.trailer_len = 0;
1795 	lookuparg.ga_bits = NFS4_VATTR_MASK;
1796 	lookuparg.mi = VTOMI4(rootvp);
1797 
1798 	(void) nfs4lookup_setup(path, &lookuparg, 1);
1799 
1800 	/* 0: putfh directory */
1801 	argop = args.array;
1802 	argop[0].argop = OP_CPUTFH;
1803 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;
1804 
1805 	num_argops = args.array_len;
1806 
1807 	rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1808 
1809 	if (ep->error || res.status != NFS4_OK)
1810 		goto exit;
1811 
1812 	/* get the object filehandle */
1813 	resop = &res.array[res.array_len - 2];
1814 	if (resop->resop != OP_GETFH) {
1815 		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1816 		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1817 		ep->stat = NFS4ERR_SERVERFAULT;
1818 		goto exit;
1819 	}
1820 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1821 	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1822 		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1823 		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1824 		    TAG_NONE, 0, 0);
1825 		ep->stat = NFS4ERR_SERVERFAULT;
1826 		goto exit;
1827 	}
1828 	fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1829 	nfs_fh4_copy(tmpfhp, fhp);
1830 
1831 	/* get the object attributes */
1832 	resop = &res.array[res.array_len - 1];
1833 	if (garp && resop->resop == OP_GETATTR)
1834 		*garp = resop->nfs_resop4_u.opgetattr.ga_res;
1835 
1836 	/* See if there are enough fields in the response for parent info */
1837 	if ((int)res.array_len - 5 <= 0)
1838 		goto exit;
1839 
1840 	/* get the parent filehandle */
1841 	resop = &res.array[res.array_len - 5];
1842 	if (resop->resop != OP_GETFH) {
1843 		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1844 		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1845 		ep->stat = NFS4ERR_SERVERFAULT;
1846 		goto exit;
1847 	}
1848 	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1849 	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1850 		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1851 		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1852 		    TAG_NONE, 0, 0);
1853 		ep->stat = NFS4ERR_SERVERFAULT;
1854 		goto exit;
1855 	}
1856 	pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1857 	nfs_fh4_copy(tmpfhp, pfhp);
1858 
1859 	/* get the parent attributes */
1860 	resop = &res.array[res.array_len - 4];
1861 	if (pgarp && resop->resop == OP_GETATTR)
1862 		*pgarp = resop->nfs_resop4_u.opgetattr.ga_res;
1863 
1864 exit:
1865 	/*
1866 	 * It is too hard to remember where all the OP_LOOKUPs are
1867 	 */
1868 	nfs4args_lookup_free(argop, num_argops);
1869 	kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1870 
1871 	if (!ep->error)
1872 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1873 	kmem_free(path, strlen(path)+1);
1874 }
1875 
1876 /*
1877  * NFS client failover / volatile filehandle support
1878  *
1879  * Recover the filehandle for the given rnode.
1880  *
1881  * Errors are returned via the nfs4_error_t parameter.
1882  */
1883 
void
nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
{
	int is_stub;
	rnode4_t *rp = VTOR4(vp);
	vnode_t *rootvp = NULL;
	vnode_t *dvp = NULL;
	cred_t *cr, *cred_otw;
	nfs4_ga_res_t gar, pgar;
	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
	int filetype = RML_ORDINARY;
	nfs4_recov_state_t recov = {NULL, 0, 0};
	int badfhcount = 0;	/* guards against remap-root retry loops */
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting OTW cred */
	bool_t last_time = FALSE;	/* last time getting OTW cred */

	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_remap_file: remapping %s", rnode4info(rp)));
	ASSERT(nfs4_consistent_type(vp));

	/* The root has its own remap path; nothing more to do here. */
	if (vp->v_flag & VROOT) {
		nfs4_remap_root(mi, ep, flags);
		return;
	}

	/*
	 * Given the root fh, use the path stored in
	 * the rnode to find the fh for the new server.
	 */
	ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
	if (ep->error != 0)
		return;

	cr = curthread->t_cred;
	ASSERT(cr != NULL);
get_remap_cred:
	/*
	 * Releases the osp, if it is provided.
	 * Puts a hold on the cred_otw and the new osp (if found).
	 */
	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
	    &first_time, &last_time);
	ASSERT(cred_otw != NULL);

	/* Classify the object so remap_lookup builds the right compound. */
	if (rp->r_flags & R4ISXATTR) {
		filetype = RML_NAMED_ATTR;
		(void) vtodv(vp, &dvp, cred_otw, FALSE);
	}

	if (vp->v_flag & V_XATTRDIR) {
		filetype = RML_ATTRDIR;
	}

	if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
		/* file mount, doesn't need a remap */
		goto done;
	}

again:
	remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
	    &newfh, &gar, &newpfh, &pgar, ep);

	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_remap_file: remap_lookup returned %d/%d",
	    ep->error, ep->stat));

	/* Access denied with this cred: try again with a different one. */
	if (last_time == FALSE && ep->error == EACCES) {
		crfree(cred_otw);
		if (dvp != NULL)
			VN_RELE(dvp);
		goto get_remap_cred;
	}
	if (ep->error != 0)
		goto done;

	switch (ep->stat) {
	case NFS4_OK:
		badfhcount = 0;
		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
			mutex_enter(&rp->r_statelock);
			rp->r_delay_interval = 0;
			mutex_exit(&rp->r_statelock);
			uprintf("NFS File Available..\n");
		}
		break;
	case NFS4ERR_FHEXPIRED:
	case NFS4ERR_BADHANDLE:
	case NFS4ERR_STALE:
		/*
		 * If we ran into filehandle problems, we should try to
		 * remap the root vnode first and hope life gets better.
		 * But we need to avoid loops.
		 */
		if (badfhcount++ > 0)
			goto done;
		if (newfh.nfs_fh4_len != 0) {
			kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
			newfh.nfs_fh4_len = 0;
		}
		if (newpfh.nfs_fh4_len != 0) {
			kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
			newpfh.nfs_fh4_len = 0;
		}
		/* relative path - remap rootvp then retry */
		VN_RELE(rootvp);
		rootvp = NULL;
		nfs4_remap_root(mi, ep, flags);
		if (ep->error != 0 || ep->stat != NFS4_OK)
			goto done;
		ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (ep->error != 0)
			goto done;
		goto again;
	case NFS4ERR_DELAY:
		badfhcount = 0;
		nfs4_set_delay_wait(vp);
		ep->error = nfs4_wait_for_delay(vp, &recov);
		if (ep->error != 0)
			goto done;
		goto again;
	case NFS4ERR_ACCESS:
		/* get new cred, try again */
		if (last_time == TRUE)
			goto done;
		if (dvp != NULL)
			VN_RELE(dvp);
		crfree(cred_otw);
		goto get_remap_cred;
	default:
		goto done;
	}

	/*
	 * Check on the new and old rnodes before updating;
	 * if the vnode type or size changes, issue a warning
	 * and mark the file dead.
	 */
	mutex_enter(&rp->r_statelock);
	if (flags & NFS4_REMAP_CKATTRS) {
		if (vp->v_type != gar.n4g_va.va_type ||
		    (vp->v_type != VDIR &&
		    rp->r_size != gar.n4g_va.va_size)) {
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
			    (int)rp->r_size, (int)gar.n4g_va.va_size,
			    vp->v_type, gar.n4g_va.va_type));
			mutex_exit(&rp->r_statelock);
			nfs4_queue_event(RE_FILE_DIFF, mi,
			    rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
			    TAG_NONE, TAG_NONE, 0, 0);
			nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
			goto done;
		}
	}
	ASSERT(gar.n4g_va.va_type != VNON);
	rp->r_server = mi->mi_curr_serv;

	/*
	 * Turn this object into a "stub" object if we
	 * crossed an underlying server fs boundary.
	 *
	 * This stub will be for a mirror-mount.
	 * A referral would look like a boundary crossing
	 * as well, but would not be the same type of object,
	 * so we would expect to mark the object dead.
	 *
	 * See comment in r4_do_attrcache() for more details.
	 */
	is_stub = 0;
	if (gar.n4g_fsid_valid) {
		(void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
		rp->r_srv_fsid = gar.n4g_fsid;
		if (!FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
			is_stub = 1;
		nfs_rw_exit(&rp->r_server->sv_lock);
#ifdef DEBUG
	} else {
		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
		    "remap_file: fsid attr not provided by server.  rp=%p",
		    (void *)rp));
#endif
	}
	if (is_stub)
		r4_stub_mirrormount(rp);
	else
		r4_stub_none(rp);
	mutex_exit(&rp->r_statelock);
	nfs4_attrcache_noinval(vp, &gar, gethrtime()); /* force update */
	sfh4_update(rp->r_fh, &newfh);
	ASSERT(nfs4_consistent_type(vp));

	/*
	 * If we got parent info, use it to update the parent
	 */
	if (newpfh.nfs_fh4_len != 0) {
		if (rp->r_svnode.sv_dfh != NULL)
			sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
		if (dvp != NULL) {
			/* force update of attrs */
			nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
		}
	}
done:
	/* Common exit: release everything this function acquired. */
	if (newfh.nfs_fh4_len != 0)
		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
	if (newpfh.nfs_fh4_len != 0)
		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
	if (cred_otw != NULL)
		crfree(cred_otw);
	if (rootvp != NULL)
		VN_RELE(rootvp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if (osp != NULL)
		open_stream_rele(osp, rp);
}
2101 
2102 /*
2103  * Client-side failover support: remap the filehandle for vp if it appears
2104  * necessary.  errors are returned via the nfs4_error_t parameter; though,
2105  * if there is a problem, we will just try again later.
2106  */
2107 
2108 void
2109 nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
2110 {
2111 	if (vp == NULL)
2112 		return;
2113 
2114 	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY))
2115 		return;
2116 
2117 	if (VTOR4(vp)->r_server == mi->mi_curr_serv)
2118 		return;
2119 
2120 	nfs4_remap_file(mi, vp, flags, ep);
2121 }
2122 
2123 /*
2124  * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
2125  *
2126  * Our caller has a filehandle for ".." relative to a particular
2127  * directory object.  We want to find or create a parent vnode
2128  * with that filehandle and return it.  We can of course create
2129  * a vnode from this filehandle, but we need to also make sure
2130  * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
2131  * that we have a parent FH for future reopens as well.  If
2132  * we have a remap failure, we won't be able to reopen this
2133  * file, but we won't treat that as fatal because a reopen
2134  * is at least unlikely.  Someday nfs4_reopen() should look
2135  * for a missing parent FH and try a remap to recover from it.
2136  *
2137  * need_start_op argument indicates whether this function should
2138  * do a start_op before calling remap_lookup().  This should
2139  * be FALSE, if you are the recovery thread or in an op; otherwise,
2140  * set it to TRUE.
2141  */
int
nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
    cred_t *cr, vnode_t **vpp, int need_start_op)
{
	mntinfo4_t *mi = VTOMI4(dvp);
	nfs4_fname_t *np = NULL, *pnp = NULL;
	vnode_t *vp = NULL, *rootvp = NULL;
	rnode4_t *rp;
	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
	nfs4_ga_res_t gar, pgar;
	vattr_t va, pva;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
	nfs4_recov_state_t recov_state;

#ifdef DEBUG
	/*
	 * ensure need_start_op is correct: it must be FALSE exactly when
	 * the caller is the recovery thread or is already inside an op
	 * (indicated by the nfs4_tsd_key thread-specific data).
	 */
	{
		int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
		    (curthread == mi->mi_recovthread));
		/* C needs a ^^ operator! */
		ASSERT(((need_start_op) && (!no_need_start_op)) ||
		    ((! need_start_op) && (no_need_start_op)));
	}
#endif
	ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());

	NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
	    "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
	    rnode4info(VTOR4(dvp))));

	/*
	 * rootvp might be needed eventually. Holding it now will
	 * ensure that r4find_unlocked() will find it, if ".." is the root.
	 */
	e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
	if (e.error != 0)
		goto out;
	rp = r4find_unlocked(fhp, mi->mi_vfsp);
	if (rp != NULL) {
		/* Fast path: an rnode for ".." already exists; return it. */
		*vpp = RTOV4(rp);
		VN_RELE(rootvp);
		return (0);
	}

	/*
	 * Since we don't have the rnode, we have to go over the wire.
	 * remap_lookup() can get all of the filehandles and attributes
	 * we need in one operation.
	 */
	np = fn_parent(VTOSV(dvp)->sv_name);
	/* if a parent was not found return an error */
	if (np == NULL) {
		e.error = ENOENT;
		goto out;
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
recov_retry:
	if (need_start_op) {
		e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
		    &recov_state, NULL);
		if (e.error != 0) {
			goto out;
		}
	}

	/*
	 * Pre-tag both attribute results as invalid so we can tell below
	 * whether remap_lookup() actually filled them in.
	 */
	pgar.n4g_va.va_type = VNON;
	gar.n4g_va.va_type = VNON;

	remap_lookup(np, rootvp, RML_ORDINARY, cr,
	    &newfh, &gar, &newpfh, &pgar, &e);
	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
		if (need_start_op) {
			bool_t abort;

			abort = nfs4_start_recovery(&e, mi,
			    rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL, NULL,
			    NULL);
			if (abort) {
				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
				if (e.error == 0)
					e.error = EIO;
				goto out;
			}
			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);
			goto recov_retry;
		}
		/* Can't recover without start_op; fail the request. */
		if (e.error == 0)
			e.error = EIO;
		goto out;
	}

	va = gar.n4g_va;
	pva = pgar.n4g_va;

	/* ".." must exist and be a directory. */
	if ((e.error != 0) ||
	    (va.va_type != VDIR)) {
		if (need_start_op)
			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
			    &recov_state, FALSE);
		if (e.error == 0)
			e.error = EIO;
		goto out;
	}

	if (e.stat != NFS4_OK) {
		if (need_start_op)
			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
			    &recov_state, FALSE);
		e.error = EIO;
		goto out;
	}

	/*
	 * It is possible for remap_lookup() to return with no error,
	 * but without providing the parent filehandle and attrs.
	 */
	if (pva.va_type != VDIR) {
		/*
		 * Call remap_lookup() again, this time with the
		 * newpfh and pgar args in the first position.
		 */
		pnp = fn_parent(np);
		if (pnp != NULL) {
			remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
			    &newpfh, &pgar, NULL, NULL, &e);
			/*
			 * This remap_lookup call modifies pgar. The following
			 * line prevents trouble when checking the va_type of
			 * pva later in this code.
			 */
			pva = pgar.n4g_va;

			if (nfs4_needs_recovery(&e, FALSE,
			    mi->mi_vfsp)) {
				if (need_start_op) {
					bool_t abort;

					abort = nfs4_start_recovery(&e, mi,
					    rootvp, NULL, NULL, NULL,
					    OP_LOOKUP, NULL, NULL, NULL);
					if (abort) {
						nfs4_end_fop(mi, rootvp, NULL,
						    OH_LOOKUP, &recov_state,
						    FALSE);
						if (e.error == 0)
							e.error = EIO;
						goto out;
					}
					nfs4_end_fop(mi, rootvp, NULL,
					    OH_LOOKUP, &recov_state, TRUE);
					goto recov_retry;
				}
				if (e.error == 0)
					e.error = EIO;
				goto out;
			}

			if (e.stat != NFS4_OK) {
				if (need_start_op)
					nfs4_end_fop(mi, rootvp, NULL,
					    OH_LOOKUP, &recov_state, FALSE);
				e.error = EIO;
				goto out;
			}
		}
		/* Still no usable parent info: give up. */
		if ((pnp == NULL) ||
		    (e.error != 0) ||
		    (pva.va_type == VNON)) {
			if (need_start_op)
				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			if (e.error == 0)
				e.error = EIO;
			goto out;
		}
	}
	ASSERT(newpfh.nfs_fh4_len != 0);
	if (need_start_op)
		nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
	psfh = sfh4_get(&newpfh, mi);

	sfh = sfh4_get(&newfh, mi);
	vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);

out:
	/*
	 * Common cleanup: release name references, the filehandle buffers
	 * allocated by remap_lookup(), the shared filehandles, and the
	 * hold on the root vnode taken above.
	 */
	if (np != NULL)
		fn_rele(&np);
	if (pnp != NULL)
		fn_rele(&pnp);
	if (newfh.nfs_fh4_len != 0)
		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
	if (newpfh.nfs_fh4_len != 0)
		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
	if (sfh != NULL)
		sfh4_rele(&sfh);
	if (psfh != NULL)
		sfh4_rele(&psfh);
	if (rootvp != NULL)
		VN_RELE(rootvp);
	*vpp = vp;
	return (e.error);
}
2351 
2352 #ifdef DEBUG
2353 size_t r_path_memuse = 0;
2354 #endif
2355 
2356 /*
2357  * NFS client failover support
2358  *
2359  * sv4_free() frees the malloc'd portion of a "servinfo_t".
2360  */
2361 void
2362 sv4_free(servinfo4_t *svp)
2363 {
2364 	servinfo4_t *next;
2365 	struct knetconfig *knconf;
2366 
2367 	while (svp != NULL) {
2368 		next = svp->sv_next;
2369 		if (svp->sv_dhsec)
2370 			sec_clnt_freeinfo(svp->sv_dhsec);
2371 		if (svp->sv_secdata)
2372 			sec_clnt_freeinfo(svp->sv_secdata);
2373 		if (svp->sv_save_secinfo &&
2374 		    svp->sv_save_secinfo != svp->sv_secinfo)
2375 			secinfo_free(svp->sv_save_secinfo);
2376 		if (svp->sv_secinfo)
2377 			secinfo_free(svp->sv_secinfo);
2378 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
2379 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
2380 		knconf = svp->sv_knconf;
2381 		if (knconf != NULL) {
2382 			if (knconf->knc_protofmly != NULL)
2383 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2384 			if (knconf->knc_proto != NULL)
2385 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
2386 			kmem_free(knconf, sizeof (*knconf));
2387 		}
2388 		knconf = svp->sv_origknconf;
2389 		if (knconf != NULL) {
2390 			if (knconf->knc_protofmly != NULL)
2391 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2392 			if (knconf->knc_proto != NULL)
2393 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
2394 			kmem_free(knconf, sizeof (*knconf));
2395 		}
2396 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
2397 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
2398 		if (svp->sv_path != NULL) {
2399 			kmem_free(svp->sv_path, svp->sv_pathlen);
2400 		}
2401 		nfs_rw_destroy(&svp->sv_lock);
2402 		kmem_free(svp, sizeof (*svp));
2403 		svp = next;
2404 	}
2405 }
2406 
2407 void
2408 nfs4_printfhandle(nfs4_fhandle_t *fhp)
2409 {
2410 	int *ip;
2411 	char *buf;
2412 	size_t bufsize;
2413 	char *cp;
2414 
2415 	/*
2416 	 * 13 == "(file handle:"
2417 	 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
2418 	 *	1 == ' '
2419 	 *	8 == maximum strlen of "%x"
2420 	 * 3 == ")\n\0"
2421 	 */
2422 	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2423 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
2424 	if (buf == NULL)
2425 		return;
2426 
2427 	cp = buf;
2428 	(void) strcpy(cp, "(file handle:");
2429 	while (*cp != '\0')
2430 		cp++;
2431 	for (ip = (int *)fhp->fh_buf;
2432 	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
2433 	    ip++) {
2434 		(void) sprintf(cp, " %x", *ip);
2435 		while (*cp != '\0')
2436 			cp++;
2437 	}
2438 	(void) strcpy(cp, ")\n");
2439 
2440 	zcmn_err(getzoneid(), CE_CONT, "%s", buf);
2441 
2442 	kmem_free(buf, bufsize);
2443 }
2444 
2445 /*
2446  * The NFSv4 readdir cache subsystem.
2447  *
2448  * We provide a set of interfaces to allow the rest of the system to utilize
2449  * a caching mechanism while encapsulating the details of the actual
2450  * implementation.  This should allow for better maintainability and
2451  * extensibility by consolidating the implementation details in one location.
2452  */
2453 
2454 /*
2455  * Comparator used by AVL routines.
2456  */
2457 static int
2458 rddir4_cache_compar(const void *x, const void *y)
2459 {
2460 	rddir4_cache_impl *ai = (rddir4_cache_impl *)x;
2461 	rddir4_cache_impl *bi = (rddir4_cache_impl *)y;
2462 	rddir4_cache *a = &ai->rc;
2463 	rddir4_cache *b = &bi->rc;
2464 
2465 	if (a->nfs4_cookie == b->nfs4_cookie) {
2466 		if (a->buflen == b->buflen)
2467 			return (0);
2468 		if (a->buflen < b->buflen)
2469 			return (-1);
2470 		return (1);
2471 	}
2472 
2473 	if (a->nfs4_cookie < b->nfs4_cookie)
2474 			return (-1);
2475 
2476 	return (1);
2477 }
2478 
2479 /*
2480  * Allocate an opaque handle for the readdir cache.
2481  */
2482 void
2483 rddir4_cache_create(rnode4_t *rp)
2484 {
2485 	ASSERT(rp->r_dir == NULL);
2486 
2487 	rp->r_dir = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
2488 
2489 	avl_create(rp->r_dir, rddir4_cache_compar, sizeof (rddir4_cache_impl),
2490 	    offsetof(rddir4_cache_impl, tree));
2491 }
2492 
2493 /*
2494  *  Purge the cache of all cached readdir responses.
2495  */
2496 void
2497 rddir4_cache_purge(rnode4_t *rp)
2498 {
2499 	rddir4_cache_impl	*rdip;
2500 	rddir4_cache_impl	*nrdip;
2501 
2502 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2503 
2504 	if (rp->r_dir == NULL)
2505 		return;
2506 
2507 	rdip = avl_first(rp->r_dir);
2508 
2509 	while (rdip != NULL) {
2510 		nrdip = AVL_NEXT(rp->r_dir, rdip);
2511 		avl_remove(rp->r_dir, rdip);
2512 		rdip->rc.flags &= ~RDDIRCACHED;
2513 		rddir4_cache_rele(rp, &rdip->rc);
2514 		rdip = nrdip;
2515 	}
2516 	ASSERT(avl_numnodes(rp->r_dir) == 0);
2517 }
2518 
2519 /*
2520  * Destroy the readdir cache.
2521  */
2522 void
2523 rddir4_cache_destroy(rnode4_t *rp)
2524 {
2525 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2526 	if (rp->r_dir == NULL)
2527 		return;
2528 
2529 	rddir4_cache_purge(rp);
2530 	avl_destroy(rp->r_dir);
2531 	kmem_free(rp->r_dir, sizeof (avl_tree_t));
2532 	rp->r_dir = NULL;
2533 }
2534 
2535 /*
2536  * Locate a readdir response from the readdir cache.
2537  *
2538  * Return values:
2539  *
2540  * NULL - If there is an unrecoverable situation like the operation may have
2541  *	  been interrupted.
2542  *
2543  * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
2544  *		    The flags are set approprately, such that the caller knows
2545  *		    what state the entry is in.
2546  */
2547 rddir4_cache *
2548 rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
2549 {
2550 	rddir4_cache_impl	*rdip = NULL;
2551 	rddir4_cache_impl	srdip;
2552 	rddir4_cache		*srdc;
2553 	rddir4_cache		*rdc = NULL;
2554 	rddir4_cache		*nrdc = NULL;
2555 	avl_index_t		where;
2556 
2557 top:
2558 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2559 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2560 	/*
2561 	 * Check to see if the readdir cache has been disabled.  If so, then
2562 	 * simply allocate an rddir4_cache entry and return it, since caching
2563 	 * operations do not apply.
2564 	 */
2565 	if (rp->r_dir == NULL) {
2566 		if (nrdc == NULL) {
2567 			/*
2568 			 * Drop the lock because we are doing a sleeping
2569 			 * allocation.
2570 			 */
2571 			mutex_exit(&rp->r_statelock);
2572 			rdc = rddir4_cache_alloc(KM_SLEEP);
2573 			rdc->nfs4_cookie = cookie;
2574 			rdc->buflen = count;
2575 			mutex_enter(&rp->r_statelock);
2576 			return (rdc);
2577 		}
2578 		return (nrdc);
2579 	}
2580 
2581 	srdc = &srdip.rc;
2582 	srdc->nfs4_cookie = cookie;
2583 	srdc->buflen = count;
2584 
2585 	rdip = avl_find(rp->r_dir, &srdip, &where);
2586 
2587 	/*
2588 	 * If we didn't find an entry then create one and insert it
2589 	 * into the cache.
2590 	 */
2591 	if (rdip == NULL) {
2592 		/*
2593 		 * Check for the case where we have made a second pass through
2594 		 * the cache due to a lockless allocation.  If we find that no
2595 		 * thread has already inserted this entry, do the insert now
2596 		 * and return.
2597 		 */
2598 		if (nrdc != NULL) {
2599 			avl_insert(rp->r_dir, nrdc->data, where);
2600 			nrdc->flags |= RDDIRCACHED;
2601 			rddir4_cache_hold(nrdc);
2602 			return (nrdc);
2603 		}
2604 
2605 #ifdef DEBUG
2606 		nfs4_readdir_cache_misses++;
2607 #endif
2608 		/*
2609 		 * First, try to allocate an entry without sleeping.  If that
2610 		 * fails then drop the lock and do a sleeping allocation.
2611 		 */
2612 		nrdc = rddir4_cache_alloc(KM_NOSLEEP);
2613 		if (nrdc != NULL) {
2614 			nrdc->nfs4_cookie = cookie;
2615 			nrdc->buflen = count;
2616 			avl_insert(rp->r_dir, nrdc->data, where);
2617 			nrdc->flags |= RDDIRCACHED;
2618 			rddir4_cache_hold(nrdc);
2619 			return (nrdc);
2620 		}
2621 
2622 		/*
2623 		 * Drop the lock and do a sleeping allocation.	We incur
2624 		 * additional overhead by having to search the cache again,
2625 		 * but this case should be rare.
2626 		 */
2627 		mutex_exit(&rp->r_statelock);
2628 		nrdc = rddir4_cache_alloc(KM_SLEEP);
2629 		nrdc->nfs4_cookie = cookie;
2630 		nrdc->buflen = count;
2631 		mutex_enter(&rp->r_statelock);
2632 		/*
2633 		 * We need to take another pass through the cache
2634 		 * since we dropped our lock to perform the alloc.
2635 		 * Another thread may have come by and inserted the
2636 		 * entry we are interested in.
2637 		 */
2638 		goto top;
2639 	}
2640 
2641 	/*
2642 	 * Check to see if we need to free our entry.  This can happen if
2643 	 * another thread came along beat us to the insert.  We can
2644 	 * safely call rddir4_cache_free directly because no other thread
2645 	 * would have a reference to this entry.
2646 	 */
2647 	if (nrdc != NULL)
2648 		rddir4_cache_free((rddir4_cache_impl *)nrdc->data);
2649 
2650 #ifdef DEBUG
2651 	nfs4_readdir_cache_hits++;
2652 #endif
2653 	/*
2654 	 * Found something.  Make sure it's ready to return.
2655 	 */
2656 	rdc = &rdip->rc;
2657 	rddir4_cache_hold(rdc);
2658 	/*
2659 	 * If the cache entry is in the process of being filled in, wait
2660 	 * until this completes.  The RDDIRWAIT bit is set to indicate that
2661 	 * someone is waiting and when the thread currently filling the entry
2662 	 * is done, it should do a cv_broadcast to wakeup all of the threads
2663 	 * waiting for it to finish. If the thread wakes up to find that
2664 	 * someone new is now trying to complete the the entry, go back
2665 	 * to sleep.
2666 	 */
2667 	while (rdc->flags & RDDIR) {
2668 		/*
2669 		 * The entry is not complete.
2670 		 */
2671 		nfs_rw_exit(&rp->r_rwlock);
2672 		rdc->flags |= RDDIRWAIT;
2673 #ifdef DEBUG
2674 		nfs4_readdir_cache_waits++;
2675 #endif
2676 		while (rdc->flags & RDDIRWAIT) {
2677 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
2678 				/*
2679 				 * We got interrupted, probably the user
2680 				 * typed ^C or an alarm fired.  We free the
2681 				 * new entry if we allocated one.
2682 				 */
2683 				rddir4_cache_rele(rp, rdc);
2684 				mutex_exit(&rp->r_statelock);
2685 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
2686 				    RW_READER, FALSE);
2687 				mutex_enter(&rp->r_statelock);
2688 				return (NULL);
2689 			}
2690 		}
2691 		mutex_exit(&rp->r_statelock);
2692 		(void) nfs_rw_enter_sig(&rp->r_rwlock,
2693 		    RW_READER, FALSE);
2694 		mutex_enter(&rp->r_statelock);
2695 	}
2696 
2697 	/*
2698 	 * The entry we were waiting on may have been purged from
2699 	 * the cache and should no longer be used, release it and
2700 	 * start over.
2701 	 */
2702 	if (!(rdc->flags & RDDIRCACHED)) {
2703 		rddir4_cache_rele(rp, rdc);
2704 		goto top;
2705 	}
2706 
2707 	/*
2708 	 * The entry is completed.  Return it.
2709 	 */
2710 	return (rdc);
2711 }
2712 
2713 /*
2714  * Allocate a cache element and return it.  Can return NULL if memory is
2715  * low.
2716  */
2717 static rddir4_cache *
2718 rddir4_cache_alloc(int flags)
2719 {
2720 	rddir4_cache_impl	*rdip = NULL;
2721 	rddir4_cache		*rc = NULL;
2722 
2723 	rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);
2724 
2725 	if (rdip != NULL) {
2726 		rc = &rdip->rc;
2727 		rc->data = (void *)rdip;
2728 		rc->nfs4_cookie = 0;
2729 		rc->nfs4_ncookie = 0;
2730 		rc->entries = NULL;
2731 		rc->eof = 0;
2732 		rc->entlen = 0;
2733 		rc->buflen = 0;
2734 		rc->actlen = 0;
2735 		/*
2736 		 * A readdir is required so set the flag.
2737 		 */
2738 		rc->flags = RDDIRREQ;
2739 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
2740 		rc->error = 0;
2741 		mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
2742 		rdip->count = 1;
2743 #ifdef DEBUG
2744 		atomic_add_64(&clstat4_debug.dirent.value.ui64, 1);
2745 #endif
2746 	}
2747 	return (rc);
2748 }
2749 
2750 /*
2751  * Increment the reference count to this cache element.
2752  */
2753 static void
2754 rddir4_cache_hold(rddir4_cache *rc)
2755 {
2756 	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rc->data;
2757 
2758 	mutex_enter(&rdip->lock);
2759 	rdip->count++;
2760 	mutex_exit(&rdip->lock);
2761 }
2762 
2763 /*
2764  * Release a reference to this cache element.  If the count is zero then
2765  * free the element.
2766  */
2767 void
2768 rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
2769 {
2770 	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rdc->data;
2771 
2772 	ASSERT(MUTEX_HELD(&rp->r_statelock));
2773 
2774 	/*
2775 	 * Check to see if we have any waiters.  If so, we can wake them
2776 	 * so that they can proceed.
2777 	 */
2778 	if (rdc->flags & RDDIRWAIT) {
2779 		rdc->flags &= ~RDDIRWAIT;
2780 		cv_broadcast(&rdc->cv);
2781 	}
2782 
2783 	mutex_enter(&rdip->lock);
2784 	ASSERT(rdip->count > 0);
2785 	if (--rdip->count == 0) {
2786 		mutex_exit(&rdip->lock);
2787 		rddir4_cache_free(rdip);
2788 	} else
2789 		mutex_exit(&rdip->lock);
2790 }
2791 
2792 /*
2793  * Free a cache element.
2794  */
2795 static void
2796 rddir4_cache_free(rddir4_cache_impl *rdip)
2797 {
2798 	rddir4_cache *rc = &rdip->rc;
2799 
2800 #ifdef DEBUG
2801 	atomic_add_64(&clstat4_debug.dirent.value.ui64, -1);
2802 #endif
2803 	if (rc->entries != NULL)
2804 		kmem_free(rc->entries, rc->buflen);
2805 	cv_destroy(&rc->cv);
2806 	mutex_destroy(&rdip->lock);
2807 	kmem_free(rdip, sizeof (*rdip));
2808 }
2809 
2810 /*
2811  * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
2812  * framework.
2813  */
2814 static int
2815 cl4_snapshot(kstat_t *ksp, void *buf, int rw)
2816 {
2817 	ksp->ks_snaptime = gethrtime();
2818 	if (rw == KSTAT_WRITE) {
2819 		bcopy(buf, ksp->ks_private, sizeof (clstat4_tmpl));
2820 #ifdef DEBUG
2821 		/*
2822 		 * Currently only the global zone can write to kstats, but we
2823 		 * add the check just for paranoia.
2824 		 */
2825 		if (INGLOBALZONE(curproc))
2826 			bcopy((char *)buf + sizeof (clstat4_tmpl),
2827 			    &clstat4_debug, sizeof (clstat4_debug));
2828 #endif
2829 	} else {
2830 		bcopy(ksp->ks_private, buf, sizeof (clstat4_tmpl));
2831 #ifdef DEBUG
2832 		/*
2833 		 * If we're displaying the "global" debug kstat values, we
2834 		 * display them as-is to all zones since in fact they apply to
2835 		 * the system as a whole.
2836 		 */
2837 		bcopy(&clstat4_debug, (char *)buf + sizeof (clstat4_tmpl),
2838 		    sizeof (clstat4_debug));
2839 #endif
2840 	}
2841 	return (0);
2842 }
2843 
2844 
2845 
2846 /*
2847  * Zone support
2848  */
2849 static void *
2850 clinit4_zone(zoneid_t zoneid)
2851 {
2852 	kstat_t *nfs4_client_kstat;
2853 	struct nfs4_clnt *nfscl;
2854 	uint_t ndata;
2855 
2856 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
2857 	mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
2858 	nfscl->nfscl_chtable4 = NULL;
2859 	nfscl->nfscl_zoneid = zoneid;
2860 
2861 	bcopy(&clstat4_tmpl, &nfscl->nfscl_stat, sizeof (clstat4_tmpl));
2862 	ndata = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
2863 #ifdef DEBUG
2864 	ndata += sizeof (clstat4_debug) / sizeof (kstat_named_t);
2865 #endif
2866 	if ((nfs4_client_kstat = kstat_create_zone("nfs", 0, "nfs4_client",
2867 	    "misc", KSTAT_TYPE_NAMED, ndata,
2868 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
2869 		nfs4_client_kstat->ks_private = &nfscl->nfscl_stat;
2870 		nfs4_client_kstat->ks_snapshot = cl4_snapshot;
2871 		kstat_install(nfs4_client_kstat);
2872 	}
2873 	mutex_enter(&nfs4_clnt_list_lock);
2874 	list_insert_head(&nfs4_clnt_list, nfscl);
2875 	mutex_exit(&nfs4_clnt_list_lock);
2876 
2877 	return (nfscl);
2878 }
2879 
/*ARGSUSED*/
/*
 * ZSD destructor: tear down the per-zone NFSv4 client state created by
 * clinit4_zone() — reclaim client handles, free the handle table, and
 * remove the zone's kstat.
 */
static void
clfini4_zone(zoneid_t zoneid, void *arg)
{
	struct nfs4_clnt *nfscl = arg;
	chhead_t *chp, *next;

	if (nfscl == NULL)
		return;
	mutex_enter(&nfs4_clnt_list_lock);
	list_remove(&nfs4_clnt_list, nfscl);
	mutex_exit(&nfs4_clnt_list_lock);
	/* Drop all cached client handles for this zone first. */
	clreclaim4_zone(nfscl, 0);
	for (chp = nfscl->nfscl_chtable4; chp != NULL; chp = next) {
		ASSERT(chp->ch_list == NULL);
		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
		next = chp->ch_next;
		kmem_free(chp, sizeof (*chp));
	}
	kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
	mutex_destroy(&nfscl->nfscl_chtable4_lock);
	kmem_free(nfscl, sizeof (*nfscl));
}
2903 
2904 /*
2905  * Called by endpnt_destructor to make sure the client handles are
2906  * cleaned up before the RPC endpoints.  This becomes a no-op if
2907  * clfini_zone (above) is called first.  This function is needed
2908  * (rather than relying on clfini_zone to clean up) because the ZSD
2909  * callbacks have no ordering mechanism, so we have no way to ensure
2910  * that clfini_zone is called before endpnt_destructor.
2911  */
2912 void
2913 clcleanup4_zone(zoneid_t zoneid)
2914 {
2915 	struct nfs4_clnt *nfscl;
2916 
2917 	mutex_enter(&nfs4_clnt_list_lock);
2918 	nfscl = list_head(&nfs4_clnt_list);
2919 	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl)) {
2920 		if (nfscl->nfscl_zoneid == zoneid) {
2921 			clreclaim4_zone(nfscl, 0);
2922 			break;
2923 		}
2924 	}
2925 	mutex_exit(&nfs4_clnt_list_lock);
2926 }
2927 
/*
 * Module-load initialization for this file: set up the client handle
 * cache, the per-zone list, and the ZSD key.  Always returns 0.
 */
int
nfs4_subr_init(void)
{
	/*
	 * Allocate and initialize the client handle cache
	 */
	chtab4_cache = kmem_cache_create("client_handle4_cache",
	    sizeof (struct chtab), 0, NULL, NULL, clreclaim4, NULL,
	    NULL, 0);

	/*
	 * Initialize the list of per-zone client handles (and associated data).
	 * This needs to be done before we call zone_key_create().
	 */
	list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
	    offsetof(struct nfs4_clnt, nfscl_node));

	/*
	 * Initialize the zone_key for per-zone client handle lists.
	 */
	zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);

	if (nfs4err_delay_time == 0)
		nfs4err_delay_time = NFS4ERR_DELAY_TIME;

	return (0);
}
2955 
/*
 * Module-unload teardown: undo nfs4_subr_init().  Deleting the zone
 * key runs clfini4_zone() for every remaining zone.  Always returns 0.
 */
int
nfs4_subr_fini(void)
{
	/*
	 * Deallocate the client handle cache
	 */
	kmem_cache_destroy(chtab4_cache);

	/*
	 * Destroy the zone_key
	 */
	(void) zone_key_delete(nfs4clnt_zone_key);

	return (0);
}
2971 /*
2972  * Set or Clear direct I/O flag
2973  * VOP_RWLOCK() is held for write access to prevent a race condition
2974  * which would occur if a process is in the middle of a write when
2975  * directio flag gets set. It is possible that all pages may not get flushed.
2976  *
2977  * This is a copy of nfs_directio, changes here may need to be made
2978  * there and vice versa.
2979  */
2980 
2981 int
2982 nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
2983 {
2984 	int	error = 0;
2985 	rnode4_t *rp;
2986 
2987 	rp = VTOR4(vp);
2988 
2989 	if (cmd == DIRECTIO_ON) {
2990 
2991 		if (rp->r_flags & R4DIRECTIO)
2992 			return (0);
2993 
2994 		/*
2995 		 * Flush the page cache.
2996 		 */
2997 
2998 		(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
2999 
3000 		if (rp->r_flags & R4DIRECTIO) {
3001 			VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
3002 			return (0);
3003 		}
3004 
3005 		if (nfs4_has_pages(vp) &&
3006 		    ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
3007 			error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
3008 			    B_INVAL, cr, NULL);
3009 			if (error) {
3010 				if (error == ENOSPC || error == EDQUOT) {
3011 					mutex_enter(&rp->r_statelock);
3012 					if (!rp->r_error)
3013 						rp->r_error = error;
3014 					mutex_exit(&rp->r_statelock);
3015 				}
3016 				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
3017 				return (error);
3018 			}
3019 		}
3020 
3021 		mutex_enter(&rp->r_statelock);
3022 		rp->r_flags |= R4DIRECTIO;
3023 		mutex_exit(&rp->r_statelock);
3024 		VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
3025 		return (0);
3026 	}
3027 
3028 	if (cmd == DIRECTIO_OFF) {
3029 		mutex_enter(&rp->r_statelock);
3030 		rp->r_flags &= ~R4DIRECTIO;	/* disable direct mode */
3031 		mutex_exit(&rp->r_statelock);
3032 		return (0);
3033 	}
3034 
3035 	return (EINVAL);
3036 }
3037 
3038 /*
3039  * Return TRUE if the file has any pages.  Always go back to
3040  * the master vnode to check v_pages since none of the shadows
3041  * can have pages.
3042  */
3043 
3044 bool_t
3045 nfs4_has_pages(vnode_t *vp)
3046 {
3047 	rnode4_t *rp;
3048 
3049 	rp = VTOR4(vp);
3050 	if (IS_SHADOW(vp, rp))
3051 		vp = RTOV4(rp);	/* RTOV4 always gives the master */
3052 
3053 	return (vn_has_cached_data(vp));
3054 }
3055 
3056 /*
3057  * This table is used to determine whether the client should attempt
3058  * failover based on the clnt_stat value returned by CLNT_CALL.  The
3059  * clnt_stat is used as an index into the table.  If
3060  * the error value that corresponds to the clnt_stat value in the
3061  * table is non-zero, then that is the error to be returned AND
3062  * that signals that failover should be attempted.
3063  *
3064  * Special note: If the RPC_ values change, then direct indexing of the
3065  * table is no longer valid, but having the RPC_ values in the table
3066  * allow the functions to detect the change and issue a warning.
3067  * In this case, the code will always attempt failover as a defensive
3068  * measure.
3069  */
3070 
3071 static struct try_failover_tab {
3072 	enum clnt_stat	cstat;
3073 	int		error;
3074 } try_failover_table [] = {
3075 
3076 	RPC_SUCCESS,		0,
3077 	RPC_CANTENCODEARGS,	0,
3078 	RPC_CANTDECODERES,	0,
3079 	RPC_CANTSEND,		ECOMM,
3080 	RPC_CANTRECV,		ECOMM,
3081 	RPC_TIMEDOUT,		ETIMEDOUT,
3082 	RPC_VERSMISMATCH,	0,
3083 	RPC_AUTHERROR,		0,
3084 	RPC_PROGUNAVAIL,	0,
3085 	RPC_PROGVERSMISMATCH,	0,
3086 	RPC_PROCUNAVAIL,	0,
3087 	RPC_CANTDECODEARGS,	0,
3088 	RPC_SYSTEMERROR,	ENOSR,
3089 	RPC_UNKNOWNHOST,	EHOSTUNREACH,
3090 	RPC_RPCBFAILURE,	ENETUNREACH,
3091 	RPC_PROGNOTREGISTERED,	ECONNREFUSED,
3092 	RPC_FAILED,		ETIMEDOUT,
3093 	RPC_UNKNOWNPROTO,	EHOSTUNREACH,
3094 	RPC_INTR,		0,
3095 	RPC_UNKNOWNADDR,	EHOSTUNREACH,
3096 	RPC_TLIERROR,		0,
3097 	RPC_NOBROADCAST,	EHOSTUNREACH,
3098 	RPC_N2AXLATEFAILURE,	ECONNREFUSED,
3099 	RPC_UDERROR,		0,
3100 	RPC_INPROGRESS,		0,
3101 	RPC_STALERACHANDLE,	EINVAL,
3102 	RPC_CANTCONNECT,	ECONNREFUSED,
3103 	RPC_XPRTFAILED,		ECONNABORTED,
3104 	RPC_CANTCREATESTREAM,	ECONNREFUSED,
3105 	RPC_CANTSTORE,		ENOBUFS
3106 };
3107 
3108 /*
3109  * nfs4_try_failover - determine whether the client should
3110  * attempt failover based on the values stored in the nfs4_error_t.
3111  */
3112 int
3113 nfs4_try_failover(nfs4_error_t *ep)
3114 {
3115 	if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
3116 		return (TRUE);
3117 
3118 	if (ep->error && ep->rpc_status != RPC_SUCCESS)
3119 		return (try_failover(ep->rpc_status) != 0 ? TRUE : FALSE);
3120 
3121 	return (FALSE);
3122 }
3123 
3124 /*
3125  * try_failover - internal version of nfs4_try_failover, called
3126  * only by rfscall and aclcall.  Determine if failover is warranted
3127  * based on the clnt_stat and return the error number if it is.
3128  */
3129 static int
3130 try_failover(enum clnt_stat rpc_status)
3131 {
3132 	int err = 0;
3133 
3134 	if (rpc_status == RPC_SUCCESS)
3135 		return (0);
3136 
3137 #ifdef	DEBUG
3138 	if (rpc_status != 0 && nfs4_try_failover_any) {
3139 		err = ETIMEDOUT;
3140 		goto done;
3141 	}
3142 #endif
3143 	/*
3144 	 * The rpc status is used as an index into the table.
3145 	 * If the rpc status is outside of the range of the
3146 	 * table or if the rpc error numbers have been changed
3147 	 * since the table was constructed, then print a warning
3148 	 * (DEBUG only) and try failover anyway.  Otherwise, just
3149 	 * grab the resulting error number out of the table.
3150 	 */
3151 	if (rpc_status < RPC_SUCCESS || rpc_status >=
3152 	    sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
3153 	    try_failover_table[rpc_status].cstat != rpc_status) {
3154 
3155 		err = ETIMEDOUT;
3156 #ifdef	DEBUG
3157 		cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
3158 		    rpc_status);
3159 #endif
3160 	} else
3161 		err = try_failover_table[rpc_status].error;
3162 
3163 done:
3164 	if (rpc_status)
3165 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
3166 		    "nfs4_try_failover: %strying failover on error %d",
3167 		    err ? "" : "NOT ", rpc_status));
3168 
3169 	return (err);
3170 }
3171 
/*
 * Reset an nfs4_error_t to the "no error" state.
 */
void
nfs4_error_zinit(nfs4_error_t *ep)
{
	ep->error = 0;
	ep->stat = NFS4_OK;
	ep->rpc_status = RPC_SUCCESS;
}
3179 
/*
 * Initialize an nfs4_error_t with the given errno value and clear
 * the NFS and RPC status fields.
 */
void
nfs4_error_init(nfs4_error_t *ep, int error)
{
	ep->error = error;
	ep->stat = NFS4_OK;
	ep->rpc_status = RPC_SUCCESS;
}
3187 
3188 
3189 #ifdef DEBUG
3190 
3191 /*
3192  * Return a 16-bit hash for filehandle, stateid, clientid, owner.
3193  * use the same algorithm as for NFS v3.
3194  *
3195  */
3196 int
3197 hash16(void *p, int len)
3198 {
3199 	int i, rem;
3200 	uint_t *wp;
3201 	uint_t key = 0;
3202 
3203 	/* protect against non word aligned */
3204 	if ((rem = len & 3) != 0)
3205 		len &= ~3;
3206 
3207 	for (i = 0, wp = (uint_t *)p; i < len; i += 4, wp++) {
3208 		key ^= (*wp >> 16) ^ *wp;
3209 	}
3210 
3211 	/* hash left-over bytes */
3212 	for (i = 0; i < rem; i++)
3213 		key ^= *((uchar_t *)p + i);
3214 
3215 	return (key & 0xffff);
3216 }
3217 
3218 /*
3219  * rnode4info - return filehandle and path information for an rnode.
3220  * XXX MT issues: uses a single static buffer, no locking of path.
3221  */
3222 char *
3223 rnode4info(rnode4_t *rp)
3224 {
3225 	static char buf[80];
3226 	nfs4_fhandle_t fhandle;
3227 	char *path;
3228 	char *type;
3229 
3230 	if (rp == NULL)
3231 		return ("null");
3232 	if (rp->r_flags & R4ISXATTR)
3233 		type = "attr";
3234 	else if (RTOV4(rp)->v_flag & V_XATTRDIR)
3235 		type = "attrdir";
3236 	else if (RTOV4(rp)->v_flag & VROOT)
3237 		type = "root";
3238 	else if (RTOV4(rp)->v_type == VDIR)
3239 		type = "dir";
3240 	else if (RTOV4(rp)->v_type == VREG)
3241 		type = "file";
3242 	else
3243 		type = "other";
3244 	sfh4_copyval(rp->r_fh, &fhandle);
3245 	path = fn_path(rp->r_svnode.sv_name);
3246 	(void) snprintf(buf, 80, "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
3247 	    (void *)rp, path, type, rp->r_flags,
3248 	    hash16((void *)&fhandle.fh_buf, fhandle.fh_len));
3249 	kmem_free(path, strlen(path)+1);
3250 	return (buf);
3251 }
3252 #endif
3253