1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
29 */
30
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/proc.h>
36 #include <sys/user.h>
37 #include <sys/time.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/socket.h>
42 #include <sys/uio.h>
43 #include <sys/tiuser.h>
44 #include <sys/swap.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
47 #include <sys/kmem.h>
48 #include <sys/kstat.h>
49 #include <sys/cmn_err.h>
50 #include <sys/vtrace.h>
51 #include <sys/session.h>
52 #include <sys/dnlc.h>
53 #include <sys/bitmap.h>
54 #include <sys/acl.h>
55 #include <sys/ddi.h>
56 #include <sys/pathname.h>
57 #include <sys/flock.h>
58 #include <sys/dirent.h>
59 #include <sys/flock.h>
60 #include <sys/callb.h>
61 #include <sys/atomic.h>
62 #include <sys/list.h>
63 #include <sys/tsol/tnet.h>
64 #include <sys/priv.h>
65 #include <sys/sdt.h>
66 #include <sys/attr.h>
67
68 #include <inet/ip6.h>
69
70 #include <rpc/types.h>
71 #include <rpc/xdr.h>
72 #include <rpc/auth.h>
73 #include <rpc/clnt.h>
74
75 #include <nfs/nfs.h>
76 #include <nfs/nfs4.h>
77 #include <nfs/nfs_clnt.h>
78 #include <nfs/rnode.h>
79 #include <nfs/nfs_acl.h>
80
81 #include <sys/tsol/label.h>
82
83 /*
84 * The hash queues for the access to active and cached rnodes
85 * are organized as doubly linked lists. A reader/writer lock
86 * for each hash bucket is used to control access and to synchronize
87 * lookups, additions, and deletions from the hash queue.
88 *
89 * The rnode freelist is organized as a doubly linked list with
90 * a head pointer. Additions and deletions are synchronized via
91 * a single mutex.
92 *
93 * In order to add an rnode to the free list, it must be hashed into
94 * a hash queue and the exclusive lock to the hash queue must be held.
95 * If an rnode is not hashed into a hash queue, then it is destroyed
96 * because it represents no valuable information that can be reused
97 * about the file. The exclusive lock to the hash queue must be
98 * held in order to prevent a lookup in the hash queue from finding
99 * the rnode and using it and assuming that the rnode is not on the
100 * freelist. The lookup in the hash queue will have the hash queue
101 * locked, either exclusive or shared.
102 *
103 * The vnode reference count for each rnode is not allowed to drop
104 * below 1. This prevents external entities, such as the VM
105 * subsystem, from acquiring references to vnodes already on the
106 * freelist and then trying to place them back on the freelist
107 * when their reference is released. This means that when an
108 * rnode is looked up in the hash queues, then either the rnode
109 * is removed from the freelist and that reference is transferred to
110 * the new reference or the vnode reference count must be incremented
111 * accordingly. The mutex for the freelist must be held in order to
112 * accurately test to see if the rnode is on the freelist or not.
113 * The hash queue lock might be held shared and it is possible that
114 * two different threads may race to remove the rnode from the
115 * freelist. This race can be resolved by holding the mutex for the
116 * freelist. Please note that the mutex for the freelist does not
117 * need to be held if the rnode is not on the freelist. It cannot be
118 * placed on the freelist due to the requirement that the thread
119 * putting the rnode on the freelist must hold the exclusive lock
120 * to the hash queue and the thread doing the lookup in the hash
121 * queue is holding either a shared or exclusive lock to the hash
122 * queue.
123 *
124 * The lock ordering is:
125 *
126 * hash bucket lock -> vnode lock
127 * hash bucket lock -> freelist lock
128 */
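/*
 * A minimal sketch of the ordering described above (never compiled; the
 * real lookup paths are rfind() and make_rnode() below, and field names
 * such as r_lock and r_freef are assumptions drawn from this comment):
 */
#if 0	/* illustrative only */
	rhashq_t *rhtp;		/* bucket chosen via rtablehash(fh) */
	rnode_t *rp;

	rw_enter(&rhtp->r_lock, RW_READER);	/* 1. hash bucket lock */
	rp = rfind(rhtp, fh, vfsp);
	if (rp != NULL) {
		mutex_enter(&rpfreelist_lock);	/* 2. freelist lock */
		if (rp->r_freef != NULL)	/* accurate only under the mutex */
			rp_rmfree(rp);		/* reuse the freelist reference */
		mutex_exit(&rpfreelist_lock);
	}
	rw_exit(&rhtp->r_lock);
#endif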
129 static rhashq_t *rtable;
130
131 static kmutex_t rpfreelist_lock;
132 static rnode_t *rpfreelist = NULL;
133 static long rnew = 0;
134 long nrnode = 0;
135
136 static int rtablesize;
137 static int rtablemask;
138
139 static int hashlen = 4;
140
141 static struct kmem_cache *rnode_cache;
142
143 /*
144 * Mutex to protect the following variables:
145 * nfs_major
146 * nfs_minor
147 */
148 kmutex_t nfs_minor_lock;
149 int nfs_major;
150 int nfs_minor;
151
152 /* Do we allow preepoch (negative) time values otw? */
153 bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */
154
155 /*
156 * Access cache
157 */
158 static acache_hash_t *acache;
159 static long nacache; /* used strictly to size the number of hash queues */
160
161 static int acachesize;
162 static int acachemask;
163 static struct kmem_cache *acache_cache;
164
165 /*
166 * Client side utilities
167 */
168
169 /*
170 * client side statistics
171 */
172 static const struct clstat clstat_tmpl = {
173 { "calls", KSTAT_DATA_UINT64 },
174 { "badcalls", KSTAT_DATA_UINT64 },
175 { "clgets", KSTAT_DATA_UINT64 },
176 { "cltoomany", KSTAT_DATA_UINT64 },
177 #ifdef DEBUG
178 { "clalloc", KSTAT_DATA_UINT64 },
179 { "noresponse", KSTAT_DATA_UINT64 },
180 { "failover", KSTAT_DATA_UINT64 },
181 { "remap", KSTAT_DATA_UINT64 },
182 #endif
183 };
184
185 /*
186 * The following are statistics that describe behavior of the system as a whole
187 * and don't correspond to any one particular zone.
188 */
189 #ifdef DEBUG
190 static struct clstat_debug {
191 kstat_named_t nrnode; /* number of allocated rnodes */
192 kstat_named_t access; /* size of access cache */
193 kstat_named_t dirent; /* size of readdir cache */
194 kstat_named_t dirents; /* size of readdir buf cache */
195 kstat_named_t reclaim; /* number of reclaims */
196 kstat_named_t clreclaim; /* number of cl reclaims */
197 kstat_named_t f_reclaim; /* number of free reclaims */
198 kstat_named_t a_reclaim; /* number of active reclaims */
199 kstat_named_t r_reclaim; /* number of rnode reclaims */
200 kstat_named_t rpath; /* bytes used to store rpaths */
201 } clstat_debug = {
202 { "nrnode", KSTAT_DATA_UINT64 },
203 { "access", KSTAT_DATA_UINT64 },
204 { "dirent", KSTAT_DATA_UINT64 },
205 { "dirents", KSTAT_DATA_UINT64 },
206 { "reclaim", KSTAT_DATA_UINT64 },
207 { "clreclaim", KSTAT_DATA_UINT64 },
208 { "f_reclaim", KSTAT_DATA_UINT64 },
209 { "a_reclaim", KSTAT_DATA_UINT64 },
210 { "r_reclaim", KSTAT_DATA_UINT64 },
211 { "r_path", KSTAT_DATA_UINT64 },
212 };
213 #endif /* DEBUG */
214
215 /*
216 * We keep a global list of per-zone client data, so we can clean up all zones
217 * if we get low on memory.
218 */
219 static list_t nfs_clnt_list;
220 static kmutex_t nfs_clnt_list_lock;
221 static zone_key_t nfsclnt_zone_key;
222
223 static struct kmem_cache *chtab_cache;
224
225 /*
226 * Some servers do not properly update the attributes of the
227 * directory when changes are made. To allow interoperability
228 * with these broken servers, the nfs_disable_rddir_cache
229 * parameter must be set in /etc/system
230 */
231 int nfs_disable_rddir_cache = 0;
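/*
 * For example, adding the line
 *	set nfs:nfs_disable_rddir_cache = 1
 * to /etc/system (the usual "set module:variable = value" form) disables
 * readdir caching for such servers after the next reboot.
 */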
232
233 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
234 struct chtab **);
235 void clfree(CLIENT *, struct chtab *);
236 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
237 struct chtab **, struct nfs_clnt *);
238 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
239 struct chtab **, struct nfs_clnt *);
240 static void clreclaim(void *);
241 static int nfs_feedback(int, int, mntinfo_t *);
242 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
243 caddr_t, cred_t *, int *, enum clnt_stat *, int,
244 failinfo_t *);
245 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
246 caddr_t, cred_t *, int *, int, failinfo_t *);
247 static void rinactive(rnode_t *, cred_t *);
248 static int rtablehash(nfs_fhandle *);
249 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
250 struct vnodeops *,
251 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
252 cred_t *),
253 int (*)(const void *, const void *), int *, cred_t *,
254 char *, char *);
255 static void rp_rmfree(rnode_t *);
256 static void rp_addhash(rnode_t *);
257 static void rp_rmhash_locked(rnode_t *);
258 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
259 static void destroy_rnode(rnode_t *);
260 static void rddir_cache_free(rddir_cache *);
261 static int nfs_free_data_reclaim(rnode_t *);
262 static int nfs_active_data_reclaim(rnode_t *);
263 static int nfs_free_reclaim(void);
264 static int nfs_active_reclaim(void);
265 static int nfs_rnode_reclaim(void);
266 static void nfs_reclaim(void *);
267 static int failover_safe(failinfo_t *);
268 static void failover_newserver(mntinfo_t *mi);
269 static void failover_thread(mntinfo_t *mi);
270 static int failover_wait(mntinfo_t *);
271 static int failover_remap(failinfo_t *);
272 static int failover_lookup(char *, vnode_t *,
273 int (*)(vnode_t *, char *, vnode_t **,
274 struct pathname *, int, vnode_t *, cred_t *, int),
275 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
276 vnode_t **);
277 static void nfs_free_r_path(rnode_t *);
278 static void nfs_set_vroot(vnode_t *);
279 static char *nfs_getsrvnames(mntinfo_t *, size_t *);
280
281 /*
282 * from rpcsec module (common/rpcsec)
283 */
284 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
285 extern void sec_clnt_freeh(AUTH *);
286 extern void sec_clnt_freeinfo(struct sec_data *);
287
288 /*
289 * used in mount policy
290 */
291 extern ts_label_t *getflabel_cipso(vfs_t *);
292
293 /*
294 * EIO or EINTR are not recoverable errors.
295 */
296 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
297
298 #ifdef DEBUG
299 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n"
300 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
301 #else
302 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n"
303 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
304 #endif
305 /*
306 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
307 */
308 static int
309 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
310 struct chtab **chp, struct nfs_clnt *nfscl)
311 {
312 struct chhead *ch, *newch;
313 struct chhead **plistp;
314 struct chtab *cp;
315 int error;
316 k_sigset_t smask;
317
318 if (newcl == NULL || chp == NULL || ci == NULL)
319 return (EINVAL);
320
321 *newcl = NULL;
322 *chp = NULL;
323
324 /*
325 * Find an unused handle or create one
326 */
327 newch = NULL;
328 nfscl->nfscl_stat.clgets.value.ui64++;
329 top:
330 /*
331 * Find the correct entry in the cache to check for free
332 * client handles. The search is based on the RPC program
333 * number, program version number, dev_t for the transport
334 * device, and the protocol family.
335 */
336 mutex_enter(&nfscl->nfscl_chtable_lock);
337 plistp = &nfscl->nfscl_chtable;
338 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
339 if (ch->ch_prog == ci->cl_prog &&
340 ch->ch_vers == ci->cl_vers &&
341 ch->ch_dev == svp->sv_knconf->knc_rdev &&
342 (strcmp(ch->ch_protofmly,
343 svp->sv_knconf->knc_protofmly) == 0))
344 break;
345 plistp = &ch->ch_next;
346 }
347
348 /*
349 * If we didn't find a cache entry for this quadruple, then
350 * create one. If we don't have one already preallocated,
351 * then drop the cache lock, create one, and then start over.
352 * If we did have a preallocated entry, then just add it to
353 * the front of the list.
354 */
355 if (ch == NULL) {
356 if (newch == NULL) {
357 mutex_exit(&nfscl->nfscl_chtable_lock);
358 newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
359 newch->ch_timesused = 0;
360 newch->ch_prog = ci->cl_prog;
361 newch->ch_vers = ci->cl_vers;
362 newch->ch_dev = svp->sv_knconf->knc_rdev;
363 newch->ch_protofmly = kmem_alloc(
364 strlen(svp->sv_knconf->knc_protofmly) + 1,
365 KM_SLEEP);
366 (void) strcpy(newch->ch_protofmly,
367 svp->sv_knconf->knc_protofmly);
368 newch->ch_list = NULL;
369 goto top;
370 }
371 ch = newch;
372 newch = NULL;
373 ch->ch_next = nfscl->nfscl_chtable;
374 nfscl->nfscl_chtable = ch;
375 /*
376 * We found a cache entry, but if it isn't on the front of the
377 * list, then move it to the front of the list to try to take
378 * advantage of locality of operations.
379 */
380 } else if (ch != nfscl->nfscl_chtable) {
381 *plistp = ch->ch_next;
382 ch->ch_next = nfscl->nfscl_chtable;
383 nfscl->nfscl_chtable = ch;
384 }
385
386 /*
387 * If there was a free client handle cached, then remove it
388 * from the list, init it, and use it.
389 */
390 if (ch->ch_list != NULL) {
391 cp = ch->ch_list;
392 ch->ch_list = cp->ch_list;
393 mutex_exit(&nfscl->nfscl_chtable_lock);
394 if (newch != NULL) {
395 kmem_free(newch->ch_protofmly,
396 strlen(newch->ch_protofmly) + 1);
397 kmem_free(newch, sizeof (*newch));
398 }
399 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
400 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
401 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
402 &cp->ch_client->cl_auth);
403 if (error || cp->ch_client->cl_auth == NULL) {
404 CLNT_DESTROY(cp->ch_client);
405 kmem_cache_free(chtab_cache, cp);
406 return ((error != 0) ? error : EINTR);
407 }
408 ch->ch_timesused++;
409 *newcl = cp->ch_client;
410 *chp = cp;
411 return (0);
412 }
413
414 /*
415 * There weren't any free client handles which fit, so allocate
416 * a new one and use that.
417 */
418 #ifdef DEBUG
419 atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
420 #endif
421 mutex_exit(&nfscl->nfscl_chtable_lock);
422
423 nfscl->nfscl_stat.cltoomany.value.ui64++;
424 if (newch != NULL) {
425 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
426 kmem_free(newch, sizeof (*newch));
427 }
428
429 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
430 cp->ch_head = ch;
431
432 sigintr(&smask, (int)ci->cl_flags & MI_INT);
433 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
434 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
435 sigunintr(&smask);
436
437 if (error != 0) {
438 kmem_cache_free(chtab_cache, cp);
439 #ifdef DEBUG
440 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
441 #endif
442 /*
443 * Warning is unnecessary if error is EINTR.
444 */
445 if (error != EINTR) {
446 nfs_cmn_err(error, CE_WARN,
447 "clget: couldn't create handle: %m\n");
448 }
449 return (error);
450 }
451 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
452 auth_destroy(cp->ch_client->cl_auth);
453 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
454 &cp->ch_client->cl_auth);
455 if (error || cp->ch_client->cl_auth == NULL) {
456 CLNT_DESTROY(cp->ch_client);
457 kmem_cache_free(chtab_cache, cp);
458 #ifdef DEBUG
459 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
460 #endif
461 return ((error != 0) ? error : EINTR);
462 }
463 ch->ch_timesused++;
464 *newcl = cp->ch_client;
465 ASSERT(cp->ch_client->cl_nosignal == FALSE);
466 *chp = cp;
467 return (0);
468 }
469
470 int
471 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
472 struct chtab **chp)
473 {
474 struct nfs_clnt *nfscl;
475
476 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
477 ASSERT(nfscl != NULL);
478
479 return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
480 }
481
482 static int
483 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
484 struct chtab **chp, struct nfs_clnt *nfscl)
485 {
486 clinfo_t ci;
487 int error;
488
489 /*
490 * Set read buffer size to rsize
491 * and add room for RPC headers.
492 */
493 ci.cl_readsize = mi->mi_tsize;
494 if (ci.cl_readsize != 0)
495 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
496
497 /*
498 * If this is a soft mount and the server is down, just try once,
499 * i.e., do not retransmit.
500 */
501 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
502 ci.cl_retrans = 0;
503 else
504 ci.cl_retrans = mi->mi_retrans;
505
506 ci.cl_prog = NFS_ACL_PROGRAM;
507 ci.cl_vers = mi->mi_vers;
508 ci.cl_flags = mi->mi_flags;
509
510 /*
511 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
512 * security flavor, the client tries to establish a security context
513 * by contacting the server. If the connection is timed out or reset,
514 * e.g. server reboot, we will try again.
515 */
516 do {
517 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
518
519 if (error == 0)
520 break;
521
522 /*
523 * For forced unmount or zone shutdown, bail out, no retry.
524 */
525 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
526 error = EIO;
527 break;
528 }
529
530 /* do not retry for softmount */
531 if (!(mi->mi_flags & MI_HARD))
532 break;
533
534 /* let the caller deal with the failover case */
535 if (FAILOVER_MOUNT(mi))
536 break;
537
538 } while (error == ETIMEDOUT || error == ECONNRESET);
539
540 return (error);
541 }
542
543 static int
544 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
545 struct chtab **chp, struct nfs_clnt *nfscl)
546 {
547 clinfo_t ci;
548 int error;
549
550 /*
551 * Set read buffer size to rsize
552 * and add room for RPC headers.
553 */
554 ci.cl_readsize = mi->mi_tsize;
555 if (ci.cl_readsize != 0)
556 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
557
558 /*
559 * If this is a soft mount and the server is down, just try once,
560 * i.e., do not retransmit.
561 */
562 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
563 ci.cl_retrans = 0;
564 else
565 ci.cl_retrans = mi->mi_retrans;
566
567 ci.cl_prog = mi->mi_prog;
568 ci.cl_vers = mi->mi_vers;
569 ci.cl_flags = mi->mi_flags;
570
571 /*
572 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
573 * security flavor, the client tries to establish a security context
574 * by contacting the server. If the connection is timed out or reset,
575 * e.g. server reboot, we will try again.
576 */
577 do {
578 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
579
580 if (error == 0)
581 break;
582
583 /*
584 * For forced unmount or zone shutdown, bail out, no retry.
585 */
586 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
587 error = EIO;
588 break;
589 }
590
591 /* do not retry for softmount */
592 if (!(mi->mi_flags & MI_HARD))
593 break;
594
595 /* let the caller deal with the failover case */
596 if (FAILOVER_MOUNT(mi))
597 break;
598
599 } while (error == ETIMEDOUT || error == ECONNRESET);
600
601 return (error);
602 }
603
604 static void
605 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
606 {
607 if (cl->cl_auth != NULL) {
608 sec_clnt_freeh(cl->cl_auth);
609 cl->cl_auth = NULL;
610 }
611
612 /*
613 * Timestamp this cache entry so that we know when it was last
614 * used.
615 */
616 cp->ch_freed = gethrestime_sec();
617
618 /*
619 * Add the free client handle to the front of the list.
620 * This way, the list will be sorted in youngest to oldest
621 * order.
622 */
623 mutex_enter(&nfscl->nfscl_chtable_lock);
624 cp->ch_list = cp->ch_head->ch_list;
625 cp->ch_head->ch_list = cp;
626 mutex_exit(&nfscl->nfscl_chtable_lock);
627 }
628
629 void
630 clfree(CLIENT *cl, struct chtab *cp)
631 {
632 struct nfs_clnt *nfscl;
633
634 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
635 ASSERT(nfscl != NULL);
636
637 clfree_impl(cl, cp, nfscl);
638 }
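/*
 * The pair above is normally used like this (a minimal sketch, never
 * compiled; rfscall() below is the real caller, with retry, failover and
 * signal handling -- "mi", "cr", "which" and the XDR arguments are assumed
 * to come from the surrounding call, as they do in rfscall()):
 */
#if 0	/* illustrative only */
	CLIENT *client;
	struct chtab *ch;
	clinfo_t ci;
	struct timeval wait = { 30, 0 };
	enum clnt_stat status;
	int error;

	ci.cl_prog = mi->mi_prog;
	ci.cl_vers = mi->mi_vers;
	ci.cl_readsize = mi->mi_tsize;
	ci.cl_retrans = mi->mi_retrans;
	ci.cl_flags = mi->mi_flags;

	error = clget(&ci, mi->mi_curr_serv, cr, &client, &ch);
	if (error == 0) {
		status = CLNT_CALL(client, which, xdrargs, argsp,
		    xdrres, resp, wait);
		clfree(client, ch);	/* return the handle to its cache */
	}
#endif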
639
640 #define CL_HOLDTIME 60 /* time to hold client handles */
641
642 static void
643 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
644 {
645 struct chhead *ch;
646 struct chtab *cp; /* list of objects that can be reclaimed */
647 struct chtab *cpe;
648 struct chtab *cpl;
649 struct chtab **cpp;
650 #ifdef DEBUG
651 int n = 0;
652 #endif
653
654 /*
655 * Need to reclaim some memory, so step through the cache
656 * looking through the lists for entries which can be freed.
657 */
658 cp = NULL;
659
660 mutex_enter(&nfscl->nfscl_chtable_lock);
661
662 /*
663 * Here we step through each non-NULL quadruple and start to
664 * construct the reclaim list pointed to by cp. Note that
665 * cp will contain all eligible chtab entries. When this traversal
666 * completes, chtab entries from the last quadruple will be at the
667 * front of cp and entries from previously inspected quadruples have
668 * been appended to the rear of cp.
669 */
670 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
671 if (ch->ch_list == NULL)
672 continue;
673 /*
674 * Search each list for entries older than
675 * cl_holdtime seconds. The lists are maintained
676 * in youngest to oldest order so that when the
677 * first entry is found which is old enough, then
678 * all of the rest of the entries on the list will
679 * be old enough as well.
680 */
681 cpl = ch->ch_list;
682 cpp = &ch->ch_list;
683 while (cpl != NULL &&
684 cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
685 cpp = &cpl->ch_list;
686 cpl = cpl->ch_list;
687 }
688 if (cpl != NULL) {
689 *cpp = NULL;
690 if (cp != NULL) {
691 cpe = cpl;
692 while (cpe->ch_list != NULL)
693 cpe = cpe->ch_list;
694 cpe->ch_list = cp;
695 }
696 cp = cpl;
697 }
698 }
699
700 mutex_exit(&nfscl->nfscl_chtable_lock);
701
702 /*
703 * If cp is empty, then there is nothing to reclaim here.
704 */
705 if (cp == NULL)
706 return;
707
708 /*
709 * Step through the list of entries to free, destroying each client
710 * handle and kmem_free'ing the memory for each entry.
711 */
712 while (cp != NULL) {
713 #ifdef DEBUG
714 n++;
715 #endif
716 CLNT_DESTROY(cp->ch_client);
717 cpl = cp->ch_list;
718 kmem_cache_free(chtab_cache, cp);
719 cp = cpl;
720 }
721
722 #ifdef DEBUG
723 /*
724 * Update clalloc so that nfsstat shows the current number
725 * of allocated client handles.
726 */
727 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
728 #endif
729 }
730
731 /* ARGSUSED */
732 static void
733 clreclaim(void *all)
734 {
735 struct nfs_clnt *nfscl;
736
737 #ifdef DEBUG
738 clstat_debug.clreclaim.value.ui64++;
739 #endif
740 /*
741 * The system is low on memory; go through and try to reclaim some from
742 * every zone on the system.
743 */
744 mutex_enter(&nfs_clnt_list_lock);
745 nfscl = list_head(&nfs_clnt_list);
746 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
747 clreclaim_zone(nfscl, CL_HOLDTIME);
748 mutex_exit(&nfs_clnt_list_lock);
749 }
750
751 /*
752 * Minimum time-out values indexed by call type
753 * These units are in "eighths" of a second to avoid multiplies
754 */
755 static unsigned int minimum_timeo[] = {
756 6, 7, 10
757 };
758
759 /*
760 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
761 */
762 #define MAXTIMO (20*hz)
763 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
764 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
765
766 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
767 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
768 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
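/*
 * Worked example (assuming hz == 100): MAXTIMO is 2000 ticks, so a timeo
 * of 100 ticks backs off as 100 -> 200 -> 400 -> 800 -> 1600 -> 2000 and
 * then stays pinned at MAXTIMO.  nfs_feedback() below applies a similar
 * idea to transfer sizes: on retransmission they are halved (but never
 * below MIN_NFS_TSIZE) unless the current retransmit timeout is already
 * under REDUCE_NFS_TIME, and on clean responses they grow back in
 * MIN_NFS_TSIZE steps while the smoothed RTT stays under INCREASE_NFS_TIME.
 */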
769
770 /*
771 * Function called when rfscall notices that we have been
772 * re-transmitting, or when we get a response without retransmissions.
773 * Return 1 if the transfer size was adjusted down - 0 if no change.
774 */
775 static int
776 nfs_feedback(int flag, int which, mntinfo_t *mi)
777 {
778 int kind;
779 int r = 0;
780
781 mutex_enter(&mi->mi_lock);
782 if (flag == FEEDBACK_REXMIT1) {
783 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
784 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
785 goto done;
786 if (mi->mi_curread > MIN_NFS_TSIZE) {
787 mi->mi_curread /= 2;
788 if (mi->mi_curread < MIN_NFS_TSIZE)
789 mi->mi_curread = MIN_NFS_TSIZE;
790 r = 1;
791 }
792
793 if (mi->mi_curwrite > MIN_NFS_TSIZE) {
794 mi->mi_curwrite /= 2;
795 if (mi->mi_curwrite < MIN_NFS_TSIZE)
796 mi->mi_curwrite = MIN_NFS_TSIZE;
797 r = 1;
798 }
799 } else if (flag == FEEDBACK_OK) {
800 kind = mi->mi_timer_type[which];
801 if (kind == 0 ||
802 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
803 goto done;
804 if (kind == 1) {
805 if (mi->mi_curread >= mi->mi_tsize)
806 goto done;
807 mi->mi_curread += MIN_NFS_TSIZE;
808 if (mi->mi_curread > mi->mi_tsize/2)
809 mi->mi_curread = mi->mi_tsize;
810 } else if (kind == 2) {
811 if (mi->mi_curwrite >= mi->mi_stsize)
812 goto done;
813 mi->mi_curwrite += MIN_NFS_TSIZE;
814 if (mi->mi_curwrite > mi->mi_stsize/2)
815 mi->mi_curwrite = mi->mi_stsize;
816 }
817 }
818 done:
819 mutex_exit(&mi->mi_lock);
820 return (r);
821 }
822
823 #ifdef DEBUG
824 static int rfs2call_hits = 0;
825 static int rfs2call_misses = 0;
826 #endif
827
828 int
829 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
830 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
831 enum nfsstat *statusp, int flags, failinfo_t *fi)
832 {
833 int rpcerror;
834 enum clnt_stat rpc_status;
835
836 ASSERT(statusp != NULL);
837
838 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
839 cr, douprintf, &rpc_status, flags, fi);
840 if (!rpcerror) {
841 /*
842 * See crnetadjust() for comments.
843 */
844 if (*statusp == NFSERR_ACCES &&
845 (cr = crnetadjust(cr)) != NULL) {
846 #ifdef DEBUG
847 rfs2call_hits++;
848 #endif
849 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
850 resp, cr, douprintf, NULL, flags, fi);
851 crfree(cr);
852 #ifdef DEBUG
853 if (*statusp == NFSERR_ACCES)
854 rfs2call_misses++;
855 #endif
856 }
857 } else if (rpc_status == RPC_PROCUNAVAIL) {
858 *statusp = NFSERR_OPNOTSUPP;
859 rpcerror = 0;
860 }
861
862 return (rpcerror);
863 }
864
865 #define NFS3_JUKEBOX_DELAY 10 * hz
866
867 static clock_t nfs3_jukebox_delay = 0;
868
869 #ifdef DEBUG
870 static int rfs3call_hits = 0;
871 static int rfs3call_misses = 0;
872 #endif
873
874 int
875 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
876 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
877 nfsstat3 *statusp, int flags, failinfo_t *fi)
878 {
879 int rpcerror;
880 int user_informed;
881
882 user_informed = 0;
883 do {
884 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
885 cr, douprintf, NULL, flags, fi);
886 if (!rpcerror) {
887 cred_t *crr;
888 if (*statusp == NFS3ERR_JUKEBOX) {
889 if (ttoproc(curthread) == &p0) {
890 rpcerror = EAGAIN;
891 break;
892 }
893 if (!user_informed) {
894 user_informed = 1;
895 uprintf(
896 "file temporarily unavailable on the server, retrying...\n");
897 }
898 delay(nfs3_jukebox_delay);
899 }
900 /*
901 * See crnetadjust() for comments.
902 */
903 else if (*statusp == NFS3ERR_ACCES &&
904 (crr = crnetadjust(cr)) != NULL) {
905 #ifdef DEBUG
906 rfs3call_hits++;
907 #endif
908 rpcerror = rfscall(mi, which, xdrargs, argsp,
909 xdrres, resp, crr, douprintf,
910 NULL, flags, fi);
911
912 crfree(crr);
913 #ifdef DEBUG
914 if (*statusp == NFS3ERR_ACCES)
915 rfs3call_misses++;
916 #endif
917 }
918 }
919 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
920
921 return (rpcerror);
922 }
923
924 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
925 #define INC_READERS(mi) { \
926 mi->mi_readers++; \
927 }
928 #define DEC_READERS(mi) { \
929 mi->mi_readers--; \
930 if (mi->mi_readers == 0) \
931 cv_broadcast(&mi->mi_failover_cv); \
932 }
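/*
 * INC_READERS/DEC_READERS are always used under mi_lock; rfscall() and
 * aclcall() below bracket the failover-sensitive window roughly like this
 * (abbreviated):
 *
 *	mutex_enter(&mi->mi_lock);
 *	INC_READERS(mi);
 *	mutex_exit(&mi->mi_lock);
 *	... remap the filehandle and obtain a client handle ...
 *	mutex_enter(&mi->mi_lock);
 *	DEC_READERS(mi);
 *	mutex_exit(&mi->mi_lock);
 *
 * The broadcast on mi_failover_cv lets the failover code, which waits for
 * mi_readers to drain before switching servers, make progress.
 */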
933
934 static int
935 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
936 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
937 enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
938 {
939 CLIENT *client;
940 struct chtab *ch;
941 cred_t *cr = icr;
942 enum clnt_stat status;
943 struct rpc_err rpcerr, rpcerr_tmp;
944 struct timeval wait;
945 int timeo; /* in units of hz */
946 int my_rsize, my_wsize;
947 bool_t tryagain;
948 bool_t cred_cloned = FALSE;
949 k_sigset_t smask;
950 servinfo_t *svp;
951 struct nfs_clnt *nfscl;
952 zoneid_t zoneid = getzoneid();
953 char *msg;
954 #ifdef DEBUG
955 char *bufp;
956 #endif
957
958
959 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
960 "rfscall_start:which %d mi %p", which, mi);
961
962 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
963 ASSERT(nfscl != NULL);
964
965 nfscl->nfscl_stat.calls.value.ui64++;
966 mi->mi_reqs[which].value.ui64++;
967
968 rpcerr.re_status = RPC_SUCCESS;
969
970 /*
971 * In case of forced unmount or zone shutdown, return EIO.
972 */
973
974 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
975 rpcerr.re_status = RPC_FAILED;
976 rpcerr.re_errno = EIO;
977 return (rpcerr.re_errno);
978 }
979
980 /*
981 * Remember the transfer sizes in case
982 * nfs_feedback changes them underneath us.
983 */
984 my_rsize = mi->mi_curread;
985 my_wsize = mi->mi_curwrite;
986
987 /*
988 * NFS client failover support
989 *
990 * If this rnode is not in sync with the current server (VALID_FH),
991 * we'd like to do a remap to get in sync. We can be interrupted
992 * in failover_remap(), and if so we'll bail. Otherwise, we'll
993 * use the best info we have to try the RPC. Part of that is
994 * unconditionally updating the filehandle copy kept for V3.
995 *
996 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
997 * rw_enter(); we're trying to keep the current server from being
998 * changed on us until we're done with the remapping and have a
999 * matching client handle. We don't want to send a filehandle
1000 * to the wrong host.
1001 */
1002 failoverretry:
1003 if (FAILOVER_MOUNT(mi)) {
1004 mutex_enter(&mi->mi_lock);
1005 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1006 if (failover_wait(mi)) {
1007 mutex_exit(&mi->mi_lock);
1008 return (EINTR);
1009 }
1010 }
1011 INC_READERS(mi);
1012 mutex_exit(&mi->mi_lock);
1013 if (fi) {
1014 if (!VALID_FH(fi) &&
1015 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1016 int remaperr;
1017
1018 svp = mi->mi_curr_serv;
1019 remaperr = failover_remap(fi);
1020 if (remaperr != 0) {
1021 #ifdef DEBUG
1022 if (remaperr != EINTR)
1023 nfs_cmn_err(remaperr, CE_WARN,
1024 "rfscall couldn't failover: %m");
1025 #endif
1026 mutex_enter(&mi->mi_lock);
1027 DEC_READERS(mi);
1028 mutex_exit(&mi->mi_lock);
1029 /*
1030 * If failover_remap returns ETIMEDOUT
1031 * and the filesystem is hard mounted
1032 * we have to retry the call with a new
1033 * server.
1034 */
1035 if ((mi->mi_flags & MI_HARD) &&
1036 IS_RECOVERABLE_ERROR(remaperr)) {
1037 if (svp == mi->mi_curr_serv)
1038 failover_newserver(mi);
1039 rpcerr.re_status = RPC_SUCCESS;
1040 goto failoverretry;
1041 }
1042 rpcerr.re_errno = remaperr;
1043 return (remaperr);
1044 }
1045 }
1046 if (fi->fhp && fi->copyproc)
1047 (*fi->copyproc)(fi->fhp, fi->vp);
1048 }
1049 }
1050
1051 /* For TSOL, use a new cred which has net_mac_aware flag */
1052 if (!cred_cloned && is_system_labeled()) {
1053 cred_cloned = TRUE;
1054 cr = crdup(icr);
1055 (void) setpflags(NET_MAC_AWARE, 1, cr);
1056 }
1057
1058 /*
1059 * clget() calls clnt_tli_kinit() which clears the xid, so we
1060 * are guaranteed to reprocess the retry as a new request.
1061 */
1062 svp = mi->mi_curr_serv;
1063 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1064
1065 if (FAILOVER_MOUNT(mi)) {
1066 mutex_enter(&mi->mi_lock);
1067 DEC_READERS(mi);
1068 mutex_exit(&mi->mi_lock);
1069
1070 if ((rpcerr.re_errno == ETIMEDOUT ||
1071 rpcerr.re_errno == ECONNRESET) &&
1072 failover_safe(fi)) {
1073 if (svp == mi->mi_curr_serv)
1074 failover_newserver(mi);
1075 goto failoverretry;
1076 }
1077 }
1078 if (rpcerr.re_errno != 0)
1079 return (rpcerr.re_errno);
1080
1081 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1082 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1083 timeo = (mi->mi_timeo * hz) / 10;
1084 } else {
1085 mutex_enter(&mi->mi_lock);
1086 timeo = CLNT_SETTIMERS(client,
1087 &(mi->mi_timers[mi->mi_timer_type[which]]),
1088 &(mi->mi_timers[NFS_CALLTYPES]),
1089 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1090 (void (*)())NULL, (caddr_t)mi, 0);
1091 mutex_exit(&mi->mi_lock);
1092 }
1093
1094 /*
1095 * If hard mounted fs, retry call forever unless hard error occurs.
1096 */
1097 do {
1098 tryagain = FALSE;
1099
1100 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1101 status = RPC_FAILED;
1102 rpcerr.re_status = RPC_FAILED;
1103 rpcerr.re_errno = EIO;
1104 break;
1105 }
1106
1107 TICK_TO_TIMEVAL(timeo, &wait);
1108
1109 /*
1110 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1111 * and SIGTERM. (Preserving the existing masks).
1112 * Mask out SIGINT if mount option nointr is specified.
1113 */
1114 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1115 if (!(mi->mi_flags & MI_INT))
1116 client->cl_nosignal = TRUE;
1117
1118 /*
1119 * If there is a current signal, then don't bother
1120 * even trying to send out the request because we
1121 * won't be able to block waiting for the response.
1122 * Simply assume RPC_INTR and get on with it.
1123 */
1124 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1125 status = RPC_INTR;
1126 else {
1127 status = CLNT_CALL(client, which, xdrargs, argsp,
1128 xdrres, resp, wait);
1129 }
1130
1131 if (!(mi->mi_flags & MI_INT))
1132 client->cl_nosignal = FALSE;
1133 /*
1134 * restore original signal mask
1135 */
1136 sigunintr(&smask);
1137
1138 switch (status) {
1139 case RPC_SUCCESS:
1140 if ((mi->mi_flags & MI_DYNAMIC) &&
1141 mi->mi_timer_type[which] != 0 &&
1142 (mi->mi_curread != my_rsize ||
1143 mi->mi_curwrite != my_wsize))
1144 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1145 break;
1146
1147 case RPC_INTR:
1148 /*
1149 * There is no way to recover from this error,
1150 * even if mount option nointr is specified.
1151 * SIGKILL, for example, cannot be blocked.
1152 */
1153 rpcerr.re_status = RPC_INTR;
1154 rpcerr.re_errno = EINTR;
1155 break;
1156
1157 case RPC_UDERROR:
1158 /*
1159 * If the NFS server is local (vold) and
1160 * it goes away then we get RPC_UDERROR.
1161 * This is a retryable error, so we would
1162 * loop; check to see whether the specific
1163 * error was ECONNRESET, indicating that the
1164 * target did not exist at all. If so,
1165 * return with RPC_PROGUNAVAIL and
1166 * ECONNRESET to indicate why.
1167 */
1168 CLNT_GETERR(client, &rpcerr);
1169 if (rpcerr.re_errno == ECONNRESET) {
1170 rpcerr.re_status = RPC_PROGUNAVAIL;
1171 rpcerr.re_errno = ECONNRESET;
1172 break;
1173 }
1174 /*FALLTHROUGH*/
1175
1176 default: /* probably RPC_TIMEDOUT */
1177 if (IS_UNRECOVERABLE_RPC(status))
1178 break;
1179
1180 /*
1181 * increment server not responding count
1182 */
1183 mutex_enter(&mi->mi_lock);
1184 mi->mi_noresponse++;
1185 mutex_exit(&mi->mi_lock);
1186 #ifdef DEBUG
1187 nfscl->nfscl_stat.noresponse.value.ui64++;
1188 #endif
1189
1190 if (!(mi->mi_flags & MI_HARD)) {
1191 if (!(mi->mi_flags & MI_SEMISOFT) ||
1192 (mi->mi_ss_call_type[which] == 0))
1193 break;
1194 }
1195
1196 /*
1197 * The call is in progress (over COTS).
1198 * Try the CLNT_CALL again, but don't
1199 * print a noisy error message.
1200 */
1201 if (status == RPC_INPROGRESS) {
1202 tryagain = TRUE;
1203 break;
1204 }
1205
1206 if (flags & RFSCALL_SOFT)
1207 break;
1208
1209 /*
1210 * On zone shutdown, just move on.
1211 */
1212 if (zone_status_get(curproc->p_zone) >=
1213 ZONE_IS_SHUTTING_DOWN) {
1214 rpcerr.re_status = RPC_FAILED;
1215 rpcerr.re_errno = EIO;
1216 break;
1217 }
1218
1219 /*
1220 * NFS client failover support
1221 *
1222 * If the current server just failed us, we'll
1223 * start the process of finding a new server.
1224 * After that, we can just retry.
1225 */
1226 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1227 if (svp == mi->mi_curr_serv)
1228 failover_newserver(mi);
1229 clfree_impl(client, ch, nfscl);
1230 goto failoverretry;
1231 }
1232
1233 tryagain = TRUE;
1234 timeo = backoff(timeo);
1235
1236 CLNT_GETERR(client, &rpcerr_tmp);
1237 if ((status == RPC_CANTSEND) &&
1238 (rpcerr_tmp.re_errno == ENOBUFS))
1239 msg = SRV_QFULL_MSG;
1240 else
1241 msg = SRV_NOTRESP_MSG;
1242
1243 mutex_enter(&mi->mi_lock);
1244 if (!(mi->mi_flags & MI_PRINTED)) {
1245 mi->mi_flags |= MI_PRINTED;
1246 mutex_exit(&mi->mi_lock);
1247 #ifdef DEBUG
1248 zprintf(zoneid, msg, mi->mi_vers,
1249 svp->sv_hostname);
1250 #else
1251 zprintf(zoneid, msg, svp->sv_hostname);
1252 #endif
1253 } else
1254 mutex_exit(&mi->mi_lock);
1255 if (*douprintf && nfs_has_ctty()) {
1256 *douprintf = 0;
1257 if (!(mi->mi_flags & MI_NOPRINT))
1258 #ifdef DEBUG
1259 uprintf(msg, mi->mi_vers,
1260 svp->sv_hostname);
1261 #else
1262 uprintf(msg, svp->sv_hostname);
1263 #endif
1264 }
1265
1266 /*
1267 * If doing dynamic adjustment of transfer
1268 * size and if it's a read or write call
1269 * and if the transfer size changed while
1270 * retransmitting or if the feedback routine
1271 * changed the transfer size,
1272 * then exit rfscall so that the transfer
1273 * size can be adjusted at the vnops level.
1274 */
1275 if ((mi->mi_flags & MI_DYNAMIC) &&
1276 mi->mi_timer_type[which] != 0 &&
1277 (mi->mi_curread != my_rsize ||
1278 mi->mi_curwrite != my_wsize ||
1279 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1280 /*
1281 * On read or write calls, return
1282 * back to the vnode ops level if
1283 * the transfer size changed.
1284 */
1285 clfree_impl(client, ch, nfscl);
1286 if (cred_cloned)
1287 crfree(cr);
1288 return (ENFS_TRYAGAIN);
1289 }
1290 }
1291 } while (tryagain);
1292
1293 if (status != RPC_SUCCESS) {
1294 /*
1295 * Let soft mounts use the timed out message.
1296 */
1297 if (status == RPC_INPROGRESS)
1298 status = RPC_TIMEDOUT;
1299 nfscl->nfscl_stat.badcalls.value.ui64++;
1300 if (status != RPC_INTR) {
1301 mutex_enter(&mi->mi_lock);
1302 mi->mi_flags |= MI_DOWN;
1303 mutex_exit(&mi->mi_lock);
1304 CLNT_GETERR(client, &rpcerr);
1305 #ifdef DEBUG
1306 bufp = clnt_sperror(client, svp->sv_hostname);
1307 zprintf(zoneid, "NFS%d %s failed for %s\n",
1308 mi->mi_vers, mi->mi_rfsnames[which], bufp);
1309 if (nfs_has_ctty()) {
1310 if (!(mi->mi_flags & MI_NOPRINT)) {
1311 uprintf("NFS%d %s failed for %s\n",
1312 mi->mi_vers, mi->mi_rfsnames[which],
1313 bufp);
1314 }
1315 }
1316 kmem_free(bufp, MAXPATHLEN);
1317 #else
1318 zprintf(zoneid,
1319 "NFS %s failed for server %s: error %d (%s)\n",
1320 mi->mi_rfsnames[which], svp->sv_hostname,
1321 status, clnt_sperrno(status));
1322 if (nfs_has_ctty()) {
1323 if (!(mi->mi_flags & MI_NOPRINT)) {
1324 uprintf(
1325 "NFS %s failed for server %s: error %d (%s)\n",
1326 mi->mi_rfsnames[which],
1327 svp->sv_hostname, status,
1328 clnt_sperrno(status));
1329 }
1330 }
1331 #endif
1332 /*
1333 * when CLNT_CALL() fails with RPC_AUTHERROR,
1334 * re_errno is set appropriately depending on
1335 * the authentication error
1336 */
1337 if (status == RPC_VERSMISMATCH ||
1338 status == RPC_PROGVERSMISMATCH)
1339 rpcerr.re_errno = EIO;
1340 }
1341 } else {
1342 /*
1343 * Test the value of mi_down and mi_printed without
1344 * holding the mi_lock mutex. If they are both zero,
1345 * then it is okay to skip the down and printed
1346 * processing. This saves on a mutex_enter and
1347 * mutex_exit pair for a normal, successful RPC.
1348 * This was just complete overhead.
1349 */
1350 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1351 mutex_enter(&mi->mi_lock);
1352 mi->mi_flags &= ~MI_DOWN;
1353 if (mi->mi_flags & MI_PRINTED) {
1354 mi->mi_flags &= ~MI_PRINTED;
1355 mutex_exit(&mi->mi_lock);
1356 #ifdef DEBUG
1357 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1358 zprintf(zoneid, "NFS%d server %s ok\n",
1359 mi->mi_vers, svp->sv_hostname);
1360 #else
1361 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362 zprintf(zoneid, "NFS server %s ok\n",
1363 svp->sv_hostname);
1364 #endif
1365 } else
1366 mutex_exit(&mi->mi_lock);
1367 }
1368
1369 if (*douprintf == 0) {
1370 if (!(mi->mi_flags & MI_NOPRINT))
1371 #ifdef DEBUG
1372 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1373 uprintf("NFS%d server %s ok\n",
1374 mi->mi_vers, svp->sv_hostname);
1375 #else
1376 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1377 uprintf("NFS server %s ok\n", svp->sv_hostname);
1378 #endif
1379 *douprintf = 1;
1380 }
1381 }
1382
1383 clfree_impl(client, ch, nfscl);
1384 if (cred_cloned)
1385 crfree(cr);
1386
1387 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1388
1389 if (rpc_status != NULL)
1390 *rpc_status = rpcerr.re_status;
1391
1392 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1393 rpcerr.re_errno);
1394
1395 return (rpcerr.re_errno);
1396 }
1397
1398 #ifdef DEBUG
1399 static int acl2call_hits = 0;
1400 static int acl2call_misses = 0;
1401 #endif
1402
1403 int
1404 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1405 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1406 enum nfsstat *statusp, int flags, failinfo_t *fi)
1407 {
1408 int rpcerror;
1409
1410 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1411 cr, douprintf, flags, fi);
1412 if (!rpcerror) {
1413 /*
1414 * See comments with crnetadjust().
1415 */
1416 if (*statusp == NFSERR_ACCES &&
1417 (cr = crnetadjust(cr)) != NULL) {
1418 #ifdef DEBUG
1419 acl2call_hits++;
1420 #endif
1421 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1422 resp, cr, douprintf, flags, fi);
1423 crfree(cr);
1424 #ifdef DEBUG
1425 if (*statusp == NFSERR_ACCES)
1426 acl2call_misses++;
1427 #endif
1428 }
1429 }
1430
1431 return (rpcerror);
1432 }
1433
1434 #ifdef DEBUG
1435 static int acl3call_hits = 0;
1436 static int acl3call_misses = 0;
1437 #endif
1438
1439 int
1440 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1441 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1442 nfsstat3 *statusp, int flags, failinfo_t *fi)
1443 {
1444 int rpcerror;
1445 int user_informed;
1446
1447 user_informed = 0;
1448
1449 do {
1450 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1451 cr, douprintf, flags, fi);
1452 if (!rpcerror) {
1453 cred_t *crr;
1454 if (*statusp == NFS3ERR_JUKEBOX) {
1455 if (!user_informed) {
1456 user_informed = 1;
1457 uprintf(
1458 "file temporarily unavailable on the server, retrying...\n");
1459 }
1460 delay(nfs3_jukebox_delay);
1461 }
1462 /*
1463 * See crnetadjust() for comments.
1464 */
1465 else if (*statusp == NFS3ERR_ACCES &&
1466 (crr = crnetadjust(cr)) != NULL) {
1467 #ifdef DEBUG
1468 acl3call_hits++;
1469 #endif
1470 rpcerror = aclcall(mi, which, xdrargs, argsp,
1471 xdrres, resp, crr, douprintf, flags, fi);
1472
1473 crfree(crr);
1474 #ifdef DEBUG
1475 if (*statusp == NFS3ERR_ACCES)
1476 acl3call_misses++;
1477 #endif
1478 }
1479 }
1480 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1481
1482 return (rpcerror);
1483 }
1484
1485 static int
1486 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1487 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1488 int flags, failinfo_t *fi)
1489 {
1490 CLIENT *client;
1491 struct chtab *ch;
1492 cred_t *cr = icr;
1493 bool_t cred_cloned = FALSE;
1494 enum clnt_stat status;
1495 struct rpc_err rpcerr;
1496 struct timeval wait;
1497 int timeo; /* in units of hz */
1498 #if 0 /* notyet */
1499 int my_rsize, my_wsize;
1500 #endif
1501 bool_t tryagain;
1502 k_sigset_t smask;
1503 servinfo_t *svp;
1504 struct nfs_clnt *nfscl;
1505 zoneid_t zoneid = getzoneid();
1506 #ifdef DEBUG
1507 char *bufp;
1508 #endif
1509
1510 #if 0 /* notyet */
1511 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1512 "rfscall_start:which %d mi %p", which, mi);
1513 #endif
1514
1515 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1516 ASSERT(nfscl != NULL);
1517
1518 nfscl->nfscl_stat.calls.value.ui64++;
1519 mi->mi_aclreqs[which].value.ui64++;
1520
1521 rpcerr.re_status = RPC_SUCCESS;
1522
1523 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1524 rpcerr.re_status = RPC_FAILED;
1525 rpcerr.re_errno = EIO;
1526 return (rpcerr.re_errno);
1527 }
1528
1529 #if 0 /* notyet */
1530 /*
1531 * Remember the transfer sizes in case
1532 * nfs_feedback changes them underneath us.
1533 */
1534 my_rsize = mi->mi_curread;
1535 my_wsize = mi->mi_curwrite;
1536 #endif
1537
1538 /*
1539 * NFS client failover support
1540 *
1541 * If this rnode is not in sync with the current server (VALID_FH),
1542 * we'd like to do a remap to get in sync. We can be interrupted
1543 * in failover_remap(), and if so we'll bail. Otherwise, we'll
1544 * use the best info we have to try the RPC. Part of that is
1545 * unconditionally updating the filehandle copy kept for V3.
1546 *
1547 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1548 * rw_enter(); we're trying to keep the current server from being
1549 * changed on us until we're done with the remapping and have a
1550 * matching client handle. We don't want to send a filehandle
1551 * to the wrong host.
1552 */
1553 failoverretry:
1554 if (FAILOVER_MOUNT(mi)) {
1555 mutex_enter(&mi->mi_lock);
1556 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1557 if (failover_wait(mi)) {
1558 mutex_exit(&mi->mi_lock);
1559 return (EINTR);
1560 }
1561 }
1562 INC_READERS(mi);
1563 mutex_exit(&mi->mi_lock);
1564 if (fi) {
1565 if (!VALID_FH(fi) &&
1566 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1567 int remaperr;
1568
1569 svp = mi->mi_curr_serv;
1570 remaperr = failover_remap(fi);
1571 if (remaperr != 0) {
1572 #ifdef DEBUG
1573 if (remaperr != EINTR)
1574 nfs_cmn_err(remaperr, CE_WARN,
1575 "aclcall couldn't failover: %m");
1576 #endif
1577 mutex_enter(&mi->mi_lock);
1578 DEC_READERS(mi);
1579 mutex_exit(&mi->mi_lock);
1580
1581 /*
1582 * If failover_remap returns ETIMEDOUT
1583 * and the filesystem is hard mounted
1584 * we have to retry the call with a new
1585 * server.
1586 */
1587 if ((mi->mi_flags & MI_HARD) &&
1588 IS_RECOVERABLE_ERROR(remaperr)) {
1589 if (svp == mi->mi_curr_serv)
1590 failover_newserver(mi);
1591 rpcerr.re_status = RPC_SUCCESS;
1592 goto failoverretry;
1593 }
1594 return (remaperr);
1595 }
1596 }
1597 if (fi->fhp && fi->copyproc)
1598 (*fi->copyproc)(fi->fhp, fi->vp);
1599 }
1600 }
1601
1602 /* For TSOL, use a new cred which has net_mac_aware flag */
1603 if (!cred_cloned && is_system_labeled()) {
1604 cred_cloned = TRUE;
1605 cr = crdup(icr);
1606 (void) setpflags(NET_MAC_AWARE, 1, cr);
1607 }
1608
1609 /*
1610 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1611 * are guaranteed to reprocess the retry as a new request.
1612 */
1613 svp = mi->mi_curr_serv;
1614 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1615 if (FAILOVER_MOUNT(mi)) {
1616 mutex_enter(&mi->mi_lock);
1617 DEC_READERS(mi);
1618 mutex_exit(&mi->mi_lock);
1619
1620 if ((rpcerr.re_errno == ETIMEDOUT ||
1621 rpcerr.re_errno == ECONNRESET) &&
1622 failover_safe(fi)) {
1623 if (svp == mi->mi_curr_serv)
1624 failover_newserver(mi);
1625 goto failoverretry;
1626 }
1627 }
1628 if (rpcerr.re_errno != 0) {
1629 if (cred_cloned)
1630 crfree(cr);
1631 return (rpcerr.re_errno);
1632 }
1633
1634 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1635 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1636 timeo = (mi->mi_timeo * hz) / 10;
1637 } else {
1638 mutex_enter(&mi->mi_lock);
1639 timeo = CLNT_SETTIMERS(client,
1640 &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1641 &(mi->mi_timers[NFS_CALLTYPES]),
1642 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1643 (void (*)()) 0, (caddr_t)mi, 0);
1644 mutex_exit(&mi->mi_lock);
1645 }
1646
1647 /*
1648 * If hard mounted fs, retry call forever unless hard error occurs.
1649 */
1650 do {
1651 tryagain = FALSE;
1652
1653 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1654 status = RPC_FAILED;
1655 rpcerr.re_status = RPC_FAILED;
1656 rpcerr.re_errno = EIO;
1657 break;
1658 }
1659
1660 TICK_TO_TIMEVAL(timeo, &wait);
1661
1662 /*
1663 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1664 * and SIGTERM. (Preserving the existing masks).
1665 * Mask out SIGINT if mount option nointr is specified.
1666 */
1667 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1668 if (!(mi->mi_flags & MI_INT))
1669 client->cl_nosignal = TRUE;
1670
1671 /*
1672 * If there is a current signal, then don't bother
1673 * even trying to send out the request because we
1674 * won't be able to block waiting for the response.
1675 * Simply assume RPC_INTR and get on with it.
1676 */
1677 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1678 status = RPC_INTR;
1679 else {
1680 status = CLNT_CALL(client, which, xdrargs, argsp,
1681 xdrres, resp, wait);
1682 }
1683
1684 if (!(mi->mi_flags & MI_INT))
1685 client->cl_nosignal = FALSE;
1686 /*
1687 * restore original signal mask
1688 */
1689 sigunintr(&smask);
1690
1691 switch (status) {
1692 case RPC_SUCCESS:
1693 #if 0 /* notyet */
1694 if ((mi->mi_flags & MI_DYNAMIC) &&
1695 mi->mi_timer_type[which] != 0 &&
1696 (mi->mi_curread != my_rsize ||
1697 mi->mi_curwrite != my_wsize))
1698 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1699 #endif
1700 break;
1701
1702 /*
1703 * Unfortunately, there are servers in the world which
1704 * are not coded correctly. They are not prepared to
1705 * handle RPC requests to the NFS port which are not
1706 * NFS requests. Thus, they may try to process the
1707 * NFS_ACL request as if it were an NFS request. This
1708 * does not work. Generally, an error will be generated
1709 * on the client because it will not be able to decode
1710 * the response from the server. However, it seems
1711 * possible that the server may not be able to decode
1712 * the arguments. Thus, the criterion for deciding
1713 * whether the server supports NFS_ACL is whether
1714 * the following RPC errors are returned from CLNT_CALL.
1715 */
1716 case RPC_CANTDECODERES:
1717 case RPC_PROGUNAVAIL:
1718 case RPC_CANTDECODEARGS:
1719 case RPC_PROGVERSMISMATCH:
1720 mutex_enter(&mi->mi_lock);
1721 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1722 mutex_exit(&mi->mi_lock);
1723 break;
1724
1725 /*
1726 * If the server supports NFS_ACL but not the new ops
1727 * for extended attributes, make sure we don't retry.
1728 */
1729 case RPC_PROCUNAVAIL:
1730 mutex_enter(&mi->mi_lock);
1731 mi->mi_flags &= ~MI_EXTATTR;
1732 mutex_exit(&mi->mi_lock);
1733 break;
1734
1735 case RPC_INTR:
1736 /*
1737 * There is no way to recover from this error,
1738 * even if mount option nointr is specified.
1739 * SIGKILL, for example, cannot be blocked.
1740 */
1741 rpcerr.re_status = RPC_INTR;
1742 rpcerr.re_errno = EINTR;
1743 break;
1744
1745 case RPC_UDERROR:
1746 /*
1747 * If the NFS server is local (vold) and
1748 * it goes away then we get RPC_UDERROR.
1749 * This is a retryable error, so we would
1750 * loop; check to see whether the specific
1751 * error was ECONNRESET, indicating that the
1752 * target did not exist at all. If so,
1753 * return with RPC_PROGUNAVAIL and
1754 * ECONNRESET to indicate why.
1755 */
1756 CLNT_GETERR(client, &rpcerr);
1757 if (rpcerr.re_errno == ECONNRESET) {
1758 rpcerr.re_status = RPC_PROGUNAVAIL;
1759 rpcerr.re_errno = ECONNRESET;
1760 break;
1761 }
1762 /*FALLTHROUGH*/
1763
1764 default: /* probably RPC_TIMEDOUT */
1765 if (IS_UNRECOVERABLE_RPC(status))
1766 break;
1767
1768 /*
1769 * increment server not responding count
1770 */
1771 mutex_enter(&mi->mi_lock);
1772 mi->mi_noresponse++;
1773 mutex_exit(&mi->mi_lock);
1774 #ifdef DEBUG
1775 nfscl->nfscl_stat.noresponse.value.ui64++;
1776 #endif
1777
1778 if (!(mi->mi_flags & MI_HARD)) {
1779 if (!(mi->mi_flags & MI_SEMISOFT) ||
1780 (mi->mi_acl_ss_call_type[which] == 0))
1781 break;
1782 }
1783
1784 /*
1785 * The call is in progress (over COTS).
1786 * Try the CLNT_CALL again, but don't
1787 * print a noisy error message.
1788 */
1789 if (status == RPC_INPROGRESS) {
1790 tryagain = TRUE;
1791 break;
1792 }
1793
1794 if (flags & RFSCALL_SOFT)
1795 break;
1796
1797 /*
1798 * On zone shutdown, just move on.
1799 */
1800 if (zone_status_get(curproc->p_zone) >=
1801 ZONE_IS_SHUTTING_DOWN) {
1802 rpcerr.re_status = RPC_FAILED;
1803 rpcerr.re_errno = EIO;
1804 break;
1805 }
1806
1807 /*
1808 * NFS client failover support
1809 *
1810 * If the current server just failed us, we'll
1811 * start the process of finding a new server.
1812 * After that, we can just retry.
1813 */
1814 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1815 if (svp == mi->mi_curr_serv)
1816 failover_newserver(mi);
1817 clfree_impl(client, ch, nfscl);
1818 goto failoverretry;
1819 }
1820
1821 tryagain = TRUE;
1822 timeo = backoff(timeo);
1823 mutex_enter(&mi->mi_lock);
1824 if (!(mi->mi_flags & MI_PRINTED)) {
1825 mi->mi_flags |= MI_PRINTED;
1826 mutex_exit(&mi->mi_lock);
1827 #ifdef DEBUG
1828 zprintf(zoneid,
1829 "NFS_ACL%d server %s not responding still trying\n",
1830 mi->mi_vers, svp->sv_hostname);
1831 #else
1832 zprintf(zoneid,
1833 "NFS server %s not responding still trying\n",
1834 svp->sv_hostname);
1835 #endif
1836 } else
1837 mutex_exit(&mi->mi_lock);
1838 if (*douprintf && nfs_has_ctty()) {
1839 *douprintf = 0;
1840 if (!(mi->mi_flags & MI_NOPRINT))
1841 #ifdef DEBUG
1842 uprintf(
1843 "NFS_ACL%d server %s not responding still trying\n",
1844 mi->mi_vers, svp->sv_hostname);
1845 #else
1846 uprintf(
1847 "NFS server %s not responding still trying\n",
1848 svp->sv_hostname);
1849 #endif
1850 }
1851
1852 #if 0 /* notyet */
1853 /*
1854 * If doing dynamic adjustment of transfer
1855 * size and if it's a read or write call
1856 * and if the transfer size changed while
1857 * retransmitting or if the feedback routine
1858 * changed the transfer size,
1859 * then exit rfscall so that the transfer
1860 * size can be adjusted at the vnops level.
1861 */
1862 if ((mi->mi_flags & MI_DYNAMIC) &&
1863 mi->mi_acl_timer_type[which] != 0 &&
1864 (mi->mi_curread != my_rsize ||
1865 mi->mi_curwrite != my_wsize ||
1866 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1867 /*
1868 * On read or write calls, return
1869 * back to the vnode ops level if
1870 * the transfer size changed.
1871 */
1872 clfree_impl(client, ch, nfscl);
1873 if (cred_cloned)
1874 crfree(cr);
1875 return (ENFS_TRYAGAIN);
1876 }
1877 #endif
1878 }
1879 } while (tryagain);
1880
1881 if (status != RPC_SUCCESS) {
1882 /*
1883 * Let soft mounts use the timed out message.
1884 */
1885 if (status == RPC_INPROGRESS)
1886 status = RPC_TIMEDOUT;
1887 nfscl->nfscl_stat.badcalls.value.ui64++;
1888 if (status == RPC_CANTDECODERES ||
1889 status == RPC_PROGUNAVAIL ||
1890 status == RPC_PROCUNAVAIL ||
1891 status == RPC_CANTDECODEARGS ||
1892 status == RPC_PROGVERSMISMATCH)
1893 CLNT_GETERR(client, &rpcerr);
1894 else if (status != RPC_INTR) {
1895 mutex_enter(&mi->mi_lock);
1896 mi->mi_flags |= MI_DOWN;
1897 mutex_exit(&mi->mi_lock);
1898 CLNT_GETERR(client, &rpcerr);
1899 #ifdef DEBUG
1900 bufp = clnt_sperror(client, svp->sv_hostname);
1901 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1902 mi->mi_vers, mi->mi_aclnames[which], bufp);
1903 if (nfs_has_ctty()) {
1904 if (!(mi->mi_flags & MI_NOPRINT)) {
1905 uprintf("NFS_ACL%d %s failed for %s\n",
1906 mi->mi_vers, mi->mi_aclnames[which],
1907 bufp);
1908 }
1909 }
1910 kmem_free(bufp, MAXPATHLEN);
1911 #else
1912 zprintf(zoneid,
1913 "NFS %s failed for server %s: error %d (%s)\n",
1914 mi->mi_aclnames[which], svp->sv_hostname,
1915 status, clnt_sperrno(status));
1916 if (nfs_has_ctty()) {
1917 if (!(mi->mi_flags & MI_NOPRINT))
1918 uprintf(
1919 "NFS %s failed for server %s: error %d (%s)\n",
1920 mi->mi_aclnames[which],
1921 svp->sv_hostname, status,
1922 clnt_sperrno(status));
1923 }
1924 #endif
1925 /*
1926 * when CLNT_CALL() fails with RPC_AUTHERROR,
1927 * re_errno is set appropriately depending on
1928 * the authentication error
1929 */
1930 if (status == RPC_VERSMISMATCH ||
1931 status == RPC_PROGVERSMISMATCH)
1932 rpcerr.re_errno = EIO;
1933 }
1934 } else {
1935 /*
1936 * Test the value of mi_down and mi_printed without
1937 * holding the mi_lock mutex. If they are both zero,
1938 * then it is okay to skip the down and printed
1939 * processing. This saves on a mutex_enter and
1940 * mutex_exit pair for a normal, successful RPC.
1941 * That pair was pure overhead in the common case.
1942 */
1943 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1944 mutex_enter(&mi->mi_lock);
1945 mi->mi_flags &= ~MI_DOWN;
1946 if (mi->mi_flags & MI_PRINTED) {
1947 mi->mi_flags &= ~MI_PRINTED;
1948 mutex_exit(&mi->mi_lock);
1949 #ifdef DEBUG
1950 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1951 mi->mi_vers, svp->sv_hostname);
1952 #else
1953 zprintf(zoneid, "NFS server %s ok\n",
1954 svp->sv_hostname);
1955 #endif
1956 } else
1957 mutex_exit(&mi->mi_lock);
1958 }
1959
1960 if (*douprintf == 0) {
1961 if (!(mi->mi_flags & MI_NOPRINT))
1962 #ifdef DEBUG
1963 uprintf("NFS_ACL%d server %s ok\n",
1964 mi->mi_vers, svp->sv_hostname);
1965 #else
1966 uprintf("NFS server %s ok\n", svp->sv_hostname);
1967 #endif
1968 *douprintf = 1;
1969 }
1970 }
1971
1972 clfree_impl(client, ch, nfscl);
1973 if (cred_cloned)
1974 crfree(cr);
1975
1976 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1977
1978 #if 0 /* notyet */
1979 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1980 rpcerr.re_errno);
1981 #endif
1982
1983 return (rpcerr.re_errno);
1984 }
1985
1986 int
1987 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1988 {
1989 uint_t mask = vap->va_mask;
1990
1991 if (!(mask & AT_MODE))
1992 sa->sa_mode = (uint32_t)-1;
1993 else
1994 sa->sa_mode = vap->va_mode;
1995 if (!(mask & AT_UID))
1996 sa->sa_uid = (uint32_t)-1;
1997 else
1998 sa->sa_uid = (uint32_t)vap->va_uid;
1999 if (!(mask & AT_GID))
2000 sa->sa_gid = (uint32_t)-1;
2001 else
2002 sa->sa_gid = (uint32_t)vap->va_gid;
2003 if (!(mask & AT_SIZE))
2004 sa->sa_size = (uint32_t)-1;
2005 else
2006 sa->sa_size = (uint32_t)vap->va_size;
2007 if (!(mask & AT_ATIME))
2008 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2009 else {
2010 /* check time validity */
2011 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2012 return (EOVERFLOW);
2013 }
2014 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2015 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2016 }
2017 if (!(mask & AT_MTIME))
2018 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2019 else {
2020 /* check time validity */
2021 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2022 return (EOVERFLOW);
2023 }
2024 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2025 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2026 }
2027 return (0);
2028 }
2029
2030 int
2031 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2032 {
2033 uint_t mask = vap->va_mask;
2034
2035 if (!(mask & AT_MODE))
2036 sa->mode.set_it = FALSE;
2037 else {
2038 sa->mode.set_it = TRUE;
2039 sa->mode.mode = (mode3)vap->va_mode;
2040 }
2041 if (!(mask & AT_UID))
2042 sa->uid.set_it = FALSE;
2043 else {
2044 sa->uid.set_it = TRUE;
2045 sa->uid.uid = (uid3)vap->va_uid;
2046 }
2047 if (!(mask & AT_GID))
2048 sa->gid.set_it = FALSE;
2049 else {
2050 sa->gid.set_it = TRUE;
2051 sa->gid.gid = (gid3)vap->va_gid;
2052 }
2053 if (!(mask & AT_SIZE))
2054 sa->size.set_it = FALSE;
2055 else {
2056 sa->size.set_it = TRUE;
2057 sa->size.size = (size3)vap->va_size;
2058 }
2059 if (!(mask & AT_ATIME))
2060 sa->atime.set_it = DONT_CHANGE;
2061 else {
2062 /* check time validity */
2063 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2064 return (EOVERFLOW);
2065 }
2066 sa->atime.set_it = SET_TO_CLIENT_TIME;
2067 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2068 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2069 }
2070 if (!(mask & AT_MTIME))
2071 sa->mtime.set_it = DONT_CHANGE;
2072 else {
2073 /* check time validity */
2074 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2075 return (EOVERFLOW);
2076 }
2077 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2078 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2079 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2080 }
2081 return (0);
2082 }
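
/*
 * Illustrative note (added commentary, not from the original source):
 * a setattr request that carries only AT_MODE leaves every other
 * attribute "unset".  In the NFS Version 2 sattr built by
 * vattr_to_sattr() those fields are filled with the (uint32_t)-1 /
 * (int32_t)-1 sentinel, while in the Version 3 sattr3 built by
 * vattr_to_sattr3() the corresponding set_it fields are FALSE (or
 * DONT_CHANGE for the times), so the server leaves them untouched.
 */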
2083
2084 void
2085 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2086 {
2087
2088 da->da_fhandle = VTOFH(dvp);
2089 da->da_name = nm;
2090 da->da_flags = 0;
2091 }
2092
2093 void
2094 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2095 {
2096
2097 da->dirp = VTOFH3(dvp);
2098 da->name = nm;
2099 }
2100
2101 int
2102 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2103 {
2104 int error;
2105 rnode_t *rp;
2106 struct vattr va;
2107
2108 va.va_mask = AT_MODE | AT_GID;
2109 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2110 if (error)
2111 return (error);
2112
2113 /*
2114 * To determine the expected group-id of the created file:
2115 * 1) If the filesystem was not mounted with the Old-BSD-compatible
2116 * GRPID option, and the directory's set-gid bit is clear,
2117 * then use the process's gid.
2118 * 2) Otherwise, set the group-id to the gid of the parent directory.
2119 */
2120 rp = VTOR(dvp);
2121 mutex_enter(&rp->r_statelock);
2122 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2123 *gidp = crgetgid(cr);
2124 else
2125 *gidp = va.va_gid;
2126 mutex_exit(&rp->r_statelock);
2127 return (0);
2128 }
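
/*
 * Example of the rule above (illustrative names only): with a parent
 * directory whose mode has the set-gid bit and whose group is "staff",
 * a file created by a process in group "other" inherits gid "staff";
 * without the set-gid bit and without the grpid mount option, the new
 * file gets the creating process's gid ("other").
 */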
2129
2130 int
2131 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2132 {
2133 int error;
2134 struct vattr va;
2135
2136 va.va_mask = AT_MODE;
2137 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2138 if (error)
2139 return (error);
2140
2141 /*
2142 * Modify the expected mode (om) so that the set-gid bit matches
2143 * that of the parent directory (dvp).
2144 */
2145 if (va.va_mode & VSGID)
2146 *omp |= VSGID;
2147 else
2148 *omp &= ~VSGID;
2149 return (0);
2150 }
2151
2152 void
2153 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2154 {
2155
2156 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2157 if (!(vp->v_flag & VSWAPLIKE)) {
2158 mutex_enter(&vp->v_lock);
2159 vp->v_flag |= VSWAPLIKE;
2160 mutex_exit(&vp->v_lock);
2161 }
2162 } else {
2163 if (vp->v_flag & VSWAPLIKE) {
2164 mutex_enter(&vp->v_lock);
2165 vp->v_flag &= ~VSWAPLIKE;
2166 mutex_exit(&vp->v_lock);
2167 }
2168 }
2169 }
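
/*
 * Explanatory note (added commentary): the test above treats a regular
 * file whose mode has the sticky bit set but the owner-execute bit
 * clear as "swap-like", the traditional marker for files used as swap
 * or paging backing store over NFS.
 */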
2170
2171 /*
2172 * Free the resources associated with an rnode.
2173 */
2174 static void
2175 rinactive(rnode_t *rp, cred_t *cr)
2176 {
2177 vnode_t *vp;
2178 cred_t *cred;
2179 char *contents;
2180 int size;
2181 vsecattr_t *vsp;
2182 int error;
2183 nfs3_pathconf_info *info;
2184
2185 /*
2186 * Before freeing anything, wait until all asynchronous
2187 * activity is done on this rnode. This will allow all
2188 * asynchronous read ahead and write behind i/o's to
2189 * finish.
2190 */
2191 mutex_enter(&rp->r_statelock);
2192 while (rp->r_count > 0)
2193 cv_wait(&rp->r_cv, &rp->r_statelock);
2194 mutex_exit(&rp->r_statelock);
2195
2196 /*
2197 * Flush and invalidate all pages associated with the vnode.
2198 */
2199 vp = RTOV(rp);
2200 if (vn_has_cached_data(vp)) {
2201 ASSERT(vp->v_type != VCHR);
2202 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2203 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2204 if (error && (error == ENOSPC || error == EDQUOT)) {
2205 mutex_enter(&rp->r_statelock);
2206 if (!rp->r_error)
2207 rp->r_error = error;
2208 mutex_exit(&rp->r_statelock);
2209 }
2210 }
2211 nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2212 }
2213
2214 /*
2215 * Free any held credentials and caches which may be associated
2216 * with this rnode.
2217 */
2218 mutex_enter(&rp->r_statelock);
2219 cred = rp->r_cred;
2220 rp->r_cred = NULL;
2221 contents = rp->r_symlink.contents;
2222 size = rp->r_symlink.size;
2223 rp->r_symlink.contents = NULL;
2224 vsp = rp->r_secattr;
2225 rp->r_secattr = NULL;
2226 info = rp->r_pathconf;
2227 rp->r_pathconf = NULL;
2228 mutex_exit(&rp->r_statelock);
2229
2230 /*
2231 * Free the held credential.
2232 */
2233 if (cred != NULL)
2234 crfree(cred);
2235
2236 /*
2237 * Free the access cache entries.
2238 */
2239 (void) nfs_access_purge_rp(rp);
2240
2241 /*
2242 * Free the readdir cache entries.
2243 */
2244 if (HAVE_RDDIR_CACHE(rp))
2245 nfs_purge_rddir_cache(vp);
2246
2247 /*
2248 * Free the symbolic link cache.
2249 */
2250 if (contents != NULL) {
2251
2252 kmem_free((void *)contents, size);
2253 }
2254
2255 /*
2256 * Free any cached ACL.
2257 */
2258 if (vsp != NULL)
2259 nfs_acl_free(vsp);
2260
2261 /*
2262 * Free any cached pathconf information.
2263 */
2264 if (info != NULL)
2265 kmem_free(info, sizeof (*info));
2266 }
2267
2268 /*
2269 * Return a vnode for the given NFS Version 2 file handle.
2270 * If no rnode exists for this fhandle, create one and put it
2271 * into the hash queues. If the rnode for this fhandle
2272 * already exists, return it.
2273 *
2274 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2275 */
2276 vnode_t *
2277 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2278 hrtime_t t, cred_t *cr, char *dnm, char *nm)
2279 {
2280 int newnode;
2281 int index;
2282 vnode_t *vp;
2283 nfs_fhandle nfh;
2284 vattr_t va;
2285
2286 nfh.fh_len = NFS_FHSIZE;
2287 bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2288
2289 index = rtablehash(&nfh);
2290 rw_enter(&rtable[index].r_lock, RW_READER);
2291
2292 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2293 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2294
2295 if (attr != NULL) {
2296 if (!newnode) {
2297 rw_exit(&rtable[index].r_lock);
2298 (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2299 } else {
2300 if (attr->na_type < NFNON || attr->na_type > NFSOC)
2301 vp->v_type = VBAD;
2302 else
2303 vp->v_type = n2v_type(attr);
2304 /*
2305 * A translation here seems to be necessary
2306 * because this function can be called
2307 * with `attr' that has come from the wire,
2308 * and been operated on by vattr_to_nattr().
2309 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2310 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2311 * ->makenfsnode().
2312 */
2313 if ((attr->na_rdev & 0xffff0000) == 0)
2314 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2315 else
2316 vp->v_rdev = expldev(n2v_rdev(attr));
2317 nfs_attrcache(vp, attr, t);
2318 rw_exit(&rtable[index].r_lock);
2319 }
2320 } else {
2321 if (newnode) {
2322 PURGE_ATTRCACHE(vp);
2323 }
2324 rw_exit(&rtable[index].r_lock);
2325 }
2326
2327 return (vp);
2328 }
2329
2330 /*
2331 * Return a vnode for the given NFS Version 3 file handle.
2332 * If no rnode exists for this fhandle, create one and put it
2333 * into the hash queues. If the rnode for this fhandle
2334 * already exists, return it.
2335 *
2336 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2337 */
2338 vnode_t *
2339 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2340 cred_t *cr, char *dnm, char *nm)
2341 {
2342 int newnode;
2343 int index;
2344 vnode_t *vp;
2345
2346 index = rtablehash((nfs_fhandle *)fh);
2347 rw_enter(&rtable[index].r_lock, RW_READER);
2348
2349 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2350 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2351 dnm, nm);
2352
2353 if (vap == NULL) {
2354 if (newnode) {
2355 PURGE_ATTRCACHE(vp);
2356 }
2357 rw_exit(&rtable[index].r_lock);
2358 return (vp);
2359 }
2360
2361 if (!newnode) {
2362 rw_exit(&rtable[index].r_lock);
2363 nfs_attr_cache(vp, vap, t, cr);
2364 } else {
2365 rnode_t *rp = VTOR(vp);
2366
2367 vp->v_type = vap->va_type;
2368 vp->v_rdev = vap->va_rdev;
2369
2370 mutex_enter(&rp->r_statelock);
2371 if (rp->r_mtime <= t)
2372 nfs_attrcache_va(vp, vap);
2373 mutex_exit(&rp->r_statelock);
2374 rw_exit(&rtable[index].r_lock);
2375 }
2376
2377 return (vp);
2378 }
2379
2380 vnode_t *
2381 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2382 cred_t *cr, char *dnm, char *nm)
2383 {
2384 int newnode;
2385 int index;
2386 vnode_t *vp;
2387 vattr_t va;
2388
2389 index = rtablehash((nfs_fhandle *)fh);
2390 rw_enter(&rtable[index].r_lock, RW_READER);
2391
2392 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2393 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2394 dnm, nm);
2395
2396 if (attr == NULL) {
2397 if (newnode) {
2398 PURGE_ATTRCACHE(vp);
2399 }
2400 rw_exit(&rtable[index].r_lock);
2401 return (vp);
2402 }
2403
2404 if (!newnode) {
2405 rw_exit(&rtable[index].r_lock);
2406 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2407 } else {
2408 if (attr->type < NF3REG || attr->type > NF3FIFO)
2409 vp->v_type = VBAD;
2410 else
2411 vp->v_type = nf3_to_vt[attr->type];
2412 vp->v_rdev = makedevice(attr->rdev.specdata1,
2413 attr->rdev.specdata2);
2414 nfs3_attrcache(vp, attr, t);
2415 rw_exit(&rtable[index].r_lock);
2416 }
2417
2418 return (vp);
2419 }
2420
2421 /*
2422 * Read this comment before making changes to rtablehash()!
2423 * This is a hash function in which seemingly obvious and harmless
2424 * changes can cause escalations costing millions of dollars!
2425 * Know what you are doing.
2426 *
2427 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
2428 * algorithm is currently detailed here:
2429 *
2430 * http://burtleburtle.net/bob/hash/doobs.html
2431 *
2432 * Of course, the above link may not be valid by the time you are reading
2433 * this, but suffice it to say that the one-at-a-time algorithm works well in
2434 * almost all cases. If you are changing the algorithm be sure to verify that
2435 * the hash algorithm still provides even distribution in all cases and with
2436 * any server returning filehandles in whatever order (sequential or random).
2437 */
2438 static int
2439 rtablehash(nfs_fhandle *fh)
2440 {
2441 ulong_t hash, len, i;
2442 char *key;
2443
2444 key = fh->fh_buf;
2445 len = (ulong_t)fh->fh_len;
2446 for (hash = 0, i = 0; i < len; i++) {
2447 hash += key[i];
2448 hash += (hash << 10);
2449 hash ^= (hash >> 6);
2450 }
2451 hash += (hash << 3);
2452 hash ^= (hash >> 11);
2453 hash += (hash << 15);
2454 return (hash & rtablemask);
2455 }
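
/*
 * Illustrative sketch (not compiled; names are examples only): the same
 * one-at-a-time mixing steps can be lifted into a small user-level
 * routine to check how a given set of file handles spreads across
 * buckets before tuning the table size.
 */
#if 0 /* example */
static unsigned int
oaat_hash(const unsigned char *key, size_t len, unsigned int mask)
{
	unsigned int hash = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & mask);		/* mask is table size - 1 */
}
#endif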
2456
2457 static vnode_t *
2458 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2459 struct vnodeops *vops,
2460 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2461 int (*compar)(const void *, const void *),
2462 int *newnode, cred_t *cr, char *dnm, char *nm)
2463 {
2464 rnode_t *rp;
2465 rnode_t *trp;
2466 vnode_t *vp;
2467 mntinfo_t *mi;
2468
2469 ASSERT(RW_READ_HELD(&rhtp->r_lock));
2470
2471 mi = VFTOMI(vfsp);
2472 start:
2473 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2474 vp = RTOV(rp);
2475 nfs_set_vroot(vp);
2476 *newnode = 0;
2477 return (vp);
2478 }
2479 rw_exit(&rhtp->r_lock);
2480
2481 mutex_enter(&rpfreelist_lock);
2482 if (rpfreelist != NULL && rnew >= nrnode) {
2483 rp = rpfreelist;
2484 rp_rmfree(rp);
2485 mutex_exit(&rpfreelist_lock);
2486
2487 vp = RTOV(rp);
2488
2489 if (rp->r_flags & RHASHED) {
2490 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2491 mutex_enter(&vp->v_lock);
2492 if (vp->v_count > 1) {
2493 VN_RELE_LOCKED(vp);
2494 mutex_exit(&vp->v_lock);
2495 rw_exit(&rp->r_hashq->r_lock);
2496 rw_enter(&rhtp->r_lock, RW_READER);
2497 goto start;
2498 }
2499 mutex_exit(&vp->v_lock);
2500 rp_rmhash_locked(rp);
2501 rw_exit(&rp->r_hashq->r_lock);
2502 }
2503
2504 rinactive(rp, cr);
2505
2506 mutex_enter(&vp->v_lock);
2507 if (vp->v_count > 1) {
2508 VN_RELE_LOCKED(vp);
2509 mutex_exit(&vp->v_lock);
2510 rw_enter(&rhtp->r_lock, RW_READER);
2511 goto start;
2512 }
2513 mutex_exit(&vp->v_lock);
2514 vn_invalid(vp);
2515 /*
2516 * destroy old locks before bzero'ing and
2517 * recreating the locks below.
2518 */
2519 nfs_rw_destroy(&rp->r_rwlock);
2520 nfs_rw_destroy(&rp->r_lkserlock);
2521 mutex_destroy(&rp->r_statelock);
2522 cv_destroy(&rp->r_cv);
2523 cv_destroy(&rp->r_commit.c_cv);
2524 nfs_free_r_path(rp);
2525 avl_destroy(&rp->r_dir);
2526 /*
2527 * Make sure that if the rnode is recycled then
2528 * the VFS reference count is decremented properly
2529 * before reuse.
2530 */
2531 VFS_RELE(vp->v_vfsp);
2532 vn_reinit(vp);
2533 } else {
2534 vnode_t *new_vp;
2535
2536 mutex_exit(&rpfreelist_lock);
2537
2538 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2539 new_vp = vn_alloc(KM_SLEEP);
2540
2541 atomic_inc_ulong((ulong_t *)&rnew);
2542 #ifdef DEBUG
2543 clstat_debug.nrnode.value.ui64++;
2544 #endif
2545 vp = new_vp;
2546 }
2547
2548 bzero(rp, sizeof (*rp));
2549 rp->r_vnode = vp;
2550 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2551 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2552 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2553 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2554 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2555 rp->r_fh.fh_len = fh->fh_len;
2556 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2557 rp->r_server = mi->mi_curr_serv;
2558 if (FAILOVER_MOUNT(mi)) {
2559 /*
2560 * If replicated servers, stash pathnames
2561 */
2562 if (dnm != NULL && nm != NULL) {
2563 char *s, *p;
2564 uint_t len;
2565
2566 len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2567 rp->r_path = kmem_alloc(len, KM_SLEEP);
2568 #ifdef DEBUG
2569 clstat_debug.rpath.value.ui64 += len;
2570 #endif
2571 s = rp->r_path;
2572 for (p = dnm; *p; p++)
2573 *s++ = *p;
2574 *s++ = '/';
2575 for (p = nm; *p; p++)
2576 *s++ = *p;
2577 *s = '\0';
2578 } else {
2579 /* special case for root */
2580 rp->r_path = kmem_alloc(2, KM_SLEEP);
2581 #ifdef DEBUG
2582 clstat_debug.rpath.value.ui64 += 2;
2583 #endif
2584 *rp->r_path = '.';
2585 *(rp->r_path + 1) = '\0';
2586 }
2587 }
2588 VFS_HOLD(vfsp);
2589 rp->r_putapage = putapage;
2590 rp->r_hashq = rhtp;
2591 rp->r_flags = RREADDIRPLUS;
2592 avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2593 offsetof(rddir_cache, tree));
2594 vn_setops(vp, vops);
2595 vp->v_data = (caddr_t)rp;
2596 vp->v_vfsp = vfsp;
2597 vp->v_type = VNON;
2598 vp->v_flag |= VMODSORT;
2599 nfs_set_vroot(vp);
2600
2601 /*
2602 * There is a race if someone else allocates the rnode
2603 * while no locks are held, so we check again and, if one
2604 * is found, use it and put the one just built on the freelist.
2605 */
2606 rw_enter(&rhtp->r_lock, RW_WRITER);
2607 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2608 vp = RTOV(trp);
2609 nfs_set_vroot(vp);
2610 *newnode = 0;
2611 rw_exit(&rhtp->r_lock);
2612 rp_addfree(rp, cr);
2613 rw_enter(&rhtp->r_lock, RW_READER);
2614 return (vp);
2615 }
2616 rp_addhash(rp);
2617 *newnode = 1;
2618 return (vp);
2619 }
2620
2621 /*
2622 * Callback function to check if the page should be marked as
2623 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2624 */
2625 int
2626 nfs_setmod_check(page_t *pp)
2627 {
2628 if (pp->p_fsdata != C_NOCOMMIT) {
2629 pp->p_fsdata = C_NOCOMMIT;
2630 return (1);
2631 }
2632 return (0);
2633 }
2634
2635 static void
2636 nfs_set_vroot(vnode_t *vp)
2637 {
2638 rnode_t *rp;
2639 nfs_fhandle *rootfh;
2640
2641 rp = VTOR(vp);
2642 rootfh = &rp->r_server->sv_fhandle;
2643 if (rootfh->fh_len == rp->r_fh.fh_len &&
2644 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2645 if (!(vp->v_flag & VROOT)) {
2646 mutex_enter(&vp->v_lock);
2647 vp->v_flag |= VROOT;
2648 mutex_exit(&vp->v_lock);
2649 }
2650 }
2651 }
2652
2653 static void
2654 nfs_free_r_path(rnode_t *rp)
2655 {
2656 char *path;
2657 size_t len;
2658
2659 path = rp->r_path;
2660 if (path) {
2661 rp->r_path = NULL;
2662 len = strlen(path) + 1;
2663 kmem_free(path, len);
2664 #ifdef DEBUG
2665 clstat_debug.rpath.value.ui64 -= len;
2666 #endif
2667 }
2668 }
2669
2670 /*
2671 * Put an rnode on the free list.
2672 *
2673 * Rnodes which were allocated above and beyond the normal limit
2674 * are immediately freed.
2675 */
2676 void
2677 rp_addfree(rnode_t *rp, cred_t *cr)
2678 {
2679 vnode_t *vp;
2680 struct vfs *vfsp;
2681
2682 vp = RTOV(rp);
2683 ASSERT(vp->v_count >= 1);
2684 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2685
2686 /*
2687 * If we have too many rnodes allocated and there are no
2688 * references to this rnode, or if the rnode is no longer
2689 * accessible because it does not reside in the hash queues,
2690 * or if an i/o error occurred while writing to the file,
2691 * then just free it instead of putting it on the rnode
2692 * freelist.
2693 */
2694 vfsp = vp->v_vfsp;
2695 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2696 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2697 if (rp->r_flags & RHASHED) {
2698 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2699 mutex_enter(&vp->v_lock);
2700 if (vp->v_count > 1) {
2701 VN_RELE_LOCKED(vp);
2702 mutex_exit(&vp->v_lock);
2703 rw_exit(&rp->r_hashq->r_lock);
2704 return;
2705 }
2706 mutex_exit(&vp->v_lock);
2707 rp_rmhash_locked(rp);
2708 rw_exit(&rp->r_hashq->r_lock);
2709 }
2710
2711 rinactive(rp, cr);
2712
2713 /*
2714 * Recheck the vnode reference count. We need to
2715 * make sure that another reference has not been
2716 * acquired while we were not holding v_lock. The
2717 * rnode is not in the rnode hash queues, so the
2718 * only way for a reference to have been acquired
2719 * is for a VOP_PUTPAGE because the rnode was marked
2720 * with RDIRTY or for a modified page. This
2721 * reference may have been acquired before our call
2722 * to rinactive. The i/o may have been completed,
2723 * thus allowing rinactive to complete, but the
2724 * reference to the vnode may not have been released
2725 * yet. In any case, the rnode can not be destroyed
2726 * until the other references to this vnode have been
2727 * released. The other references will take care of
2728 * either destroying the rnode or placing it on the
2729 * rnode freelist. If there are no other references,
2730 * then the rnode may be safely destroyed.
2731 */
2732 mutex_enter(&vp->v_lock);
2733 if (vp->v_count > 1) {
2734 VN_RELE_LOCKED(vp);
2735 mutex_exit(&vp->v_lock);
2736 return;
2737 }
2738 mutex_exit(&vp->v_lock);
2739
2740 destroy_rnode(rp);
2741 return;
2742 }
2743
2744 /*
2745 * Lock the hash queue and then recheck the reference count
2746 * to ensure that no other threads have acquired a reference
2747 * to indicate that the rnode should not be placed on the
2748 * freelist. If another reference has been acquired, then
2749 * just release this one and let the other thread complete
2750 * the processing of adding this rnode to the freelist.
2751 */
2752 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2753
2754 mutex_enter(&vp->v_lock);
2755 if (vp->v_count > 1) {
2756 VN_RELE_LOCKED(vp);
2757 mutex_exit(&vp->v_lock);
2758 rw_exit(&rp->r_hashq->r_lock);
2759 return;
2760 }
2761 mutex_exit(&vp->v_lock);
2762
2763 /*
2764 * If there is no cached data or metadata for this file, then
2765 * put the rnode on the front of the freelist so that it will
2766 * be reused before other rnodes which may have cached data or
2767 * metadata associated with them.
2768 */
2769 mutex_enter(&rpfreelist_lock);
2770 if (rpfreelist == NULL) {
2771 rp->r_freef = rp;
2772 rp->r_freeb = rp;
2773 rpfreelist = rp;
2774 } else {
2775 rp->r_freef = rpfreelist;
2776 rp->r_freeb = rpfreelist->r_freeb;
2777 rpfreelist->r_freeb->r_freef = rp;
2778 rpfreelist->r_freeb = rp;
2779 if (!vn_has_cached_data(vp) &&
2780 !HAVE_RDDIR_CACHE(rp) &&
2781 rp->r_symlink.contents == NULL &&
2782 rp->r_secattr == NULL &&
2783 rp->r_pathconf == NULL)
2784 rpfreelist = rp;
2785 }
2786 mutex_exit(&rpfreelist_lock);
2787
2788 rw_exit(&rp->r_hashq->r_lock);
2789 }
2790
2791 /*
2792 * Remove an rnode from the free list.
2793 *
2794 * The caller must be holding rpfreelist_lock and the rnode
2795 * must be on the freelist.
2796 */
2797 static void
2798 rp_rmfree(rnode_t *rp)
2799 {
2800
2801 ASSERT(MUTEX_HELD(&rpfreelist_lock));
2802 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2803
2804 if (rp == rpfreelist) {
2805 rpfreelist = rp->r_freef;
2806 if (rp == rpfreelist)
2807 rpfreelist = NULL;
2808 }
2809
2810 rp->r_freeb->r_freef = rp->r_freef;
2811 rp->r_freef->r_freeb = rp->r_freeb;
2812
2813 rp->r_freef = rp->r_freeb = NULL;
2814 }
2815
2816 /*
2817 * Put a rnode in the hash table.
2818 *
2819 * The caller must be holding the exclusive hash queue lock.
2820 */
2821 static void
2822 rp_addhash(rnode_t *rp)
2823 {
2824 mntinfo_t *mi;
2825
2826 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2827 ASSERT(!(rp->r_flags & RHASHED));
2828
2829 rp->r_hashf = rp->r_hashq->r_hashf;
2830 rp->r_hashq->r_hashf = rp;
2831 rp->r_hashb = (rnode_t *)rp->r_hashq;
2832 rp->r_hashf->r_hashb = rp;
2833
2834 mutex_enter(&rp->r_statelock);
2835 rp->r_flags |= RHASHED;
2836 mutex_exit(&rp->r_statelock);
2837
2838 mi = VTOMI(RTOV(rp));
2839 mutex_enter(&mi->mi_rnodes_lock);
2840 list_insert_tail(&mi->mi_rnodes, rp);
2841 mutex_exit(&mi->mi_rnodes_lock);
2842 }
2843
2844 /*
2845 * Remove a rnode from the hash table.
2846 *
2847 * The caller must be holding the hash queue lock.
2848 */
2849 static void
2850 rp_rmhash_locked(rnode_t *rp)
2851 {
2852 mntinfo_t *mi;
2853
2854 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2855 ASSERT(rp->r_flags & RHASHED);
2856
2857 rp->r_hashb->r_hashf = rp->r_hashf;
2858 rp->r_hashf->r_hashb = rp->r_hashb;
2859
2860 mutex_enter(&rp->r_statelock);
2861 rp->r_flags &= ~RHASHED;
2862 mutex_exit(&rp->r_statelock);
2863
2864 mi = VTOMI(RTOV(rp));
2865 mutex_enter(&mi->mi_rnodes_lock);
2866 if (list_link_active(&rp->r_mi_link))
2867 list_remove(&mi->mi_rnodes, rp);
2868 mutex_exit(&mi->mi_rnodes_lock);
2869 }
2870
2871 /*
2872 * Remove a rnode from the hash table.
2873 *
2874 * The caller must not be holding the hash queue lock.
2875 */
2876 void
2877 rp_rmhash(rnode_t *rp)
2878 {
2879
2880 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2881 rp_rmhash_locked(rp);
2882 rw_exit(&rp->r_hashq->r_lock);
2883 }
2884
2885 /*
2886 * Lookup a rnode by fhandle.
2887 *
2888 * The caller must be holding the hash queue lock, either shared or exclusive.
2889 */
2890 static rnode_t *
2891 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2892 {
2893 rnode_t *rp;
2894 vnode_t *vp;
2895
2896 ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2897
2898 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2899 vp = RTOV(rp);
2900 if (vp->v_vfsp == vfsp &&
2901 rp->r_fh.fh_len == fh->fh_len &&
2902 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2903 /*
2904 * remove rnode from free list, if necessary.
2905 */
2906 if (rp->r_freef != NULL) {
2907 mutex_enter(&rpfreelist_lock);
2908 /*
2909 * If the rnode is on the freelist,
2910 * then remove it and use that reference
2911 * as the new reference.  Otherwise, we
2912 * need to increment the reference count.
2913 */
2914 if (rp->r_freef != NULL) {
2915 rp_rmfree(rp);
2916 mutex_exit(&rpfreelist_lock);
2917 } else {
2918 mutex_exit(&rpfreelist_lock);
2919 VN_HOLD(vp);
2920 }
2921 } else
2922 VN_HOLD(vp);
2923 return (rp);
2924 }
2925 }
2926 return (NULL);
2927 }
2928
2929 /*
2930 * Return 1 if there is an active vnode belonging to this vfs in the
2931 * rtable cache.
2932 *
2933 * Several of these checks are done without holding the usual
2934 * locks. This is safe because destroy_rtable(), rp_addfree(),
2935 * etc. will redo the necessary checks before actually destroying
2936 * any rnodes.
2937 */
2938 int
2939 check_rtable(struct vfs *vfsp)
2940 {
2941 rnode_t *rp;
2942 vnode_t *vp;
2943 mntinfo_t *mi;
2944
2945 ASSERT(vfsp != NULL);
2946 mi = VFTOMI(vfsp);
2947
2948 mutex_enter(&mi->mi_rnodes_lock);
2949 for (rp = list_head(&mi->mi_rnodes); rp != NULL;
2950 rp = list_next(&mi->mi_rnodes, rp)) {
2951 vp = RTOV(rp);
2952
2953 if (rp->r_freef == NULL ||
2954 (vn_has_cached_data(vp) && (rp->r_flags & RDIRTY)) ||
2955 rp->r_count > 0) {
2956 mutex_exit(&mi->mi_rnodes_lock);
2957 return (1);
2958 }
2959 }
2960 mutex_exit(&mi->mi_rnodes_lock);
2961
2962 return (0);
2963 }
2964
2965 /*
2966 * Destroy inactive vnodes from the hash queues which belong to this
2967 * vfs. It is essential that we destroy all inactive vnodes during a
2968 * forced unmount as well as during a normal unmount.
2969 */
2970 void
2971 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2972 {
2973 rnode_t *rp;
2974 mntinfo_t *mi;
2975
2976 ASSERT(vfsp != NULL);
2977
2978 mi = VFTOMI(vfsp);
2979
2980 mutex_enter(&rpfreelist_lock);
2981 mutex_enter(&mi->mi_rnodes_lock);
2982 while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
2983 /*
2984 * If the rnode is no longer on the freelist it is not
2985 * ours and it will be handled by some other thread, so
2986 * skip it.
2987 */
2988 if (rp->r_freef == NULL)
2989 continue;
2990 mutex_exit(&mi->mi_rnodes_lock);
2991
2992 rp_rmfree(rp);
2993 mutex_exit(&rpfreelist_lock);
2994
2995 rp_rmhash(rp);
2996
2997 /*
2998 * This call to rp_addfree will end up destroying the
2999 * rnode, but in a safe way with the appropriate set
3000 * of checks done.
3001 */
3002 rp_addfree(rp, cr);
3003
3004 mutex_enter(&rpfreelist_lock);
3005 mutex_enter(&mi->mi_rnodes_lock);
3006 }
3007 mutex_exit(&mi->mi_rnodes_lock);
3008 mutex_exit(&rpfreelist_lock);
3009 }
3010
3011 /*
3012 * This routine destroys all the resources associated with the rnode
3013 * and then the rnode itself.
3014 */
3015 static void
3016 destroy_rnode(rnode_t *rp)
3017 {
3018 vnode_t *vp;
3019 vfs_t *vfsp;
3020
3021 vp = RTOV(rp);
3022 vfsp = vp->v_vfsp;
3023
3024 ASSERT(vp->v_count == 1);
3025 ASSERT(rp->r_count == 0);
3026 ASSERT(rp->r_lmpl == NULL);
3027 ASSERT(rp->r_mapcnt == 0);
3028 ASSERT(!(rp->r_flags & RHASHED));
3029 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3030 atomic_dec_ulong((ulong_t *)&rnew);
3031 #ifdef DEBUG
3032 clstat_debug.nrnode.value.ui64--;
3033 #endif
3034 nfs_rw_destroy(&rp->r_rwlock);
3035 nfs_rw_destroy(&rp->r_lkserlock);
3036 mutex_destroy(&rp->r_statelock);
3037 cv_destroy(&rp->r_cv);
3038 cv_destroy(&rp->r_commit.c_cv);
3039 if (rp->r_flags & RDELMAPLIST)
3040 list_destroy(&rp->r_indelmap);
3041 nfs_free_r_path(rp);
3042 avl_destroy(&rp->r_dir);
3043 vn_invalid(vp);
3044 vn_free(vp);
3045 kmem_cache_free(rnode_cache, rp);
3046 VFS_RELE(vfsp);
3047 }
3048
3049 /*
3050 * Flush all vnodes in this (or every) vfs.
3051 * Used by nfs_sync and by nfs_unmount.
3052 */
3053 void
3054 rflush(struct vfs *vfsp, cred_t *cr)
3055 {
3056 int index;
3057 rnode_t *rp;
3058 vnode_t *vp, **vplist;
3059 long num, cnt;
3060
3061 /*
3062 * Check to see whether there is anything to do.
3063 */
3064 num = rnew;
3065 if (num == 0)
3066 return;
3067
3068 /*
3069 * Allocate a slot for all currently active rnodes on the
3070 * supposition that they all may need flushing.
3071 */
3072 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3073 cnt = 0;
3074
3075 /*
3076 * If the vfs is known we can take a fast path and iterate only the
3077 * rnodes that belong to this vfs.  This is much faster than the
3078 * traditional walk of rtable (below) when many of the rnodes in the
3079 * system do not belong to our vfs.
3080 */
3081 if (vfsp != NULL) {
3082 mntinfo_t *mi = VFTOMI(vfsp);
3083
3084 mutex_enter(&mi->mi_rnodes_lock);
3085 for (rp = list_head(&mi->mi_rnodes); rp != NULL;
3086 rp = list_next(&mi->mi_rnodes, rp)) {
3087 vp = RTOV(rp);
3088 /*
3089 * Don't bother sync'ing a vp if it
3090 * is part of virtual swap device or
3091 * if VFS is read-only
3092 */
3093 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3094 continue;
3095 /*
3096 * If the vnode has pages and is marked as either dirty
3097 * or mmap'd, hold and add this vnode to the list of
3098 * vnodes to flush.
3099 */
3100 ASSERT(vp->v_vfsp == vfsp);
3101 if (vn_has_cached_data(vp) &&
3102 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3103 VN_HOLD(vp);
3104 vplist[cnt++] = vp;
3105 if (cnt == num) {
3106 /*
3107 * The vplist is full because there are
3108 * too many rnodes.  We are done for
3109 * now.
3110 */
3111 break;
3112 }
3113 }
3114 }
3115 mutex_exit(&mi->mi_rnodes_lock);
3116
3117 goto done;
3118 }
3119
3120 ASSERT(vfsp == NULL);
3121
3122 /*
3123 * Walk the hash queues looking for rnodes with page
3124 * lists associated with them. Make a list of these
3125 * files.
3126 */
3127 for (index = 0; index < rtablesize; index++) {
3128 rw_enter(&rtable[index].r_lock, RW_READER);
3129 for (rp = rtable[index].r_hashf;
3130 rp != (rnode_t *)(&rtable[index]);
3131 rp = rp->r_hashf) {
3132 vp = RTOV(rp);
3133 /*
3134 * Don't bother sync'ing a vp if it
3135 * is part of virtual swap device or
3136 * if VFS is read-only
3137 */
3138 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3139 continue;
3140 /*
3141 * If the vnode has pages and is marked as either dirty
3142 * or mmap'd, hold and add this vnode to the list of
3143 * vnodes to flush.
3144 */
3145 if (vn_has_cached_data(vp) &&
3146 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3147 VN_HOLD(vp);
3148 vplist[cnt++] = vp;
3149 if (cnt == num) {
3150 rw_exit(&rtable[index].r_lock);
3151 /*
3152 * The vplist is full because there are
3153 * too many rnodes.  We are done for
3154 * now.
3155 */
3156 goto done;
3157 }
3158 }
3159 }
3160 rw_exit(&rtable[index].r_lock);
3161 }
3162
3163 done:
3164
3165 /*
3166 * Flush and release all of the files on the list.
3167 */
3168 while (cnt-- > 0) {
3169 vp = vplist[cnt];
3170 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3171 VN_RELE(vp);
3172 }
3173
3174 /*
3175 * Free the space allocated to hold the list.
3176 */
3177 kmem_free(vplist, num * sizeof (*vplist));
3178 }
3179
3180 /*
3181 * This probably needs to be larger than or equal to
3182 * log2(sizeof (struct rnode)) due to the way that rnodes are
3183 * allocated.
3184 */
3185 #define ACACHE_SHIFT_BITS 9
3186
3187 static int
3188 acachehash(rnode_t *rp, cred_t *cr)
3189 {
3190
3191 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3192 acachemask);
3193 }
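
/*
 * Explanatory note (added commentary): rnodes come out of a kmem
 * cache, so the low-order bits of an rnode pointer carry little
 * information; shifting the pointer down by ACACHE_SHIFT_BITS before
 * folding in the uid keeps those mostly-constant bits from clustering
 * the entries into a few buckets.
 */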
3194
3195 #ifdef DEBUG
3196 static long nfs_access_cache_hits = 0;
3197 static long nfs_access_cache_misses = 0;
3198 #endif
3199
3200 nfs_access_type_t
3201 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3202 {
3203 vnode_t *vp;
3204 acache_t *ap;
3205 acache_hash_t *hp;
3206 nfs_access_type_t all;
3207
3208 vp = RTOV(rp);
3209 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3210 return (NFS_ACCESS_UNKNOWN);
3211
3212 if (rp->r_acache != NULL) {
3213 hp = &acache[acachehash(rp, cr)];
3214 rw_enter(&hp->lock, RW_READER);
3215 ap = hp->next;
3216 while (ap != (acache_t *)hp) {
3217 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3218 if ((ap->known & acc) == acc) {
3219 #ifdef DEBUG
3220 nfs_access_cache_hits++;
3221 #endif
3222 if ((ap->allowed & acc) == acc)
3223 all = NFS_ACCESS_ALLOWED;
3224 else
3225 all = NFS_ACCESS_DENIED;
3226 } else {
3227 #ifdef DEBUG
3228 nfs_access_cache_misses++;
3229 #endif
3230 all = NFS_ACCESS_UNKNOWN;
3231 }
3232 rw_exit(&hp->lock);
3233 return (all);
3234 }
3235 ap = ap->next;
3236 }
3237 rw_exit(&hp->lock);
3238 }
3239
3240 #ifdef DEBUG
3241 nfs_access_cache_misses++;
3242 #endif
3243 return (NFS_ACCESS_UNKNOWN);
3244 }
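
/*
 * Worked example (illustrative): if an earlier ACCESS reply cached
 * known = READ|LOOKUP and allowed = READ for this credential, then a
 * later check for READ returns NFS_ACCESS_ALLOWED, a check for LOOKUP
 * returns NFS_ACCESS_DENIED, and a check for MODIFY returns
 * NFS_ACCESS_UNKNOWN because that bit was never asked of the server.
 */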
3245
3246 void
3247 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3248 {
3249 acache_t *ap;
3250 acache_t *nap;
3251 acache_hash_t *hp;
3252
3253 hp = &acache[acachehash(rp, cr)];
3254
3255 /*
3256 * Allocate now, on the assumption that an allocation will
3257 * most likely be required.  This allows the allocation to
3258 * happen without holding the hash bucket locked.
3259 */
3260 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3261 if (nap != NULL) {
3262 nap->known = acc;
3263 nap->allowed = resacc;
3264 nap->rnode = rp;
3265 crhold(cr);
3266 nap->cred = cr;
3267 nap->hashq = hp;
3268 }
3269
3270 rw_enter(&hp->lock, RW_WRITER);
3271
3272 if (rp->r_acache != NULL) {
3273 ap = hp->next;
3274 while (ap != (acache_t *)hp) {
3275 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3276 ap->known |= acc;
3277 ap->allowed &= ~acc;
3278 ap->allowed |= resacc;
3279 rw_exit(&hp->lock);
3280 if (nap != NULL) {
3281 crfree(nap->cred);
3282 kmem_cache_free(acache_cache, nap);
3283 }
3284 return;
3285 }
3286 ap = ap->next;
3287 }
3288 }
3289
3290 if (nap != NULL) {
3291 #ifdef DEBUG
3292 clstat_debug.access.value.ui64++;
3293 #endif
3294 nap->next = hp->next;
3295 hp->next = nap;
3296 nap->next->prev = nap;
3297 nap->prev = (acache_t *)hp;
3298
3299 mutex_enter(&rp->r_statelock);
3300 nap->list = rp->r_acache;
3301 rp->r_acache = nap;
3302 mutex_exit(&rp->r_statelock);
3303 }
3304
3305 rw_exit(&hp->lock);
3306 }
3307
3308 int
3309 nfs_access_purge_rp(rnode_t *rp)
3310 {
3311 acache_t *ap;
3312 acache_t *tmpap;
3313 acache_t *rplist;
3314
3315 /*
3316 * If there aren't any cached entries, then there is nothing
3317 * to free.
3318 */
3319 if (rp->r_acache == NULL)
3320 return (0);
3321
3322 mutex_enter(&rp->r_statelock);
3323 rplist = rp->r_acache;
3324 rp->r_acache = NULL;
3325 mutex_exit(&rp->r_statelock);
3326
3327 /*
3328 * Loop through each entry in the list pointed to in the
3329 * rnode. Remove each of these entries from the hash
3330 * queue that it is on and remove it from the list in
3331 * the rnode.
3332 */
3333 for (ap = rplist; ap != NULL; ap = tmpap) {
3334 rw_enter(&ap->hashq->lock, RW_WRITER);
3335 ap->prev->next = ap->next;
3336 ap->next->prev = ap->prev;
3337 rw_exit(&ap->hashq->lock);
3338
3339 tmpap = ap->list;
3340 crfree(ap->cred);
3341 kmem_cache_free(acache_cache, ap);
3342 #ifdef DEBUG
3343 clstat_debug.access.value.ui64--;
3344 #endif
3345 }
3346
3347 return (1);
3348 }
3349
3350 static const char prefix[] = ".nfs";
3351
3352 static kmutex_t newnum_lock;
3353
3354 int
3355 newnum(void)
3356 {
3357 static uint_t newnum = 0;
3358 uint_t id;
3359
3360 mutex_enter(&newnum_lock);
3361 if (newnum == 0)
3362 newnum = gethrestime_sec() & 0xffff;
3363 id = newnum++;
3364 mutex_exit(&newnum_lock);
3365 return (id);
3366 }
3367
3368 char *
3369 newname(void)
3370 {
3371 char *news;
3372 char *s;
3373 const char *p;
3374 uint_t id;
3375
3376 id = newnum();
3377 news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3378 s = news;
3379 p = prefix;
3380 while (*p != '\0')
3381 *s++ = *p++;
3382 while (id != 0) {
3383 *s++ = "0123456789ABCDEF"[id & 0x0f];
3384 id >>= 4;
3385 }
3386 *s = '\0';
3387 return (news);
3388 }
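
/*
 * Example (illustrative): the hex digits above are emitted least
 * significant nibble first, so an id of 0x1234 yields the name
 * ".nfs4321" and an id of 0x4A3F yields ".nfsF3A4".
 */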
3389
3390 /*
3391 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3392 * framework.
3393 */
3394 static int
3395 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3396 {
3397 ksp->ks_snaptime = gethrtime();
3398 if (rw == KSTAT_WRITE) {
3399 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3400 #ifdef DEBUG
3401 /*
3402 * Currently only the global zone can write to kstats, but we
3403 * add the check just for paranoia.
3404 */
3405 if (INGLOBALZONE(curproc))
3406 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3407 sizeof (clstat_debug));
3408 #endif
3409 } else {
3410 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3411 #ifdef DEBUG
3412 /*
3413 * If we're displaying the "global" debug kstat values, we
3414 * display them as-is to all zones since in fact they apply to
3415 * the system as a whole.
3416 */
3417 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3418 sizeof (clstat_debug));
3419 #endif
3420 }
3421 return (0);
3422 }
3423
3424 static void *
3425 clinit_zone(zoneid_t zoneid)
3426 {
3427 kstat_t *nfs_client_kstat;
3428 struct nfs_clnt *nfscl;
3429 uint_t ndata;
3430
3431 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3432 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3433 nfscl->nfscl_chtable = NULL;
3434 nfscl->nfscl_zoneid = zoneid;
3435
3436 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3437 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3438 #ifdef DEBUG
3439 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3440 #endif
3441 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3442 "misc", KSTAT_TYPE_NAMED, ndata,
3443 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3444 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3445 nfs_client_kstat->ks_snapshot = cl_snapshot;
3446 kstat_install(nfs_client_kstat);
3447 }
3448 mutex_enter(&nfs_clnt_list_lock);
3449 list_insert_head(&nfs_clnt_list, nfscl);
3450 mutex_exit(&nfs_clnt_list_lock);
3451 return (nfscl);
3452 }
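
/*
 * Usage note (assumption, not part of the original source): the
 * per-zone counters published here are visible from userland through
 * the kstat framework, e.g. with something like
 * "kstat -m nfs -n nfs_client".
 */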
3453
3454 /*ARGSUSED*/
3455 static void
3456 clfini_zone(zoneid_t zoneid, void *arg)
3457 {
3458 struct nfs_clnt *nfscl = arg;
3459 chhead_t *chp, *next;
3460
3461 if (nfscl == NULL)
3462 return;
3463 mutex_enter(&nfs_clnt_list_lock);
3464 list_remove(&nfs_clnt_list, nfscl);
3465 mutex_exit(&nfs_clnt_list_lock);
3466 clreclaim_zone(nfscl, 0);
3467 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3468 ASSERT(chp->ch_list == NULL);
3469 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3470 next = chp->ch_next;
3471 kmem_free(chp, sizeof (*chp));
3472 }
3473 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3474 mutex_destroy(&nfscl->nfscl_chtable_lock);
3475 kmem_free(nfscl, sizeof (*nfscl));
3476 }
3477
3478 /*
3479 * Called by endpnt_destructor to make sure the client handles are
3480 * cleaned up before the RPC endpoints. This becomes a no-op if
3481 * clfini_zone (above) is called first. This function is needed
3482 * (rather than relying on clfini_zone to clean up) because the ZSD
3483 * callbacks have no ordering mechanism, so we have no way to ensure
3484 * that clfini_zone is called before endpnt_destructor.
3485 */
3486 void
3487 clcleanup_zone(zoneid_t zoneid)
3488 {
3489 struct nfs_clnt *nfscl;
3490
3491 mutex_enter(&nfs_clnt_list_lock);
3492 nfscl = list_head(&nfs_clnt_list);
3493 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3494 if (nfscl->nfscl_zoneid == zoneid) {
3495 clreclaim_zone(nfscl, 0);
3496 break;
3497 }
3498 }
3499 mutex_exit(&nfs_clnt_list_lock);
3500 }
3501
3502 int
3503 nfs_subrinit(void)
3504 {
3505 int i;
3506 ulong_t nrnode_max;
3507
3508 /*
3509 * Allocate and initialize the rnode hash queues
3510 */
3511 if (nrnode <= 0)
3512 nrnode = ncsize;
3513 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3514 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3515 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3516 "!setting nrnode to max value of %ld", nrnode_max);
3517 nrnode = nrnode_max;
3518 }
3519
3520 rtablesize = 1 << highbit(nrnode / hashlen);
3521 rtablemask = rtablesize - 1;
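	/*
	 * Sizing example (illustrative; assumes highbit() returns the
	 * 1-based index of the most significant set bit): with
	 * nrnode / hashlen == 1000, highbit() yields 10, so rtablesize
	 * is 1 << 10 == 1024 buckets and rtablemask is 0x3ff.
	 */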
3522 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3523 for (i = 0; i < rtablesize; i++) {
3524 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3525 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3526 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3527 }
3528 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3529 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3530
3531 /*
3532 * Allocate and initialize the access cache
3533 */
3534
3535 /*
3536 * The initial guess is one access cache entry per rnode, unless
3537 * nacache is set to a non-zero value, in which case nacache is
3538 * used as the guess for the number of access cache entries.
3539 */
3540 if (nacache > 0)
3541 acachesize = 1 << highbit(nacache / hashlen);
3542 else
3543 acachesize = rtablesize;
3544 acachemask = acachesize - 1;
3545 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3546 for (i = 0; i < acachesize; i++) {
3547 acache[i].next = (acache_t *)&acache[i];
3548 acache[i].prev = (acache_t *)&acache[i];
3549 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3550 }
3551 acache_cache = kmem_cache_create("nfs_access_cache",
3552 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3553 /*
3554 * Allocate and initialize the client handle cache
3555 */
3556 chtab_cache = kmem_cache_create("client_handle_cache",
3557 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3558 /*
3559 * Initialize the list of per-zone client handles (and associated data).
3560 * This needs to be done before we call zone_key_create().
3561 */
3562 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3563 offsetof(struct nfs_clnt, nfscl_node));
3564 /*
3565 * Initialize the zone_key for per-zone client handle lists.
3566 */
3567 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3568 /*
3569 * Initialize the various mutexes and reader/writer locks
3570 */
3571 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3572 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3573 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3574
3575 /*
3576 * Assign unique major number for all nfs mounts
3577 */
3578 if ((nfs_major = getudev()) == -1) {
3579 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3580 "nfs: init: can't get unique device number");
3581 nfs_major = 0;
3582 }
3583 nfs_minor = 0;
3584
3585 if (nfs3_jukebox_delay == 0)
3586 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3587
3588 return (0);
3589 }
3590
3591 void
3592 nfs_subrfini(void)
3593 {
3594 int i;
3595
3596 /*
3597 * Deallocate the rnode hash queues
3598 */
3599 kmem_cache_destroy(rnode_cache);
3600
3601 for (i = 0; i < rtablesize; i++)
3602 rw_destroy(&rtable[i].r_lock);
3603 kmem_free(rtable, rtablesize * sizeof (*rtable));
3604
3605 /*
3606 * Deallocate the access cache
3607 */
3608 kmem_cache_destroy(acache_cache);
3609
3610 for (i = 0; i < acachesize; i++)
3611 rw_destroy(&acache[i].lock);
3612 kmem_free(acache, acachesize * sizeof (*acache));
3613
3614 /*
3615 * Deallocate the client handle cache
3616 */
3617 kmem_cache_destroy(chtab_cache);
3618
3619 /*
3620 * Destroy the various mutexes and reader/writer locks
3621 */
3622 mutex_destroy(&rpfreelist_lock);
3623 mutex_destroy(&newnum_lock);
3624 mutex_destroy(&nfs_minor_lock);
3625 (void) zone_key_delete(nfsclnt_zone_key);
3626 }
3627
3628 enum nfsstat
3629 puterrno(int error)
3630 {
3631
3632 switch (error) {
3633 case EOPNOTSUPP:
3634 return (NFSERR_OPNOTSUPP);
3635 case ENAMETOOLONG:
3636 return (NFSERR_NAMETOOLONG);
3637 case ENOTEMPTY:
3638 return (NFSERR_NOTEMPTY);
3639 case EDQUOT:
3640 return (NFSERR_DQUOT);
3641 case ESTALE:
3642 return (NFSERR_STALE);
3643 case EREMOTE:
3644 return (NFSERR_REMOTE);
3645 case ENOSYS:
3646 return (NFSERR_OPNOTSUPP);
3647 case EOVERFLOW:
3648 return (NFSERR_INVAL);
3649 default:
3650 return ((enum nfsstat)error);
3651 }
3652 /* NOTREACHED */
3653 }
3654
3655 int
3656 geterrno(enum nfsstat status)
3657 {
3658
3659 switch (status) {
3660 case NFSERR_OPNOTSUPP:
3661 return (EOPNOTSUPP);
3662 case NFSERR_NAMETOOLONG:
3663 return (ENAMETOOLONG);
3664 case NFSERR_NOTEMPTY:
3665 return (ENOTEMPTY);
3666 case NFSERR_DQUOT:
3667 return (EDQUOT);
3668 case NFSERR_STALE:
3669 return (ESTALE);
3670 case NFSERR_REMOTE:
3671 return (EREMOTE);
3672 case NFSERR_WFLUSH:
3673 return (EIO);
3674 default:
3675 return ((int)status);
3676 }
3677 /* NOTREACHED */
3678 }
3679
3680 enum nfsstat3
3681 puterrno3(int error)
3682 {
3683
3684 #ifdef DEBUG
3685 switch (error) {
3686 case 0:
3687 return (NFS3_OK);
3688 case EPERM:
3689 return (NFS3ERR_PERM);
3690 case ENOENT:
3691 return (NFS3ERR_NOENT);
3692 case EIO:
3693 return (NFS3ERR_IO);
3694 case ENXIO:
3695 return (NFS3ERR_NXIO);
3696 case EACCES:
3697 return (NFS3ERR_ACCES);
3698 case EEXIST:
3699 return (NFS3ERR_EXIST);
3700 case EXDEV:
3701 return (NFS3ERR_XDEV);
3702 case ENODEV:
3703 return (NFS3ERR_NODEV);
3704 case ENOTDIR:
3705 return (NFS3ERR_NOTDIR);
3706 case EISDIR:
3707 return (NFS3ERR_ISDIR);
3708 case EINVAL:
3709 return (NFS3ERR_INVAL);
3710 case EFBIG:
3711 return (NFS3ERR_FBIG);
3712 case ENOSPC:
3713 return (NFS3ERR_NOSPC);
3714 case EROFS:
3715 return (NFS3ERR_ROFS);
3716 case EMLINK:
3717 return (NFS3ERR_MLINK);
3718 case ENAMETOOLONG:
3719 return (NFS3ERR_NAMETOOLONG);
3720 case ENOTEMPTY:
3721 return (NFS3ERR_NOTEMPTY);
3722 case EDQUOT:
3723 return (NFS3ERR_DQUOT);
3724 case ESTALE:
3725 return (NFS3ERR_STALE);
3726 case EREMOTE:
3727 return (NFS3ERR_REMOTE);
3728 case ENOSYS:
3729 case EOPNOTSUPP:
3730 return (NFS3ERR_NOTSUPP);
3731 case EOVERFLOW:
3732 return (NFS3ERR_INVAL);
3733 default:
3734 zcmn_err(getzoneid(), CE_WARN,
3735 "puterrno3: got error %d", error);
3736 return ((enum nfsstat3)error);
3737 }
3738 #else
3739 switch (error) {
3740 case ENAMETOOLONG:
3741 return (NFS3ERR_NAMETOOLONG);
3742 case ENOTEMPTY:
3743 return (NFS3ERR_NOTEMPTY);
3744 case EDQUOT:
3745 return (NFS3ERR_DQUOT);
3746 case ESTALE:
3747 return (NFS3ERR_STALE);
3748 case ENOSYS:
3749 case EOPNOTSUPP:
3750 return (NFS3ERR_NOTSUPP);
3751 case EREMOTE:
3752 return (NFS3ERR_REMOTE);
3753 case EOVERFLOW:
3754 return (NFS3ERR_INVAL);
3755 default:
3756 return ((enum nfsstat3)error);
3757 }
3758 #endif
3759 }
3760
3761 int
3762 geterrno3(enum nfsstat3 status)
3763 {
3764
3765 #ifdef DEBUG
3766 switch (status) {
3767 case NFS3_OK:
3768 return (0);
3769 case NFS3ERR_PERM:
3770 return (EPERM);
3771 case NFS3ERR_NOENT:
3772 return (ENOENT);
3773 case NFS3ERR_IO:
3774 return (EIO);
3775 case NFS3ERR_NXIO:
3776 return (ENXIO);
3777 case NFS3ERR_ACCES:
3778 return (EACCES);
3779 case NFS3ERR_EXIST:
3780 return (EEXIST);
3781 case NFS3ERR_XDEV:
3782 return (EXDEV);
3783 case NFS3ERR_NODEV:
3784 return (ENODEV);
3785 case NFS3ERR_NOTDIR:
3786 return (ENOTDIR);
3787 case NFS3ERR_ISDIR:
3788 return (EISDIR);
3789 case NFS3ERR_INVAL:
3790 return (EINVAL);
3791 case NFS3ERR_FBIG:
3792 return (EFBIG);
3793 case NFS3ERR_NOSPC:
3794 return (ENOSPC);
3795 case NFS3ERR_ROFS:
3796 return (EROFS);
3797 case NFS3ERR_MLINK:
3798 return (EMLINK);
3799 case NFS3ERR_NAMETOOLONG:
3800 return (ENAMETOOLONG);
3801 case NFS3ERR_NOTEMPTY:
3802 return (ENOTEMPTY);
3803 case NFS3ERR_DQUOT:
3804 return (EDQUOT);
3805 case NFS3ERR_STALE:
3806 return (ESTALE);
3807 case NFS3ERR_REMOTE:
3808 return (EREMOTE);
3809 case NFS3ERR_BADHANDLE:
3810 return (ESTALE);
3811 case NFS3ERR_NOT_SYNC:
3812 return (EINVAL);
3813 case NFS3ERR_BAD_COOKIE:
3814 return (ENOENT);
3815 case NFS3ERR_NOTSUPP:
3816 return (EOPNOTSUPP);
3817 case NFS3ERR_TOOSMALL:
3818 return (EINVAL);
3819 case NFS3ERR_SERVERFAULT:
3820 return (EIO);
3821 case NFS3ERR_BADTYPE:
3822 return (EINVAL);
3823 case NFS3ERR_JUKEBOX:
3824 return (ENXIO);
3825 default:
3826 zcmn_err(getzoneid(), CE_WARN,
3827 "geterrno3: got status %d", status);
3828 return ((int)status);
3829 }
3830 #else
3831 switch (status) {
3832 case NFS3ERR_NAMETOOLONG:
3833 return (ENAMETOOLONG);
3834 case NFS3ERR_NOTEMPTY:
3835 return (ENOTEMPTY);
3836 case NFS3ERR_DQUOT:
3837 return (EDQUOT);
3838 case NFS3ERR_STALE:
3839 case NFS3ERR_BADHANDLE:
3840 return (ESTALE);
3841 case NFS3ERR_NOTSUPP:
3842 return (EOPNOTSUPP);
3843 case NFS3ERR_REMOTE:
3844 return (EREMOTE);
3845 case NFS3ERR_NOT_SYNC:
3846 case NFS3ERR_TOOSMALL:
3847 case NFS3ERR_BADTYPE:
3848 return (EINVAL);
3849 case NFS3ERR_BAD_COOKIE:
3850 return (ENOENT);
3851 case NFS3ERR_SERVERFAULT:
3852 return (EIO);
3853 case NFS3ERR_JUKEBOX:
3854 return (ENXIO);
3855 default:
3856 return ((int)status);
3857 }
3858 #endif
3859 }
3860
3861 rddir_cache *
3862 rddir_cache_alloc(int flags)
3863 {
3864 rddir_cache *rc;
3865
3866 rc = kmem_alloc(sizeof (*rc), flags);
3867 if (rc != NULL) {
3868 rc->entries = NULL;
3869 rc->flags = RDDIR;
3870 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3871 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3872 rc->count = 1;
3873 #ifdef DEBUG
3874 atomic_inc_64(&clstat_debug.dirent.value.ui64);
3875 #endif
3876 }
3877 return (rc);
3878 }
3879
3880 static void
3881 rddir_cache_free(rddir_cache *rc)
3882 {
3883
3884 #ifdef DEBUG
3885 atomic_dec_64(&clstat_debug.dirent.value.ui64);
3886 #endif
3887 if (rc->entries != NULL) {
3888 #ifdef DEBUG
3889 rddir_cache_buf_free(rc->entries, rc->buflen);
3890 #else
3891 kmem_free(rc->entries, rc->buflen);
3892 #endif
3893 }
3894 cv_destroy(&rc->cv);
3895 mutex_destroy(&rc->lock);
3896 kmem_free(rc, sizeof (*rc));
3897 }
3898
3899 void
3900 rddir_cache_hold(rddir_cache *rc)
3901 {
3902
3903 mutex_enter(&rc->lock);
3904 rc->count++;
3905 mutex_exit(&rc->lock);
3906 }
3907
3908 void
3909 rddir_cache_rele(rddir_cache *rc)
3910 {
3911
3912 mutex_enter(&rc->lock);
3913 ASSERT(rc->count > 0);
3914 if (--rc->count == 0) {
3915 mutex_exit(&rc->lock);
3916 rddir_cache_free(rc);
3917 } else
3918 mutex_exit(&rc->lock);
3919 }
3920
3921 #ifdef DEBUG
3922 char *
3923 rddir_cache_buf_alloc(size_t size, int flags)
3924 {
3925 char *rc;
3926
3927 rc = kmem_alloc(size, flags);
3928 if (rc != NULL)
3929 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3930 return (rc);
3931 }
3932
3933 void
3934 rddir_cache_buf_free(void *addr, size_t size)
3935 {
3936
3937 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3938 kmem_free(addr, size);
3939 }
3940 #endif
3941
3942 static int
3943 nfs_free_data_reclaim(rnode_t *rp)
3944 {
3945 char *contents;
3946 int size;
3947 vsecattr_t *vsp;
3948 nfs3_pathconf_info *info;
3949 int freed;
3950 cred_t *cred;
3951
3952 /*
3953 * Free any held credentials and caches which
3954 * may be associated with this rnode.
3955 */
3956 mutex_enter(&rp->r_statelock);
3957 cred = rp->r_cred;
3958 rp->r_cred = NULL;
3959 contents = rp->r_symlink.contents;
3960 size = rp->r_symlink.size;
3961 rp->r_symlink.contents = NULL;
3962 vsp = rp->r_secattr;
3963 rp->r_secattr = NULL;
3964 info = rp->r_pathconf;
3965 rp->r_pathconf = NULL;
3966 mutex_exit(&rp->r_statelock);
3967
3968 if (cred != NULL)
3969 crfree(cred);
3970
3971 /*
3972 * Free the access cache entries.
3973 */
3974 freed = nfs_access_purge_rp(rp);
3975
3976 if (!HAVE_RDDIR_CACHE(rp) &&
3977 contents == NULL &&
3978 vsp == NULL &&
3979 info == NULL)
3980 return (freed);
3981
3982 /*
3983 * Free the readdir cache entries
3984 */
3985 if (HAVE_RDDIR_CACHE(rp))
3986 nfs_purge_rddir_cache(RTOV(rp));
3987
3988 /*
3989 * Free the symbolic link cache.
3990 */
3991 if (contents != NULL) {
3992
3993 kmem_free((void *)contents, size);
3994 }
3995
3996 /*
3997 * Free any cached ACL.
3998 */
3999 if (vsp != NULL)
4000 nfs_acl_free(vsp);
4001
4002 /*
4003 * Free any cached pathconf information.
4004 */
4005 if (info != NULL)
4006 kmem_free(info, sizeof (*info));
4007
4008 return (1);
4009 }
4010
4011 static int
4012 nfs_active_data_reclaim(rnode_t *rp)
4013 {
4014 char *contents;
4015 int size;
4016 vsecattr_t *vsp;
4017 nfs3_pathconf_info *info;
4018 int freed;
4019
4020 /*
4021 * Free any held credentials and caches which
4022 * may be associated with this rnode.
4023 */
4024 if (!mutex_tryenter(&rp->r_statelock))
4025 return (0);
4026 contents = rp->r_symlink.contents;
4027 size = rp->r_symlink.size;
4028 rp->r_symlink.contents = NULL;
4029 vsp = rp->r_secattr;
4030 rp->r_secattr = NULL;
4031 info = rp->r_pathconf;
4032 rp->r_pathconf = NULL;
4033 mutex_exit(&rp->r_statelock);
4034
4035 /*
4036 * Free the access cache entries.
4037 */
4038 freed = nfs_access_purge_rp(rp);
4039
4040 if (!HAVE_RDDIR_CACHE(rp) &&
4041 contents == NULL &&
4042 vsp == NULL &&
4043 info == NULL)
4044 return (freed);
4045
4046 /*
4047 * Free the readdir cache entries
4048 */
4049 if (HAVE_RDDIR_CACHE(rp))
4050 nfs_purge_rddir_cache(RTOV(rp));
4051
4052 /*
4053 * Free the symbolic link cache.
4054 */
4055 if (contents != NULL) {
4056
4057 kmem_free((void *)contents, size);
4058 }
4059
4060 /*
4061 * Free any cached ACL.
4062 */
4063 if (vsp != NULL)
4064 nfs_acl_free(vsp);
4065
4066 /*
4067 * Free any cached pathconf information.
4068 */
4069 if (info != NULL)
4070 kmem_free(info, sizeof (*info));
4071
4072 return (1);
4073 }
4074
4075 static int
4076 nfs_free_reclaim(void)
4077 {
4078 int freed;
4079 rnode_t *rp;
4080
4081 #ifdef DEBUG
4082 clstat_debug.f_reclaim.value.ui64++;
4083 #endif
4084 freed = 0;
4085 mutex_enter(&rpfreelist_lock);
4086 rp = rpfreelist;
4087 if (rp != NULL) {
4088 do {
4089 if (nfs_free_data_reclaim(rp))
4090 freed = 1;
4091 } while ((rp = rp->r_freef) != rpfreelist);
4092 }
4093 mutex_exit(&rpfreelist_lock);
4094 return (freed);
4095 }
4096
4097 static int
4098 nfs_active_reclaim(void)
4099 {
4100 int freed;
4101 int index;
4102 rnode_t *rp;
4103
4104 #ifdef DEBUG
4105 clstat_debug.a_reclaim.value.ui64++;
4106 #endif
4107 freed = 0;
4108 for (index = 0; index < rtablesize; index++) {
4109 rw_enter(&rtable[index].r_lock, RW_READER);
4110 for (rp = rtable[index].r_hashf;
4111 rp != (rnode_t *)(&rtable[index]);
4112 rp = rp->r_hashf) {
4113 if (nfs_active_data_reclaim(rp))
4114 freed = 1;
4115 }
4116 rw_exit(&rtable[index].r_lock);
4117 }
4118 return (freed);
4119 }
4120
4121 static int
4122 nfs_rnode_reclaim(void)
4123 {
4124 int freed;
4125 rnode_t *rp;
4126 vnode_t *vp;
4127
4128 #ifdef DEBUG
4129 clstat_debug.r_reclaim.value.ui64++;
4130 #endif
4131 freed = 0;
4132 mutex_enter(&rpfreelist_lock);
4133 while ((rp = rpfreelist) != NULL) {
4134 rp_rmfree(rp);
4135 mutex_exit(&rpfreelist_lock);
4136 if (rp->r_flags & RHASHED) {
4137 vp = RTOV(rp);
4138 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4139 mutex_enter(&vp->v_lock);
4140 if (vp->v_count > 1) {
4141 VN_RELE_LOCKED(vp);
4142 mutex_exit(&vp->v_lock);
4143 rw_exit(&rp->r_hashq->r_lock);
4144 mutex_enter(&rpfreelist_lock);
4145 continue;
4146 }
4147 mutex_exit(&vp->v_lock);
4148 rp_rmhash_locked(rp);
4149 rw_exit(&rp->r_hashq->r_lock);
4150 }
4151 /*
4152 * This call to rp_addfree will end up destroying the
4153 * rnode, but in a safe way with the appropriate set
4154 * of checks done.
4155 */
4156 rp_addfree(rp, CRED());
4157 mutex_enter(&rpfreelist_lock);
4158 }
4159 mutex_exit(&rpfreelist_lock);
4160 return (freed);
4161 }
4162
4163 /*ARGSUSED*/
4164 static void
4165 nfs_reclaim(void *cdrarg)
4166 {
4167
4168 #ifdef DEBUG
4169 clstat_debug.reclaim.value.ui64++;
4170 #endif
4171 if (nfs_free_reclaim())
4172 return;
4173
4174 if (nfs_active_reclaim())
4175 return;
4176
4177 (void) nfs_rnode_reclaim();
4178 }
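
/*
 * Illustrative sketch only (not part of the original code): nfs_reclaim()
 * is the sort of callback that gets registered as the kmem reclaim hook
 * for the rnode cache, so the allocator can ask the client to shed cached
 * state under memory pressure.  The cache name, variable, and guard macro
 * below are hypothetical.
 */
#ifdef NFS_RECLAIM_EXAMPLE
static kmem_cache_t *example_rnode_cache;

static void
nfs_reclaim_register_example(void)
{
	/* No constructor/destructor; nfs_reclaim() handles memory pressure. */
	example_rnode_cache = kmem_cache_create("example_rnode_cache",
	    sizeof (rnode_t), 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
}
#endif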
4179
4180 /*
4181 * NFS client failover support
4182 *
4183 * Routines to copy filehandles
4184 */
4185 void
4186 nfscopyfh(caddr_t fhp, vnode_t *vp)
4187 {
4188 fhandle_t *dest = (fhandle_t *)fhp;
4189
4190 if (dest != NULL)
4191 *dest = *VTOFH(vp);
4192 }
4193
4194 void
4195 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4196 {
4197 nfs_fh3 *dest = (nfs_fh3 *)fhp;
4198
4199 if (dest != NULL)
4200 *dest = *VTOFH3(vp);
4201 }
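
/*
 * Illustrative sketch only (not part of the original code): how a call
 * site might fill in a failinfo_t so that failover can refresh the
 * filehandle embedded in its RPC arguments via nfscopyfh().  The function
 * name, parameter names, and guard macro are hypothetical; the lookup
 * routine is whatever the caller normally uses for over-the-wire lookups.
 */
#ifdef NFS_FAILINFO_EXAMPLE
static void
failinfo_setup_example(failinfo_t *fi, vnode_t *vp, fhandle_t *args_fh,
    int (*lookup)(vnode_t *, char *, vnode_t **, struct pathname *, int,
    vnode_t *, cred_t *, int))
{
	fi->vp = vp;
	fi->fhp = (caddr_t)args_fh;	/* filehandle inside the RPC args */
	fi->copyproc = nfscopyfh;	/* rewrites *args_fh after a remap */
	fi->lookupproc = lookup;
	fi->xattrdirproc = NULL;	/* real callers pass their xattr */
					/* directory lookup routine here */
}
#endif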
4202
4203 /*
4204 * NFS client failover support
4205 *
4206 * failover_safe() will test various conditions to ensure that
4207 * failover is permitted for this vnode. It will be denied
4208 * if:
4209 * 1) the operation in progress does not support failover (NULL fi)
4210 * 2) there are no available replicas (NULL mi_servers->sv_next)
4211 * 3) any locks are outstanding on this file
4212 */
4213 static int
4214 failover_safe(failinfo_t *fi)
4215 {
4216
4217 /*
4218 * Does this op permit failover?
4219 */
4220 if (fi == NULL || fi->vp == NULL)
4221 return (0);
4222
4223 /*
4224 * Are there any alternates to failover to?
4225 */
4226 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4227 return (0);
4228
4229 /*
4230 * Disable check; we've forced local locking
4231 *
4232 * if (flk_has_remote_locks(fi->vp))
4233 * return (0);
4234 */
4235
4236 /*
4237 * If we have no partial path, we can't do anything
4238 */
4239 if (VTOR(fi->vp)->r_path == NULL)
4240 return (0);
4241
4242 return (1);
4243 }
4244
4245 #include <sys/thread.h>
4246
4247 /*
4248 * NFS client failover support
4249 *
4250 * failover_newserver() will start a search for a new server,
4251 * preferably by starting an async thread to do the work. If
4252 * someone is already doing this (recognizable by MI_BINDINPROG
4253 * being set), it will simply return and the calling thread
4254 * will queue on the mi_failover_cv condition variable.
4255 */
4256 static void
4257 failover_newserver(mntinfo_t *mi)
4258 {
4259 /*
4260 * Check if someone else is doing this already
4261 */
4262 mutex_enter(&mi->mi_lock);
4263 if (mi->mi_flags & MI_BINDINPROG) {
4264 mutex_exit(&mi->mi_lock);
4265 return;
4266 }
4267 mi->mi_flags |= MI_BINDINPROG;
4268
4269 /*
4270 * Need to hold the vfs struct so that it can't be released
4271 * while the failover thread is selecting a new server.
4272 */
4273 VFS_HOLD(mi->mi_vfsp);
4274
4275 /*
4276 * Start a thread to do the real searching.
4277 */
4278 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4279
4280 mutex_exit(&mi->mi_lock);
4281 }
4282
4283 /*
4284 * NFS client failover support
4285 *
4286 * failover_thread() will find a new server to replace the one
4287 * currently in use, wake up other threads waiting on this mount
4288 * point, and die. It will start at the head of the server list
4289 * and poll servers until it finds one with an NFS server which is
4290 * registered and responds to a NULL procedure ping.
4291 *
4292 * XXX failover_thread is unsafe within the scope of the
4293 * present model defined for cpr to suspend the system.
4294 * Specifically, over-the-wire calls made by the thread
4295 * are unsafe. The thread needs to be reevaluated in case of
4296 * future updates to the cpr suspend model.
4297 */
4298 static void
4299 failover_thread(mntinfo_t *mi)
4300 {
4301 servinfo_t *svp = NULL;
4302 CLIENT *cl;
4303 enum clnt_stat status;
4304 struct timeval tv;
4305 int error;
4306 int oncethru = 0;
4307 callb_cpr_t cprinfo;
4308 rnode_t *rp;
4309 int index;
4310 char *srvnames;
4311 size_t srvnames_len;
4312 struct nfs_clnt *nfscl = NULL;
4313 zoneid_t zoneid = getzoneid();
4314
4315 #ifdef DEBUG
4316 /*
4317 * This is currently only needed to access counters which exist on
4318 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4319 * on non-DEBUG kernels.
4320 */
4321 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4322 ASSERT(nfscl != NULL);
4323 #endif
4324
4325 /*
4326 * It's safe to piggyback on the mi_lock since the failover_newserver()
4327 * code guarantees that there will be only one failover thread
4328 * per mountinfo at any given time.
4329 */
4330 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4331 "failover_thread");
4332
4333 mutex_enter(&mi->mi_lock);
4334 while (mi->mi_readers) {
4335 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4336 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4337 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4338 }
4339 mutex_exit(&mi->mi_lock);
4340
4341 tv.tv_sec = 2;
4342 tv.tv_usec = 0;
4343
4344 /*
4345 * Ping the null NFS procedure of every server in
4346 * the list until one responds. We always start
4347 * at the head of the list and always skip the one
4348 * that is current, since it's caused us a problem.
4349 */
4350 while (svp == NULL) {
4351 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4352 if (!oncethru && svp == mi->mi_curr_serv)
4353 continue;
4354
4355 /*
4356 * If the file system was forcibly umounted
4357 * while trying to do a failover, then just
4358 * give up on the failover. It won't matter
4359 * what the server is.
4360 */
4361 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4362 svp = NULL;
4363 goto done;
4364 }
4365
4366 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4367 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4368 if (error)
4369 continue;
4370
4371 if (!(mi->mi_flags & MI_INT))
4372 cl->cl_nosignal = TRUE;
4373 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4374 xdr_void, NULL, tv);
4375 if (!(mi->mi_flags & MI_INT))
4376 cl->cl_nosignal = FALSE;
4377 AUTH_DESTROY(cl->cl_auth);
4378 CLNT_DESTROY(cl);
4379 if (status == RPC_SUCCESS) {
4380 if (svp == mi->mi_curr_serv) {
4381 #ifdef DEBUG
4382 zcmn_err(zoneid, CE_NOTE,
4383 "NFS%d: failing over: selecting original server %s",
4384 mi->mi_vers, svp->sv_hostname);
4385 #else
4386 zcmn_err(zoneid, CE_NOTE,
4387 "NFS: failing over: selecting original server %s",
4388 svp->sv_hostname);
4389 #endif
4390 } else {
4391 #ifdef DEBUG
4392 zcmn_err(zoneid, CE_NOTE,
4393 "NFS%d: failing over from %s to %s",
4394 mi->mi_vers,
4395 mi->mi_curr_serv->sv_hostname,
4396 svp->sv_hostname);
4397 #else
4398 zcmn_err(zoneid, CE_NOTE,
4399 "NFS: failing over from %s to %s",
4400 mi->mi_curr_serv->sv_hostname,
4401 svp->sv_hostname);
4402 #endif
4403 }
4404 break;
4405 }
4406 }
4407
4408 if (svp == NULL) {
4409 if (!oncethru) {
4410 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4411 #ifdef DEBUG
4412 zprintf(zoneid,
4413 "NFS%d servers %s not responding "
4414 "still trying\n", mi->mi_vers, srvnames);
4415 #else
4416 zprintf(zoneid, "NFS servers %s not responding "
4417 "still trying\n", srvnames);
4418 #endif
4419 oncethru = 1;
4420 }
4421 mutex_enter(&mi->mi_lock);
4422 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4423 mutex_exit(&mi->mi_lock);
4424 delay(hz);
4425 mutex_enter(&mi->mi_lock);
4426 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4427 mutex_exit(&mi->mi_lock);
4428 }
4429 }
4430
4431 if (oncethru) {
4432 #ifdef DEBUG
4433 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4434 #else
4435 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4436 #endif
4437 }
4438
4439 if (svp != mi->mi_curr_serv) {
4440 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4441 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4442 rw_enter(&rtable[index].r_lock, RW_WRITER);
4443 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4444 mi->mi_vfsp);
4445 if (rp != NULL) {
4446 if (rp->r_flags & RHASHED)
4447 rp_rmhash_locked(rp);
4448 rw_exit(&rtable[index].r_lock);
4449 rp->r_server = svp;
4450 rp->r_fh = svp->sv_fhandle;
4451 (void) nfs_free_data_reclaim(rp);
4452 index = rtablehash(&rp->r_fh);
4453 rp->r_hashq = &rtable[index];
4454 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4455 vn_exists(RTOV(rp));
4456 rp_addhash(rp);
4457 rw_exit(&rp->r_hashq->r_lock);
4458 VN_RELE(RTOV(rp));
4459 } else
4460 rw_exit(&rtable[index].r_lock);
4461 }
4462
4463 done:
4464 if (oncethru)
4465 kmem_free(srvnames, srvnames_len);
4466 mutex_enter(&mi->mi_lock);
4467 mi->mi_flags &= ~MI_BINDINPROG;
4468 if (svp != NULL) {
4469 mi->mi_curr_serv = svp;
4470 mi->mi_failover++;
4471 #ifdef DEBUG
4472 nfscl->nfscl_stat.failover.value.ui64++;
4473 #endif
4474 }
4475 cv_broadcast(&mi->mi_failover_cv);
4476 CALLB_CPR_EXIT(&cprinfo);
4477 VFS_RELE(mi->mi_vfsp);
4478 zthread_exit();
4479 /* NOTREACHED */
4480 }
4481
4482 /*
4483 * NFS client failover support
4484 *
4485 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4486 * is cleared, meaning that failover is complete. Called with
4487 * mi_lock mutex held.
4488 */
4489 static int
4490 failover_wait(mntinfo_t *mi)
4491 {
4492 k_sigset_t smask;
4493
4494 /*
4495 * If someone else is hunting for a living server,
4496 * sleep until it's done. After our sleep, we may
4497 * be bound to the right server and get off cheaply.
4498 */
4499 while (mi->mi_flags & MI_BINDINPROG) {
4500 /*
4501 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4502 * and SIGTERM. (Preserving the existing masks).
4503 * Mask out SIGINT if mount option nointr is specified.
4504 */
4505 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4506 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4507 /*
4508 * restore original signal mask
4509 */
4510 sigunintr(&smask);
4511 return (EINTR);
4512 }
4513 /*
4514 * restore original signal mask
4515 */
4516 sigunintr(&smask);
4517 }
4518 return (0);
4519 }
4520
4521 /*
4522 * NFS client failover support
4523 *
4524 * failover_remap() will do a partial pathname lookup and find the
4525 * desired vnode on the current server. The interim vnode will be
4526 * discarded after we pilfer the new filehandle.
4527 *
4528 * Side effects:
4529 * - This routine will also update the filehandle in the args structure
4530 * pointed to by the fi->fhp pointer if it is non-NULL.
4531 */
4532
4533 static int
4534 failover_remap(failinfo_t *fi)
4535 {
4536 vnode_t *vp, *nvp, *rootvp;
4537 rnode_t *rp, *nrp;
4538 mntinfo_t *mi;
4539 int error;
4540 #ifdef DEBUG
4541 struct nfs_clnt *nfscl;
4542
4543 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4544 ASSERT(nfscl != NULL);
4545 #endif
4546 /*
4547 * Sanity check
4548 */
4549 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4550 return (EINVAL);
4551 vp = fi->vp;
4552 rp = VTOR(vp);
4553 mi = VTOMI(vp);
4554
4555 if (!(vp->v_flag & VROOT)) {
4556 /*
4557 * Given the root fh, use the path stored in
4558 * the rnode to find the fh for the new server.
4559 */
4560 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4561 if (error)
4562 return (error);
4563
4564 error = failover_lookup(rp->r_path, rootvp,
4565 fi->lookupproc, fi->xattrdirproc, &nvp);
4566
4567 VN_RELE(rootvp);
4568
4569 if (error)
4570 return (error);
4571
4572 /*
4573 * If we found the same rnode, we're done now
4574 */
4575 if (nvp == vp) {
4576 /*
4577 * The failed server and the new server may physically be
4578 * the same, or may share the same disk subsystem. In that
4579 * case the filehandle for a particular file path is not
4580 * going to change, so a lookup on the same filehandle will
4581 * always locate the same rnode as the existing one. All we
4582 * might need to do is update the r_server field with the
4583 * current servinfo.
4584 */
4585 if (!VALID_FH(fi)) {
4586 rp->r_server = mi->mi_curr_serv;
4587 }
4588 VN_RELE(nvp);
4589 return (0);
4590 }
4591
4592 /*
4593 * Try to make it so that no one else will find this
4594 * vnode because it is just a temporary to hold the
4595 * new file handle until that file handle can be
4596 * copied to the original vnode/rnode.
4597 */
4598 nrp = VTOR(nvp);
4599 mutex_enter(&mi->mi_remap_lock);
4600 /*
4601 * Some other thread could have raced in here and already
4602 * done the remap for this particular rnode. Check
4603 * rp->r_server against mi->mi_curr_serv and return if they
4604 * are the same.
4605 */
4606 if (VALID_FH(fi)) {
4607 mutex_exit(&mi->mi_remap_lock);
4608 VN_RELE(nvp);
4609 return (0);
4610 }
4611
4612 if (nrp->r_flags & RHASHED)
4613 rp_rmhash(nrp);
4614
4615 /*
4616 * As a heuristic check on the validity of the new
4617 * file, check that the size and type match what we
4618 * remember from the old version.
4619 */
4620 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4621 mutex_exit(&mi->mi_remap_lock);
4622 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4623 "NFS replicas %s and %s: file %s not same.",
4624 rp->r_server->sv_hostname,
4625 nrp->r_server->sv_hostname, rp->r_path);
4626 VN_RELE(nvp);
4627 return (EINVAL);
4628 }
4629
4630 /*
4631 * snarf the filehandle from the new rnode
4632 * then release it, again while updating the
4633 * hash queues for the rnode.
4634 */
4635 if (rp->r_flags & RHASHED)
4636 rp_rmhash(rp);
4637 rp->r_server = mi->mi_curr_serv;
4638 rp->r_fh = nrp->r_fh;
4639 rp->r_hashq = nrp->r_hashq;
4640 /*
4641 * Copy the attributes from the new rnode to the old
4642 * rnode. This will help to reduce unnecessary page
4643 * cache flushes.
4644 */
4645 rp->r_attr = nrp->r_attr;
4646 rp->r_attrtime = nrp->r_attrtime;
4647 rp->r_mtime = nrp->r_mtime;
4648 (void) nfs_free_data_reclaim(rp);
4649 nfs_setswaplike(vp, &rp->r_attr);
4650 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4651 rp_addhash(rp);
4652 rw_exit(&rp->r_hashq->r_lock);
4653 mutex_exit(&mi->mi_remap_lock);
4654 VN_RELE(nvp);
4655 }
4656
4657 /*
4658 * Update successful failover remap count
4659 */
4660 mutex_enter(&mi->mi_lock);
4661 mi->mi_remap++;
4662 mutex_exit(&mi->mi_lock);
4663 #ifdef DEBUG
4664 nfscl->nfscl_stat.remap.value.ui64++;
4665 #endif
4666
4667 /*
4668 * If we have a copied filehandle to update, do it now.
4669 */
4670 if (fi->fhp != NULL && fi->copyproc != NULL)
4671 (*fi->copyproc)(fi->fhp, vp);
4672
4673 return (0);
4674 }
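
/*
 * Illustrative sketch only (not part of the original code): roughly how
 * an over-the-wire path could combine the failover helpers above when a
 * server stops responding.  The real RPC path is more involved; the
 * control flow, function name, and guard macro here are a simplified,
 * hypothetical outline.
 */
#ifdef NFS_FAILOVER_EXAMPLE
static int
failover_example(mntinfo_t *mi, failinfo_t *fi)
{
	/* Only attempt failover when the operation and mount permit it. */
	if (!failover_safe(fi))
		return (0);

	/* Kick off (or join) the search for a responsive server ... */
	failover_newserver(mi);

	/* ... and wait, interruptibly, for the binding to complete. */
	mutex_enter(&mi->mi_lock);
	if (failover_wait(mi)) {
		mutex_exit(&mi->mi_lock);
		return (EINTR);
	}
	mutex_exit(&mi->mi_lock);

	/* Re-derive the filehandle for fi->vp on the new server. */
	return (failover_remap(fi));
}
#endif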
4675
4676 /*
4677 * NFS client failover support
4678 *
4679 * We want a simple pathname lookup routine to parse the pieces
4680 * of path in rp->r_path. We know that the path was a created
4681 * as rnodes were made, so we know we have only to deal with
4682 * paths that look like:
4683 * dir1/dir2/dir3/file
4684 * Any evidence of anything like "..", symlinks, or ENOTDIR
4685 * is a hard error, because it means something in this filesystem
4686 * is different from the one we came from, or has changed under
4687 * us in some way. If that is true, we want the failure.
4688 *
4689 * Extended attributes: if the filesystem is mounted with extended
4690 * attributes enabled (-o xattr), the attribute directory will be
4691 * represented in the r_path as the magic name XATTR_RPATH. So if
4692 * we see that name in the pathname, it must be because this node
4693 * is an extended attribute. Therefore, look it up that way.
4694 */
4695 static int
4696 failover_lookup(char *path, vnode_t *root,
4697 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4698 vnode_t *, cred_t *, int),
4699 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4700 vnode_t **new)
4701 {
4702 vnode_t *dvp, *nvp;
4703 int error = EINVAL;
4704 char *s, *p, *tmppath;
4705 size_t len;
4706 mntinfo_t *mi;
4707 bool_t xattr;
4708
4709 /* Make local copy of path */
4710 len = strlen(path) + 1;
4711 tmppath = kmem_alloc(len, KM_SLEEP);
4712 (void) strcpy(tmppath, path);
4713 s = tmppath;
4714
4715 dvp = root;
4716 VN_HOLD(dvp);
4717 mi = VTOMI(root);
4718 xattr = mi->mi_flags & MI_EXTATTR;
4719
4720 do {
4721 p = strchr(s, '/');
4722 if (p != NULL)
4723 *p = '\0';
4724 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4725 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4726 RFSCALL_SOFT);
4727 } else {
4728 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4729 CRED(), RFSCALL_SOFT);
4730 }
4731 if (p != NULL)
4732 *p++ = '/';
4733 if (error) {
4734 VN_RELE(dvp);
4735 kmem_free(tmppath, len);
4736 return (error);
4737 }
4738 s = p;
4739 VN_RELE(dvp);
4740 dvp = nvp;
4741 } while (p != NULL);
4742
4743 if (nvp != NULL && new != NULL)
4744 *new = nvp;
4745 kmem_free(tmppath, len);
4746 return (0);
4747 }
4748
4749 /*
4750 * NFS client failover support
4751 *
4752 * sv_free() frees the malloc'd portion of a "servinfo_t".
4753 */
4754 void
4755 sv_free(servinfo_t *svp)
4756 {
4757 servinfo_t *next;
4758 struct knetconfig *knconf;
4759
4760 while (svp != NULL) {
4761 next = svp->sv_next;
4762 if (svp->sv_secdata)
4763 sec_clnt_freeinfo(svp->sv_secdata);
4764 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4765 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4766 knconf = svp->sv_knconf;
4767 if (knconf != NULL) {
4768 if (knconf->knc_protofmly != NULL)
4769 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4770 if (knconf->knc_proto != NULL)
4771 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4772 kmem_free(knconf, sizeof (*knconf));
4773 }
4774 knconf = svp->sv_origknconf;
4775 if (knconf != NULL) {
4776 if (knconf->knc_protofmly != NULL)
4777 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4778 if (knconf->knc_proto != NULL)
4779 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4780 kmem_free(knconf, sizeof (*knconf));
4781 }
4782 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4783 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4784 mutex_destroy(&svp->sv_lock);
4785 kmem_free(svp, sizeof (*svp));
4786 svp = next;
4787 }
4788 }
4789
4790 /*
4791 * Can only return non-zero if intr != 0.
4792 */
4793 int
4794 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4795 {
4796
4797 mutex_enter(&l->lock);
4798
4799 /*
4800 * If this is a nested enter, then allow it. There
4801 * must be as many exits as enters through.
4802 * must be as many exits as enters, though.
4803 if (l->owner == curthread) {
4804 /* lock is held for writing by current thread */
4805 ASSERT(rw == RW_READER || rw == RW_WRITER);
4806 l->count--;
4807 } else if (rw == RW_READER) {
4808 /*
4809 * While there is a writer active or writers waiting,
4810 * then wait for them to finish up and move on. Then,
4811 * increment the count to indicate that a reader is
4812 * active.
4813 */
4814 while (l->count < 0 || l->waiters > 0) {
4815 if (intr) {
4816 klwp_t *lwp = ttolwp(curthread);
4817
4818 if (lwp != NULL)
4819 lwp->lwp_nostop++;
4820 if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4821 if (lwp != NULL)
4822 lwp->lwp_nostop--;
4823 mutex_exit(&l->lock);
4824 return (EINTR);
4825 }
4826 if (lwp != NULL)
4827 lwp->lwp_nostop--;
4828 } else
4829 cv_wait(&l->cv_rd, &l->lock);
4830 }
4831 ASSERT(l->count < INT_MAX);
4832 #ifdef DEBUG
4833 if ((l->count % 10000) == 9999)
4834 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4835 "rwlock @ %p\n", l->count, (void *)l);
4836 #endif
4837 l->count++;
4838 } else {
4839 ASSERT(rw == RW_WRITER);
4840 /*
4841 * While there are readers active or a writer
4842 * active, then wait for all of the readers
4843 * to finish or for the writer to finish.
4844 * Then, set the owner field to curthread and
4845 * decrement count to indicate that a writer
4846 * is active.
4847 */
4848 while (l->count != 0) {
4849 l->waiters++;
4850 if (intr) {
4851 klwp_t *lwp = ttolwp(curthread);
4852
4853 if (lwp != NULL)
4854 lwp->lwp_nostop++;
4855 if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4856 if (lwp != NULL)
4857 lwp->lwp_nostop--;
4858 l->waiters--;
4859 /*
4860 * If there are readers active and no
4861 * writers waiting then wake up all of
4862 * the waiting readers (if any).
4863 */
4864 if (l->count > 0 && l->waiters == 0)
4865 cv_broadcast(&l->cv_rd);
4866 mutex_exit(&l->lock);
4867 return (EINTR);
4868 }
4869 if (lwp != NULL)
4870 lwp->lwp_nostop--;
4871 } else
4872 cv_wait(&l->cv, &l->lock);
4873 l->waiters--;
4874 }
4875 ASSERT(l->owner == NULL);
4876 l->owner = curthread;
4877 l->count--;
4878 }
4879
4880 mutex_exit(&l->lock);
4881
4882 return (0);
4883 }
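
/*
 * Illustrative sketch only (not part of the original code): the typical
 * caller pattern for nfs_rw_enter_sig()/nfs_rw_exit(), including the
 * EINTR case that is only possible when "intr" is non-zero.  The
 * function and guard macro names are hypothetical.
 */
#ifdef NFS_RWLOCK_EXAMPLE
static int
nfs_rwlock_usage_example(nfs_rwlock_t *rwl, int intr)
{
	/* Take the lock as a reader; allow signals to interrupt if intr. */
	if (nfs_rw_enter_sig(rwl, RW_READER, intr))
		return (EINTR);

	/* ... read-side work protected by the lock ... */

	nfs_rw_exit(rwl);
	return (0);
}
#endif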
4884
4885 /*
4886 * If the lock is available, obtain it and return non-zero. If there is
4887 * already a conflicting lock, return 0 immediately.
4888 */
4889
4890 int
4891 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4892 {
4893 mutex_enter(&l->lock);
4894
4895 /*
4896 * If this is a nested enter, then allow it. There
4897 * must be as many exits as enters through.
4898 * must be as many exits as enters, though.
4899 if (l->owner == curthread) {
4900 /* lock is held for writing by current thread */
4901 ASSERT(rw == RW_READER || rw == RW_WRITER);
4902 l->count--;
4903 } else if (rw == RW_READER) {
4904 /*
4905 * If there is a writer active or writers waiting, deny the
4906 * lock. Otherwise, bump the count of readers.
4907 */
4908 if (l->count < 0 || l->waiters > 0) {
4909 mutex_exit(&l->lock);
4910 return (0);
4911 }
4912 l->count++;
4913 } else {
4914 ASSERT(rw == RW_WRITER);
4915 /*
4916 * If there are readers active or a writer active, deny the
4917 * lock. Otherwise, set the owner field to curthread and
4918 * decrement count to indicate that a writer is active.
4919 */
4920 if (l->count != 0) {
4921 mutex_exit(&l->lock);
4922 return (0);
4923 }
4924 ASSERT(l->owner == NULL);
4925 l->owner = curthread;
4926 l->count--;
4927 }
4928
4929 mutex_exit(&l->lock);
4930
4931 return (1);
4932 }
4933
4934 void
4935 nfs_rw_exit(nfs_rwlock_t *l)
4936 {
4937
4938 mutex_enter(&l->lock);
4939
4940 if (l->owner != NULL) {
4941 ASSERT(l->owner == curthread);
4942
4943 /*
4944 * To release a writer lock, increment count to indicate that
4945 * there is one less writer active. If this was the last of
4946 * possibly nested writer locks, then clear the owner field as
4947 * well to indicate that there is no writer active.
4948 */
4949 ASSERT(l->count < 0);
4950 l->count++;
4951 if (l->count == 0) {
4952 l->owner = NULL;
4953
4954 /*
4955 * If there are no writers waiting then wakeup all of
4956 * the waiting readers (if any).
4957 */
4958 if (l->waiters == 0)
4959 cv_broadcast(&l->cv_rd);
4960 }
4961 } else {
4962 /*
4963 * To release a reader lock, just decrement count to indicate
4964 * that there is one less reader active.
4965 */
4966 ASSERT(l->count > 0);
4967 l->count--;
4968 }
4969
4970 /*
4971 * If there is neither a reader nor a writer active and there is a
4972 * writer waiting, we need to wake it up.
4973 */
4974 if (l->count == 0 && l->waiters > 0)
4975 cv_signal(&l->cv);
4976 mutex_exit(&l->lock);
4977 }
4978
4979 int
4980 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4981 {
4982
4983 if (rw == RW_READER)
4984 return (l->count > 0);
4985 ASSERT(rw == RW_WRITER);
4986 return (l->count < 0);
4987 }
4988
4989 /* ARGSUSED */
4990 void
4991 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4992 {
4993
4994 l->count = 0;
4995 l->waiters = 0;
4996 l->owner = NULL;
4997 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4998 cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4999 cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
5000 }
5001
5002 void
5003 nfs_rw_destroy(nfs_rwlock_t *l)
5004 {
5005
5006 mutex_destroy(&l->lock);
5007 cv_destroy(&l->cv);
5008 cv_destroy(&l->cv_rd);
5009 }
5010
5011 int
5012 nfs3_rddir_compar(const void *x, const void *y)
5013 {
5014 rddir_cache *a = (rddir_cache *)x;
5015 rddir_cache *b = (rddir_cache *)y;
5016
5017 if (a->nfs3_cookie == b->nfs3_cookie) {
5018 if (a->buflen == b->buflen)
5019 return (0);
5020 if (a->buflen < b->buflen)
5021 return (-1);
5022 return (1);
5023 }
5024
5025 if (a->nfs3_cookie < b->nfs3_cookie)
5026 return (-1);
5027
5028 return (1);
5029 }
5030
5031 int
5032 nfs_rddir_compar(const void *x, const void *y)
5033 {
5034 rddir_cache *a = (rddir_cache *)x;
5035 rddir_cache *b = (rddir_cache *)y;
5036
5037 if (a->nfs_cookie == b->nfs_cookie) {
5038 if (a->buflen == b->buflen)
5039 return (0);
5040 if (a->buflen < b->buflen)
5041 return (-1);
5042 return (1);
5043 }
5044
5045 if (a->nfs_cookie < b->nfs_cookie)
5046 return (-1);
5047
5048 return (1);
5049 }
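
/*
 * Illustrative sketch only (not part of the original code): how a
 * comparator such as nfs3_rddir_compar() is used to search an AVL tree
 * of readdir cache entries keyed by (cookie, buflen).  The tree is
 * assumed to have been created with avl_create() using this comparator
 * and <sys/avl.h> to be pulled in by the headers above; the function
 * name, guard macro, and MAXBSIZE request size are hypothetical choices.
 */
#ifdef NFS_RDDIR_AVL_EXAMPLE
static rddir_cache *
rddir_avl_lookup_example(avl_tree_t *dir, offset_t cookie)
{
	rddir_cache key;
	rddir_cache *rdc;
	avl_index_t where;

	key.nfs3_cookie = cookie;	/* starting cookie of the chunk */
	key.buflen = MAXBSIZE;		/* hypothetical transfer size */

	rdc = avl_find(dir, &key, &where);
	if (rdc != NULL)
		rddir_cache_hold(rdc);	/* keep it around while reading */
	return (rdc);
}
#endif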
5050
5051 static char *
5052 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
5053 {
5054 servinfo_t *s;
5055 char *srvnames;
5056 char *namep;
5057 size_t length;
5058
5059 /*
5060 * Calculate the length of the string required to hold all
5061 * of the server names plus either a comma or a null
5062 * character following each individual one.
5063 */
5064 length = 0;
5065 for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5066 length += s->sv_hostnamelen;
5067
5068 srvnames = kmem_alloc(length, KM_SLEEP);
5069
5070 namep = srvnames;
5071 for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5072 (void) strcpy(namep, s->sv_hostname);
5073 namep += s->sv_hostnamelen - 1;
5074 *namep++ = ',';
5075 }
5076 *--namep = '\0';
5077
5078 *len = length;
5079
5080 return (srvnames);
5081 }
5082
5083 /*
5084 * These two functions are temporary and designed for the upgrade-workaround
5085 * only. They cannot be used for general zone-crossing NFS client support, and
5086 * will be removed shortly.
5087 *
5088 * When the workaround is enabled, all NFS traffic is forced into the global
5089 * zone. These functions are called when the code needs to refer to the state
5090 * of the underlying network connection. They're not called when the function
5091 * needs to refer to the state of the process that invoked the system call.
5092 * (E.g., when checking whether the zone is shutting down during the mount()
5093 * call.)
5094 */
5095
5096 struct zone *
5097 nfs_zone(void)
5098 {
5099 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5100 }
5101
5102 zoneid_t
5103 nfs_zoneid(void)
5104 {
5105 return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5106 }
5107
5108 /*
5109 * nfs_mount_label_policy:
5110 * Determine whether the mount is allowed according to MAC check,
5111 * by comparing (where appropriate) label of the remote server
5112 * against the label of the zone being mounted into.
5113 *
5114 * Returns:
5115 * 0 : access allowed
5116 * -1 : read-only access allowed (i.e., read-down)
5117 * >0 : error code, such as EACCES
5118 */
5119 int
5120 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5121 struct knetconfig *knconf, cred_t *cr)
5122 {
5123 int addr_type;
5124 void *ipaddr;
5125 bslabel_t *server_sl, *mntlabel;
5126 zone_t *mntzone = NULL;
5127 ts_label_t *zlabel;
5128 tsol_tpc_t *tp;
5129 ts_label_t *tsl = NULL;
5130 int retv;
5131
5132 /*
5133 * Get the zone's label. Each zone on a labeled system has a label.
5134 */
5135 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5136 zlabel = mntzone->zone_slabel;
5137 ASSERT(zlabel != NULL);
5138 label_hold(zlabel);
5139
5140 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5141 addr_type = IPV4_VERSION;
5142 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5143 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5144 addr_type = IPV6_VERSION;
5145 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5146 } else {
5147 retv = 0;
5148 goto out;
5149 }
5150
5151 retv = EACCES; /* assume the worst */
5152
5153 /*
5154 * Next, get the assigned label of the remote server.
5155 */
5156 tp = find_tpc(ipaddr, addr_type, B_FALSE);
5157 if (tp == NULL)
5158 goto out; /* error getting host entry */
5159
5160 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5161 goto rel_tpc; /* invalid domain */
5162 if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5163 (tp->tpc_tp.host_type != UNLABELED))
5164 goto rel_tpc; /* invalid hosttype */
5165
5166 if (tp->tpc_tp.host_type == SUN_CIPSO) {
5167 tsl = getflabel_cipso(vfsp);
5168 if (tsl == NULL)
5169 goto rel_tpc; /* error getting server lbl */
5170
5171 server_sl = label2bslabel(tsl);
5172 } else { /* UNLABELED */
5173 server_sl = &tp->tpc_tp.tp_def_label;
5174 }
5175
5176 mntlabel = label2bslabel(zlabel);
5177
5178 /*
5179 * Now compare labels to complete the MAC check. If the labels
5180 * are equal or if the requestor is in the global zone and has
5181 * NET_MAC_AWARE, then allow read-write access. (Except for
5182 * mounts into the global zone itself; restrict these to
5183 * read-only.)
5184 *
5185 * If the requestor is in some other zone, but their label
5186 * dominates the server, then allow read-down.
5187 *
5188 * Otherwise, access is denied.
5189 */
5190 if (blequal(mntlabel, server_sl) ||
5191 (crgetzoneid(cr) == GLOBAL_ZONEID &&
5192 getpflags(NET_MAC_AWARE, cr) != 0)) {
5193 if ((mntzone == global_zone) ||
5194 !blequal(mntlabel, server_sl))
5195 retv = -1; /* read-only */
5196 else
5197 retv = 0; /* access OK */
5198 } else if (bldominates(mntlabel, server_sl)) {
5199 retv = -1; /* read-only */
5200 } else {
5201 retv = EACCES;
5202 }
5203
5204 if (tsl != NULL)
5205 label_rele(tsl);
5206
5207 rel_tpc:
5208 TPC_RELE(tp);
5209 out:
5210 if (mntzone)
5211 zone_rele(mntzone);
5212 label_rele(zlabel);
5213 return (retv);
5214 }
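
/*
 * Illustrative sketch only (not part of the original code): how a mount
 * path could act on the three return classes documented above for
 * nfs_mount_label_policy().  The function name, guard macro, and the
 * force_ro output flag are hypothetical; the real mount code applies
 * its own vfs options.
 */
#ifdef NFS_LABEL_EXAMPLE
static int
label_policy_example(vfs_t *vfsp, struct netbuf *addr,
    struct knetconfig *knconf, cred_t *cr, boolean_t *force_ro)
{
	int rv = nfs_mount_label_policy(vfsp, addr, knconf, cr);

	*force_ro = B_FALSE;
	if (rv > 0)
		return (rv);		/* MAC check failed (e.g. EACCES) */
	if (rv == -1)
		*force_ro = B_TRUE;	/* read-down: read-only access only */
	return (0);			/* mount may proceed */
}
#endif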
5215
5216 boolean_t
5217 nfs_has_ctty(void)
5218 {
5219 boolean_t rv;
5220 mutex_enter(&curproc->p_splock);
5221 rv = (curproc->p_sessp->s_vp != NULL);
5222 mutex_exit(&curproc->p_splock);
5223 return (rv);
5224 }
5225
5226 /*
5227 * Look in the xattr directory to see if it has any generic user attributes
5228 */
5229 int
5230 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5231 {
5232 struct uio uio;
5233 struct iovec iov;
5234 char *dbuf;
5235 struct dirent64 *dp;
5236 size_t dlen = 8 * 1024;
5237 size_t dbuflen;
5238 int eof = 0;
5239 int error;
5240
5241 *valp = 0;
5242 dbuf = kmem_alloc(dlen, KM_SLEEP);
5243 uio.uio_iov = &iov;
5244 uio.uio_iovcnt = 1;
5245 uio.uio_segflg = UIO_SYSSPACE;
5246 uio.uio_fmode = 0;
5247 uio.uio_extflg = UIO_COPY_CACHED;
5248 uio.uio_loffset = 0;
5249 uio.uio_resid = dlen;
5250 iov.iov_base = dbuf;
5251 iov.iov_len = dlen;
5252 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5253 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5254 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5255
5256 dbuflen = dlen - uio.uio_resid;
5257
5258 if (error || dbuflen == 0) {
5259 kmem_free(dbuf, dlen);
5260 return (error);
5261 }
5262
5263 dp = (dirent64_t *)dbuf;
5264
5265 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5266 if (strcmp(dp->d_name, ".") == 0 ||
5267 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5268 VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5269 VIEW_READONLY) == 0) {
5270 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5271 continue;
5272 }
5273
5274 *valp = 1;
5275 break;
5276 }
5277 kmem_free(dbuf, dlen);
5278 return (0);
5279 }
5280
5281 /*
5282 * NFS-specific function that returns the time since
5283 * system boot in seconds.
5284 */
5285 time_t
5286 nfs_sys_uptime(void)
5287 {
5288 return (TICK_TO_SEC(ddi_get_lbolt()));
5289 }
5290