1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28 */
29
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/systm.h>
33 #include <sys/cred.h>
34 #include <sys/proc.h>
35 #include <sys/user.h>
36 #include <sys/time.h>
37 #include <sys/buf.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/socket.h>
41 #include <sys/uio.h>
42 #include <sys/tiuser.h>
43 #include <sys/swap.h>
44 #include <sys/errno.h>
45 #include <sys/debug.h>
46 #include <sys/kmem.h>
47 #include <sys/kstat.h>
48 #include <sys/cmn_err.h>
49 #include <sys/vtrace.h>
50 #include <sys/session.h>
51 #include <sys/dnlc.h>
52 #include <sys/bitmap.h>
53 #include <sys/acl.h>
54 #include <sys/ddi.h>
55 #include <sys/pathname.h>
56 #include <sys/flock.h>
57 #include <sys/dirent.h>
58 #include <sys/flock.h>
59 #include <sys/callb.h>
60 #include <sys/atomic.h>
61 #include <sys/list.h>
62 #include <sys/tsol/tnet.h>
63 #include <sys/priv.h>
64 #include <sys/sdt.h>
65 #include <sys/attr.h>
66
67 #include <inet/ip6.h>
68
69 #include <rpc/types.h>
70 #include <rpc/xdr.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73
74 #include <nfs/nfs.h>
75 #include <nfs/nfs4.h>
76 #include <nfs/nfs_clnt.h>
77 #include <nfs/rnode.h>
78 #include <nfs/nfs_acl.h>
79
80 #include <sys/tsol/label.h>
81
82 /*
83 * The hash queues for the access to active and cached rnodes
84 * are organized as doubly linked lists. A reader/writer lock
85 * for each hash bucket is used to control access and to synchronize
86 * lookups, additions, and deletions from the hash queue.
87 *
88 * The rnode freelist is organized as a doubly linked list with
89 * a head pointer. Additions and deletions are synchronized via
90 * a single mutex.
91 *
92 * In order to add an rnode to the free list, it must be hashed into
93 * a hash queue and the exclusive lock to the hash queue must be held.
94 * If an rnode is not hashed into a hash queue, then it is destroyed
95 * because it holds no reusable information about the file.
96 * The exclusive lock to the hash queue must be
97 * held in order to prevent a lookup in the hash queue from finding
98 * the rnode, using it, and assuming that the rnode is not on the
99 * freelist. The lookup in the hash queue will have the hash queue
100 * locked, either exclusive or shared.
101 *
102 * The vnode reference count for each rnode is not allowed to drop
103 * below 1. This prevents external entities, such as the VM
104 * subsystem, from acquiring references to vnodes already on the
105 * freelist and then trying to place them back on the freelist
106 * when their reference is released. This means that when an
107 * rnode is looked up in the hash queues, then either the rnode
108 * is removed from the freelist and that reference is transferred to
109 * the new reference or the vnode reference count must be incremented
110 * accordingly. The mutex for the freelist must be held in order to
111 * accurately test to see if the rnode is on the freelist or not.
112 * The hash queue lock might be held shared and it is possible that
113 * two different threads may race to remove the rnode from the
114 * freelist. This race can be resolved by holding the mutex for the
115 * freelist. Please note that the mutex for the freelist does not
116 * need to be held if the rnode is not on the freelist. It cannot be
117 * placed on the freelist due to the requirement that the thread
118 * putting the rnode on the freelist must hold the exclusive lock
119 * to the hash queue and the thread doing the lookup in the hash
120 * queue is holding either a shared or exclusive lock to the hash
121 * queue.
122 *
123 * The lock ordering is:
124 *
125 * hash bucket lock -> vnode lock
126 * hash bucket lock -> freelist lock
127 */
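/*
 * Global state for the scheme described above: rtable is the table of
 * rnode hash buckets (sized by rtablesize, with rtablemask used for
 * bucket selection), rpfreelist is the head of the rnode freelist
 * protected by rpfreelist_lock, rnew and nrnode track and bound the
 * rnode population, and hashlen is the target average hash chain
 * length used when sizing the table.
 */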
128 static rhashq_t *rtable;
129
130 static kmutex_t rpfreelist_lock;
131 static rnode_t *rpfreelist = NULL;
132 static long rnew = 0;
133 long nrnode = 0;
134
135 static int rtablesize;
136 static int rtablemask;
137
138 static int hashlen = 4;
139
140 static struct kmem_cache *rnode_cache;
141
142 /*
143 * Mutex to protect the following variables:
144 * nfs_major
145 * nfs_minor
146 */
147 kmutex_t nfs_minor_lock;
148 int nfs_major;
149 int nfs_minor;
150
151 /* Do we allow preepoch (negative) time values otw? */
152 bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */
153
154 /*
155 * Access cache
156 */
157 static acache_hash_t *acache;
158 static long nacache; /* used strictly to size the number of hash queues */
159
160 static int acachesize;
161 static int acachemask;
162 static struct kmem_cache *acache_cache;
163
164 /*
165 * Client side utilities
166 */
167
168 /*
169 * client side statistics
170 */
171 static const struct clstat clstat_tmpl = {
172 { "calls", KSTAT_DATA_UINT64 },
173 { "badcalls", KSTAT_DATA_UINT64 },
174 { "clgets", KSTAT_DATA_UINT64 },
175 { "cltoomany", KSTAT_DATA_UINT64 },
176 #ifdef DEBUG
177 { "clalloc", KSTAT_DATA_UINT64 },
178 { "noresponse", KSTAT_DATA_UINT64 },
179 { "failover", KSTAT_DATA_UINT64 },
180 { "remap", KSTAT_DATA_UINT64 },
181 #endif
182 };
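/*
 * clstat_tmpl provides the initial values for the per-zone counters
 * kept in each zone's struct nfs_clnt (nfscl_stat), which is what the
 * code below actually updates.
 */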
183
184 /*
185 * The following are statistics that describe behavior of the system as a whole
186 * and don't correspond to any one particular zone.
187 */
188 #ifdef DEBUG
189 static struct clstat_debug {
190 kstat_named_t nrnode; /* number of allocated rnodes */
191 kstat_named_t access; /* size of access cache */
192 kstat_named_t dirent; /* size of readdir cache */
193 kstat_named_t dirents; /* size of readdir buf cache */
194 kstat_named_t reclaim; /* number of reclaims */
195 kstat_named_t clreclaim; /* number of cl reclaims */
196 kstat_named_t f_reclaim; /* number of free reclaims */
197 kstat_named_t a_reclaim; /* number of active reclaims */
198 kstat_named_t r_reclaim; /* number of rnode reclaims */
199 kstat_named_t rpath; /* bytes used to store rpaths */
200 } clstat_debug = {
201 { "nrnode", KSTAT_DATA_UINT64 },
202 { "access", KSTAT_DATA_UINT64 },
203 { "dirent", KSTAT_DATA_UINT64 },
204 { "dirents", KSTAT_DATA_UINT64 },
205 { "reclaim", KSTAT_DATA_UINT64 },
206 { "clreclaim", KSTAT_DATA_UINT64 },
207 { "f_reclaim", KSTAT_DATA_UINT64 },
208 { "a_reclaim", KSTAT_DATA_UINT64 },
209 { "r_reclaim", KSTAT_DATA_UINT64 },
210 { "r_path", KSTAT_DATA_UINT64 },
211 };
212 #endif /* DEBUG */
213
214 /*
215 * We keep a global list of per-zone client data, so we can clean up all zones
216 * if we get low on memory.
217 */
218 static list_t nfs_clnt_list;
219 static kmutex_t nfs_clnt_list_lock;
220 static zone_key_t nfsclnt_zone_key;
221
222 static struct kmem_cache *chtab_cache;
223
224 /*
225 * Some servers do not properly update the attributes of the
226 * directory when changes are made. To allow interoperability
227 * with these broken servers, the nfs_disable_rddir_cache
228 * parameter must be set in /etc/system.
229 */
230 int nfs_disable_rddir_cache = 0;
231
232 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
233 struct chtab **);
234 void clfree(CLIENT *, struct chtab *);
235 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
236 struct chtab **, struct nfs_clnt *);
237 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
238 struct chtab **, struct nfs_clnt *);
239 static void clreclaim(void *);
240 static int nfs_feedback(int, int, mntinfo_t *);
241 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
242 caddr_t, cred_t *, int *, enum clnt_stat *, int,
243 failinfo_t *);
244 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
245 caddr_t, cred_t *, int *, int, failinfo_t *);
246 static void rinactive(rnode_t *, cred_t *);
247 static int rtablehash(nfs_fhandle *);
248 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
249 struct vnodeops *,
250 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
251 cred_t *),
252 int (*)(const void *, const void *), int *, cred_t *,
253 char *, char *);
254 static void rp_rmfree(rnode_t *);
255 static void rp_addhash(rnode_t *);
256 static void rp_rmhash_locked(rnode_t *);
257 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
258 static void destroy_rnode(rnode_t *);
259 static void rddir_cache_free(rddir_cache *);
260 static int nfs_free_data_reclaim(rnode_t *);
261 static int nfs_active_data_reclaim(rnode_t *);
262 static int nfs_free_reclaim(void);
263 static int nfs_active_reclaim(void);
264 static int nfs_rnode_reclaim(void);
265 static void nfs_reclaim(void *);
266 static int failover_safe(failinfo_t *);
267 static void failover_newserver(mntinfo_t *mi);
268 static void failover_thread(mntinfo_t *mi);
269 static int failover_wait(mntinfo_t *);
270 static int failover_remap(failinfo_t *);
271 static int failover_lookup(char *, vnode_t *,
272 int (*)(vnode_t *, char *, vnode_t **,
273 struct pathname *, int, vnode_t *, cred_t *, int),
274 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
275 vnode_t **);
276 static void nfs_free_r_path(rnode_t *);
277 static void nfs_set_vroot(vnode_t *);
278 static char *nfs_getsrvnames(mntinfo_t *, size_t *);
279
280 /*
281 * from rpcsec module (common/rpcsec)
282 */
283 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
284 extern void sec_clnt_freeh(AUTH *);
285 extern void sec_clnt_freeinfo(struct sec_data *);
286
287 /*
288 * used in mount policy
289 */
290 extern ts_label_t *getflabel_cipso(vfs_t *);
291
292 /*
293 * EIO or EINTR are not recoverable errors.
294 */
295 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
296
297 #ifdef DEBUG
298 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n"
299 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
300 #else
301 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n"
302 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
303 #endif
304 /*
305 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
306 */
307 static int
308 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
309 struct chtab **chp, struct nfs_clnt *nfscl)
310 {
311 struct chhead *ch, *newch;
312 struct chhead **plistp;
313 struct chtab *cp;
314 int error;
315 k_sigset_t smask;
316
317 if (newcl == NULL || chp == NULL || ci == NULL)
318 return (EINVAL);
319
320 *newcl = NULL;
321 *chp = NULL;
322
323 /*
324 * Find an unused handle or create one
325 */
326 newch = NULL;
327 nfscl->nfscl_stat.clgets.value.ui64++;
328 top:
329 /*
330 * Find the correct entry in the cache to check for free
331 * client handles. The search is based on the RPC program
332 * number, program version number, dev_t for the transport
333 * device, and the protocol family.
334 */
335 mutex_enter(&nfscl->nfscl_chtable_lock);
336 plistp = &nfscl->nfscl_chtable;
337 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
338 if (ch->ch_prog == ci->cl_prog &&
339 ch->ch_vers == ci->cl_vers &&
340 ch->ch_dev == svp->sv_knconf->knc_rdev &&
341 (strcmp(ch->ch_protofmly,
342 svp->sv_knconf->knc_protofmly) == 0))
343 break;
344 plistp = &ch->ch_next;
345 }
346
347 /*
348 * If we didn't find a cache entry for this quadruple, then
349 * create one. If we don't have one already preallocated,
350 * then drop the cache lock, create one, and then start over.
351 * If we did have a preallocated entry, then just add it to
352 * the front of the list.
353 */
354 if (ch == NULL) {
355 if (newch == NULL) {
356 mutex_exit(&nfscl->nfscl_chtable_lock);
357 newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
358 newch->ch_timesused = 0;
359 newch->ch_prog = ci->cl_prog;
360 newch->ch_vers = ci->cl_vers;
361 newch->ch_dev = svp->sv_knconf->knc_rdev;
362 newch->ch_protofmly = kmem_alloc(
363 strlen(svp->sv_knconf->knc_protofmly) + 1,
364 KM_SLEEP);
365 (void) strcpy(newch->ch_protofmly,
366 svp->sv_knconf->knc_protofmly);
367 newch->ch_list = NULL;
368 goto top;
369 }
370 ch = newch;
371 newch = NULL;
372 ch->ch_next = nfscl->nfscl_chtable;
373 nfscl->nfscl_chtable = ch;
374 /*
375 * We found a cache entry, but if it isn't on the front of the
376 * list, then move it to the front of the list to try to take
377 * advantage of locality of operations.
378 */
379 } else if (ch != nfscl->nfscl_chtable) {
380 *plistp = ch->ch_next;
381 ch->ch_next = nfscl->nfscl_chtable;
382 nfscl->nfscl_chtable = ch;
383 }
384
385 /*
386 * If there was a free client handle cached, then remove it
387 * from the list, init it, and use it.
388 */
389 if (ch->ch_list != NULL) {
390 cp = ch->ch_list;
391 ch->ch_list = cp->ch_list;
392 mutex_exit(&nfscl->nfscl_chtable_lock);
393 if (newch != NULL) {
394 kmem_free(newch->ch_protofmly,
395 strlen(newch->ch_protofmly) + 1);
396 kmem_free(newch, sizeof (*newch));
397 }
398 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
399 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
400 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
401 &cp->ch_client->cl_auth);
402 if (error || cp->ch_client->cl_auth == NULL) {
403 CLNT_DESTROY(cp->ch_client);
404 kmem_cache_free(chtab_cache, cp);
405 return ((error != 0) ? error : EINTR);
406 }
407 ch->ch_timesused++;
408 *newcl = cp->ch_client;
409 *chp = cp;
410 return (0);
411 }
412
413 /*
414 * There weren't any free client handles which fit, so allocate
415 * a new one and use that.
416 */
417 #ifdef DEBUG
418 atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
419 #endif
420 mutex_exit(&nfscl->nfscl_chtable_lock);
421
422 nfscl->nfscl_stat.cltoomany.value.ui64++;
423 if (newch != NULL) {
424 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
425 kmem_free(newch, sizeof (*newch));
426 }
427
428 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
429 cp->ch_head = ch;
430
431 sigintr(&smask, (int)ci->cl_flags & MI_INT);
432 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
433 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
434 sigunintr(&smask);
435
436 if (error != 0) {
437 kmem_cache_free(chtab_cache, cp);
438 #ifdef DEBUG
439 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
440 #endif
441 /*
442 * Warning is unnecessary if error is EINTR.
443 */
444 if (error != EINTR) {
445 nfs_cmn_err(error, CE_WARN,
446 "clget: couldn't create handle: %m\n");
447 }
448 return (error);
449 }
450 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
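/*
 * clnt_tli_kcreate() attached a default AUTH handle above; discard it
 * and replace it with one matching the mount's security data via
 * sec_clnt_geth().
 */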
451 auth_destroy(cp->ch_client->cl_auth);
452 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
453 &cp->ch_client->cl_auth);
454 if (error || cp->ch_client->cl_auth == NULL) {
455 CLNT_DESTROY(cp->ch_client);
456 kmem_cache_free(chtab_cache, cp);
457 #ifdef DEBUG
458 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
459 #endif
460 return ((error != 0) ? error : EINTR);
461 }
462 ch->ch_timesused++;
463 *newcl = cp->ch_client;
464 ASSERT(cp->ch_client->cl_nosignal == FALSE);
465 *chp = cp;
466 return (0);
467 }
468
469 int
470 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
471 struct chtab **chp)
472 {
473 struct nfs_clnt *nfscl;
474
475 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
476 ASSERT(nfscl != NULL);
477
478 return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
479 }
480
481 static int
482 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
483 struct chtab **chp, struct nfs_clnt *nfscl)
484 {
485 clinfo_t ci;
486 int error;
487
488 /*
489 * Set read buffer size to rsize
490 * and add room for RPC headers.
491 */
492 ci.cl_readsize = mi->mi_tsize;
493 if (ci.cl_readsize != 0)
494 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
495
496 /*
497 * If this is a soft mount and the server is down, just try once;
498 * that is, do not retransmit.
499 */
500 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
501 ci.cl_retrans = 0;
502 else
503 ci.cl_retrans = mi->mi_retrans;
504
505 ci.cl_prog = NFS_ACL_PROGRAM;
506 ci.cl_vers = mi->mi_vers;
507 ci.cl_flags = mi->mi_flags;
508
509 /*
510 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
511 * security flavor, the client tries to establish a security context
512 * by contacting the server. If the connection is timed out or reset,
513 * e.g. server reboot, we will try again.
514 */
515 do {
516 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
517
518 if (error == 0)
519 break;
520
521 /*
522 * For forced unmount or zone shutdown, bail out, no retry.
523 */
524 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
525 error = EIO;
526 break;
527 }
528
529 /* do not retry for softmount */
530 if (!(mi->mi_flags & MI_HARD))
531 break;
532
533 /* let the caller deal with the failover case */
534 if (FAILOVER_MOUNT(mi))
535 break;
536
537 } while (error == ETIMEDOUT || error == ECONNRESET);
538
539 return (error);
540 }
541
542 static int
543 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
544 struct chtab **chp, struct nfs_clnt *nfscl)
545 {
546 clinfo_t ci;
547 int error;
548
549 /*
550 * Set read buffer size to rsize
551 * and add room for RPC headers.
552 */
553 ci.cl_readsize = mi->mi_tsize;
554 if (ci.cl_readsize != 0)
555 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
556
557 /*
558 * If this is a soft mount and the server is down, just try once;
559 * that is, do not retransmit.
560 */
561 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
562 ci.cl_retrans = 0;
563 else
564 ci.cl_retrans = mi->mi_retrans;
565
566 ci.cl_prog = mi->mi_prog;
567 ci.cl_vers = mi->mi_vers;
568 ci.cl_flags = mi->mi_flags;
569
570 /*
571 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
572 * security flavor, the client tries to establish a security context
573 * by contacting the server. If the connection is timed out or reset,
574 * e.g. server reboot, we will try again.
575 */
576 do {
577 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
578
579 if (error == 0)
580 break;
581
582 /*
583 * For forced unmount or zone shutdown, bail out, no retry.
584 */
585 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
586 error = EIO;
587 break;
588 }
589
590 /* do not retry for softmount */
591 if (!(mi->mi_flags & MI_HARD))
592 break;
593
594 /* let the caller deal with the failover case */
595 if (FAILOVER_MOUNT(mi))
596 break;
597
598 } while (error == ETIMEDOUT || error == ECONNRESET);
599
600 return (error);
601 }
602
603 static void
604 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
605 {
606 if (cl->cl_auth != NULL) {
607 sec_clnt_freeh(cl->cl_auth);
608 cl->cl_auth = NULL;
609 }
610
611 /*
612 * Timestamp this cache entry so that we know when it was last
613 * used.
614 */
615 cp->ch_freed = gethrestime_sec();
616
617 /*
618 * Add the free client handle to the front of the list.
619 * This way, the list will be sorted in youngest to oldest
620 * order.
621 */
622 mutex_enter(&nfscl->nfscl_chtable_lock);
623 cp->ch_list = cp->ch_head->ch_list;
624 cp->ch_head->ch_list = cp;
625 mutex_exit(&nfscl->nfscl_chtable_lock);
626 }
627
628 void
629 clfree(CLIENT *cl, struct chtab *cp)
630 {
631 struct nfs_clnt *nfscl;
632
633 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
634 ASSERT(nfscl != NULL);
635
636 clfree_impl(cl, cp, nfscl);
637 }
638
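/*
 * CL_HOLDTIME is in seconds; clreclaim() passes it to clreclaim_zone(),
 * which compares it against the ch_freed timestamp recorded by
 * clfree_impl() to decide which idle client handles to destroy.
 */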
639 #define CL_HOLDTIME 60 /* time to hold client handles */
640
641 static void
642 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
643 {
644 struct chhead *ch;
645 struct chtab *cp; /* list of objects that can be reclaimed */
646 struct chtab *cpe;
647 struct chtab *cpl;
648 struct chtab **cpp;
649 #ifdef DEBUG
650 int n = 0;
651 #endif
652
653 /*
654 * Need to reclaim some memory, so step through the cache
655 * looking through the lists for entries which can be freed.
656 */
657 cp = NULL;
658
659 mutex_enter(&nfscl->nfscl_chtable_lock);
660
661 /*
662 * Here we step through each non-NULL quadruple and start to
663 * construct the reclaim list pointed to by cp. Note that
664 * cp will contain all eligible chtab entries. When this traversal
665 * completes, chtab entries from the last quadruple will be at the
666 * front of cp and entries from previously inspected quadruples have
667 * been appended to the rear of cp.
668 */
669 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
670 if (ch->ch_list == NULL)
671 continue;
672 /*
673 * Search each list for entries older than
674 * cl_holdtime seconds. The lists are maintained
675 * in youngest to oldest order so that when the
676 * first entry is found which is old enough, then
677 * all of the rest of the entries on the list will
678 * be old enough as well.
679 */
680 cpl = ch->ch_list;
681 cpp = &ch->ch_list;
682 while (cpl != NULL &&
683 cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
684 cpp = &cpl->ch_list;
685 cpl = cpl->ch_list;
686 }
687 if (cpl != NULL) {
688 *cpp = NULL;
689 if (cp != NULL) {
690 cpe = cpl;
691 while (cpe->ch_list != NULL)
692 cpe = cpe->ch_list;
693 cpe->ch_list = cp;
694 }
695 cp = cpl;
696 }
697 }
698
699 mutex_exit(&nfscl->nfscl_chtable_lock);
700
701 /*
702 * If cp is empty, then there is nothing to reclaim here.
703 */
704 if (cp == NULL)
705 return;
706
707 /*
708 * Step through the list of entries to free, destroying each client
709 * handle and kmem_free'ing the memory for each entry.
710 */
711 while (cp != NULL) {
712 #ifdef DEBUG
713 n++;
714 #endif
715 CLNT_DESTROY(cp->ch_client);
716 cpl = cp->ch_list;
717 kmem_cache_free(chtab_cache, cp);
718 cp = cpl;
719 }
720
721 #ifdef DEBUG
722 /*
723 * Update clalloc so that nfsstat shows the current number
724 * of allocated client handles.
725 */
726 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
727 #endif
728 }
729
730 /* ARGSUSED */
731 static void
732 clreclaim(void *all)
733 {
734 struct nfs_clnt *nfscl;
735
736 #ifdef DEBUG
737 clstat_debug.clreclaim.value.ui64++;
738 #endif
739 /*
740 * The system is low on memory; go through and try to reclaim some from
741 * every zone on the system.
742 */
743 mutex_enter(&nfs_clnt_list_lock);
744 nfscl = list_head(&nfs_clnt_list);
745 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
746 clreclaim_zone(nfscl, CL_HOLDTIME);
747 mutex_exit(&nfs_clnt_list_lock);
748 }
749
750 /*
751 * Minimum time-out values indexed by call type
752 * These units are in eighths of a second to avoid multiplies
753 */
754 static unsigned int minimum_timeo[] = {
755 6, 7, 10
756 };
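/*
 * These entries are consumed as (minimum_timeo[call type] * hz) >> 3
 * in rfscall() and aclcall(), i.e. converted from eighths of a second
 * to clock ticks.
 */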
757
758 /*
759 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
760 */
761 #define MAXTIMO (20*hz)
762 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
763 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
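/*
 * Example (assuming hz = 100 for illustration): a 1 second (100 tick)
 * timeout backs off through 200, 400, 800 and 1600 ticks, then stays
 * clamped at MAXTIMO (2000 ticks, i.e. 20 seconds).
 */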
764
765 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
766 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
767 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
768
769 /*
770 * Function called when rfscall notices that we have been
771 * re-transmitting, or when we get a response without retransmissions.
772 * Return 1 if the transfer size was adjusted down - 0 if no change.
773 */
774 static int
775 nfs_feedback(int flag, int which, mntinfo_t *mi)
776 {
777 int kind;
778 int r = 0;
779
780 mutex_enter(&mi->mi_lock);
781 if (flag == FEEDBACK_REXMIT1) {
782 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
783 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
784 goto done;
785 if (mi->mi_curread > MIN_NFS_TSIZE) {
786 mi->mi_curread /= 2;
787 if (mi->mi_curread < MIN_NFS_TSIZE)
788 mi->mi_curread = MIN_NFS_TSIZE;
789 r = 1;
790 }
791
792 if (mi->mi_curwrite > MIN_NFS_TSIZE) {
793 mi->mi_curwrite /= 2;
794 if (mi->mi_curwrite < MIN_NFS_TSIZE)
795 mi->mi_curwrite = MIN_NFS_TSIZE;
796 r = 1;
797 }
798 } else if (flag == FEEDBACK_OK) {
799 kind = mi->mi_timer_type[which];
800 if (kind == 0 ||
801 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
802 goto done;
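/*
 * Timer kind 1 is the one that adjusts mi_curread and kind 2 the
 * one that adjusts mi_curwrite; grow the transfer size back in
 * MIN_NFS_TSIZE steps, jumping straight to the maximum once past
 * half of it.
 */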
803 if (kind == 1) {
804 if (mi->mi_curread >= mi->mi_tsize)
805 goto done;
806 mi->mi_curread += MIN_NFS_TSIZE;
807 if (mi->mi_curread > mi->mi_tsize/2)
808 mi->mi_curread = mi->mi_tsize;
809 } else if (kind == 2) {
810 if (mi->mi_curwrite >= mi->mi_stsize)
811 goto done;
812 mi->mi_curwrite += MIN_NFS_TSIZE;
813 if (mi->mi_curwrite > mi->mi_stsize/2)
814 mi->mi_curwrite = mi->mi_stsize;
815 }
816 }
817 done:
818 mutex_exit(&mi->mi_lock);
819 return (r);
820 }
821
822 #ifdef DEBUG
823 static int rfs2call_hits = 0;
824 static int rfs2call_misses = 0;
825 #endif
826
827 int
828 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
829 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
830 enum nfsstat *statusp, int flags, failinfo_t *fi)
831 {
832 int rpcerror;
833 enum clnt_stat rpc_status;
834
835 ASSERT(statusp != NULL);
836
837 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
838 cr, douprintf, &rpc_status, flags, fi);
839 if (!rpcerror) {
840 /*
841 * See crnetadjust() for comments.
842 */
843 if (*statusp == NFSERR_ACCES &&
844 (cr = crnetadjust(cr)) != NULL) {
845 #ifdef DEBUG
846 rfs2call_hits++;
847 #endif
848 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
849 resp, cr, douprintf, NULL, flags, fi);
850 crfree(cr);
851 #ifdef DEBUG
852 if (*statusp == NFSERR_ACCES)
853 rfs2call_misses++;
854 #endif
855 }
856 } else if (rpc_status == RPC_PROCUNAVAIL) {
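/*
 * A server that does not implement the requested procedure returns
 * RPC_PROCUNAVAIL; map that to NFSERR_OPNOTSUPP so the caller can
 * fall back instead of treating it as an RPC failure.
 */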
857 *statusp = NFSERR_OPNOTSUPP;
858 rpcerror = 0;
859 }
860
861 return (rpcerror);
862 }
863
864 #define NFS3_JUKEBOX_DELAY (10 * hz)
865
866 static clock_t nfs3_jukebox_delay = 0;
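/*
 * nfs3_jukebox_delay is expected to be set from NFS3_JUKEBOX_DELAY
 * during client initialization; it cannot be a static initializer
 * because hz is not a compile-time constant.
 */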
867
868 #ifdef DEBUG
869 static int rfs3call_hits = 0;
870 static int rfs3call_misses = 0;
871 #endif
872
873 int
874 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
875 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
876 nfsstat3 *statusp, int flags, failinfo_t *fi)
877 {
878 int rpcerror;
879 int user_informed;
880
881 user_informed = 0;
882 do {
883 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
884 cr, douprintf, NULL, flags, fi);
885 if (!rpcerror) {
886 cred_t *crr;
887 if (*statusp == NFS3ERR_JUKEBOX) {
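/*
 * A thread owned by p0 (a kernel daemon) should not sit here
 * sleeping and retrying indefinitely, so hand EAGAIN back to
 * the caller instead.
 */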
888 if (ttoproc(curthread) == &p0) {
889 rpcerror = EAGAIN;
890 break;
891 }
892 if (!user_informed) {
893 user_informed = 1;
894 uprintf(
895 "file temporarily unavailable on the server, retrying...\n");
896 }
897 delay(nfs3_jukebox_delay);
898 }
899 /*
900 * See crnetadjust() for comments.
901 */
902 else if (*statusp == NFS3ERR_ACCES &&
903 (crr = crnetadjust(cr)) != NULL) {
904 #ifdef DEBUG
905 rfs3call_hits++;
906 #endif
907 rpcerror = rfscall(mi, which, xdrargs, argsp,
908 xdrres, resp, crr, douprintf,
909 NULL, flags, fi);
910
911 crfree(crr);
912 #ifdef DEBUG
913 if (*statusp == NFS3ERR_ACCES)
914 rfs3call_misses++;
915 #endif
916 }
917 }
918 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
919
920 return (rpcerror);
921 }
922
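/*
 * VALID_FH is true when the rnode's filehandle came from the server
 * this mount currently points at. INC_READERS/DEC_READERS are called
 * with mi_lock held and implement the "poor man's interruptible
 * rw_enter()" described in rfscall()/aclcall(); the failover code
 * waits on mi_failover_cv for the reader count to drain to zero.
 */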
923 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
924 #define INC_READERS(mi) { \
925 mi->mi_readers++; \
926 }
927 #define DEC_READERS(mi) { \
928 mi->mi_readers--; \
929 if (mi->mi_readers == 0) \
930 cv_broadcast(&mi->mi_failover_cv); \
931 }
932
933 static int
934 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
935 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
936 enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
937 {
938 CLIENT *client;
939 struct chtab *ch;
940 cred_t *cr = icr;
941 enum clnt_stat status;
942 struct rpc_err rpcerr, rpcerr_tmp;
943 struct timeval wait;
944 int timeo; /* in units of hz */
945 int my_rsize, my_wsize;
946 bool_t tryagain;
947 bool_t cred_cloned = FALSE;
948 k_sigset_t smask;
949 servinfo_t *svp;
950 struct nfs_clnt *nfscl;
951 zoneid_t zoneid = getzoneid();
952 char *msg;
953 #ifdef DEBUG
954 char *bufp;
955 #endif
956
957
958 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
959 "rfscall_start:which %d mi %p", which, mi);
960
961 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
962 ASSERT(nfscl != NULL);
963
964 nfscl->nfscl_stat.calls.value.ui64++;
965 mi->mi_reqs[which].value.ui64++;
966
967 rpcerr.re_status = RPC_SUCCESS;
968
969 /*
970 * In case of forced unmount or zone shutdown, return EIO.
971 */
972
973 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
974 rpcerr.re_status = RPC_FAILED;
975 rpcerr.re_errno = EIO;
976 return (rpcerr.re_errno);
977 }
978
979 /*
980 * Remember the transfer sizes in case
981 * nfs_feedback changes them underneath us.
982 */
983 my_rsize = mi->mi_curread;
984 my_wsize = mi->mi_curwrite;
985
986 /*
987 * NFS client failover support
988 *
989 * If this rnode is not in sync with the current server (VALID_FH),
990 * we'd like to do a remap to get in sync. We can be interrupted
991 * in failover_remap(), and if so we'll bail. Otherwise, we'll
992 * use the best info we have to try the RPC. Part of that is
993 * unconditionally updating the filehandle copy kept for V3.
994 *
995 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
996 * rw_enter(); we're trying to keep the current server from being
997 * changed on us until we're done with the remapping and have a
998 * matching client handle. We don't want to send a filehandle
999 * to the wrong host.
1000 */
1001 failoverretry:
1002 if (FAILOVER_MOUNT(mi)) {
1003 mutex_enter(&mi->mi_lock);
1004 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1005 if (failover_wait(mi)) {
1006 mutex_exit(&mi->mi_lock);
1007 return (EINTR);
1008 }
1009 }
1010 INC_READERS(mi);
1011 mutex_exit(&mi->mi_lock);
1012 if (fi) {
1013 if (!VALID_FH(fi) &&
1014 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1015 int remaperr;
1016
1017 svp = mi->mi_curr_serv;
1018 remaperr = failover_remap(fi);
1019 if (remaperr != 0) {
1020 #ifdef DEBUG
1021 if (remaperr != EINTR)
1022 nfs_cmn_err(remaperr, CE_WARN,
1023 "rfscall couldn't failover: %m");
1024 #endif
1025 mutex_enter(&mi->mi_lock);
1026 DEC_READERS(mi);
1027 mutex_exit(&mi->mi_lock);
1028 /*
1029 * If failover_remap returns ETIMEDOUT
1030 * and the filesystem is hard mounted
1031 * we have to retry the call with a new
1032 * server.
1033 */
1034 if ((mi->mi_flags & MI_HARD) &&
1035 IS_RECOVERABLE_ERROR(remaperr)) {
1036 if (svp == mi->mi_curr_serv)
1037 failover_newserver(mi);
1038 rpcerr.re_status = RPC_SUCCESS;
1039 goto failoverretry;
1040 }
1041 rpcerr.re_errno = remaperr;
1042 return (remaperr);
1043 }
1044 }
1045 if (fi->fhp && fi->copyproc)
1046 (*fi->copyproc)(fi->fhp, fi->vp);
1047 }
1048 }
1049
1050 /* For TSOL, use a new cred which has net_mac_aware flag */
1051 if (!cred_cloned && is_system_labeled()) {
1052 cred_cloned = TRUE;
1053 cr = crdup(icr);
1054 (void) setpflags(NET_MAC_AWARE, 1, cr);
1055 }
1056
1057 /*
1058 * clget() calls clnt_tli_kinit() which clears the xid, so we
1059 * are guaranteed to reprocess the retry as a new request.
1060 */
1061 svp = mi->mi_curr_serv;
1062 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1063
1064 if (FAILOVER_MOUNT(mi)) {
1065 mutex_enter(&mi->mi_lock);
1066 DEC_READERS(mi);
1067 mutex_exit(&mi->mi_lock);
1068
1069 if ((rpcerr.re_errno == ETIMEDOUT ||
1070 rpcerr.re_errno == ECONNRESET) &&
1071 failover_safe(fi)) {
1072 if (svp == mi->mi_curr_serv)
1073 failover_newserver(mi);
1074 goto failoverretry;
1075 }
1076 }
1077 if (rpcerr.re_errno != 0)
1078 return (rpcerr.re_errno);
1079
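/*
 * Connection-oriented transports use the fixed mount timeout
 * (mi_timeo is in tenths of a second, converted here to ticks);
 * datagram transports use the adaptive retransmit timers maintained
 * through CLNT_SETTIMERS.
 */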
1080 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1081 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1082 timeo = (mi->mi_timeo * hz) / 10;
1083 } else {
1084 mutex_enter(&mi->mi_lock);
1085 timeo = CLNT_SETTIMERS(client,
1086 &(mi->mi_timers[mi->mi_timer_type[which]]),
1087 &(mi->mi_timers[NFS_CALLTYPES]),
1088 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1089 (void (*)())NULL, (caddr_t)mi, 0);
1090 mutex_exit(&mi->mi_lock);
1091 }
1092
1093 /*
1094 * If hard mounted fs, retry call forever unless hard error occurs.
1095 */
1096 do {
1097 tryagain = FALSE;
1098
1099 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1100 status = RPC_FAILED;
1101 rpcerr.re_status = RPC_FAILED;
1102 rpcerr.re_errno = EIO;
1103 break;
1104 }
1105
1106 TICK_TO_TIMEVAL(timeo, &wait);
1107
1108 /*
1109 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1110 * and SIGTERM. (Preserving the existing masks).
1111 * Mask out SIGINT if mount option nointr is specified.
1112 */
1113 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1114 if (!(mi->mi_flags & MI_INT))
1115 client->cl_nosignal = TRUE;
1116
1117 /*
1118 * If there is a current signal, then don't bother
1119 * even trying to send out the request because we
1120 * won't be able to block waiting for the response.
1121 * Simply assume RPC_INTR and get on with it.
1122 */
1123 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1124 status = RPC_INTR;
1125 else {
1126 status = CLNT_CALL(client, which, xdrargs, argsp,
1127 xdrres, resp, wait);
1128 }
1129
1130 if (!(mi->mi_flags & MI_INT))
1131 client->cl_nosignal = FALSE;
1132 /*
1133 * restore original signal mask
1134 */
1135 sigunintr(&smask);
1136
1137 switch (status) {
1138 case RPC_SUCCESS:
1139 if ((mi->mi_flags & MI_DYNAMIC) &&
1140 mi->mi_timer_type[which] != 0 &&
1141 (mi->mi_curread != my_rsize ||
1142 mi->mi_curwrite != my_wsize))
1143 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1144 break;
1145
1146 case RPC_INTR:
1147 /*
1148 * There is no way to recover from this error,
1149 * even if mount option nointr is specified.
1150 * SIGKILL, for example, cannot be blocked.
1151 */
1152 rpcerr.re_status = RPC_INTR;
1153 rpcerr.re_errno = EINTR;
1154 break;
1155
1156 case RPC_UDERROR:
1157 /*
1158 * If the NFS server is local (vold) and
1159 * it goes away then we get RPC_UDERROR.
1160 * This is a retryable error, so we would
1161 * loop, so check to see if the specific
1162 * error was ECONNRESET, indicating that
1163 * the target did not exist at all. If so,
1164 * return with RPC_PROGUNAVAIL and
1165 * ECONNRESET to indicate why.
1166 */
1167 CLNT_GETERR(client, &rpcerr);
1168 if (rpcerr.re_errno == ECONNRESET) {
1169 rpcerr.re_status = RPC_PROGUNAVAIL;
1170 rpcerr.re_errno = ECONNRESET;
1171 break;
1172 }
1173 /*FALLTHROUGH*/
1174
1175 default: /* probably RPC_TIMEDOUT */
1176 if (IS_UNRECOVERABLE_RPC(status))
1177 break;
1178
1179 /*
1180 * increment server not responding count
1181 */
1182 mutex_enter(&mi->mi_lock);
1183 mi->mi_noresponse++;
1184 mutex_exit(&mi->mi_lock);
1185 #ifdef DEBUG
1186 nfscl->nfscl_stat.noresponse.value.ui64++;
1187 #endif
1188
1189 if (!(mi->mi_flags & MI_HARD)) {
1190 if (!(mi->mi_flags & MI_SEMISOFT) ||
1191 (mi->mi_ss_call_type[which] == 0))
1192 break;
1193 }
1194
1195 /*
1196 * The call is in progress (over COTS).
1197 * Try the CLNT_CALL again, but don't
1198 * print a noisy error message.
1199 */
1200 if (status == RPC_INPROGRESS) {
1201 tryagain = TRUE;
1202 break;
1203 }
1204
1205 if (flags & RFSCALL_SOFT)
1206 break;
1207
1208 /*
1209 * On zone shutdown, just move on.
1210 */
1211 if (zone_status_get(curproc->p_zone) >=
1212 ZONE_IS_SHUTTING_DOWN) {
1213 rpcerr.re_status = RPC_FAILED;
1214 rpcerr.re_errno = EIO;
1215 break;
1216 }
1217
1218 /*
1219 * NFS client failover support
1220 *
1221 * If the current server just failed us, we'll
1222 * start the process of finding a new server.
1223 * After that, we can just retry.
1224 */
1225 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1226 if (svp == mi->mi_curr_serv)
1227 failover_newserver(mi);
1228 clfree_impl(client, ch, nfscl);
1229 goto failoverretry;
1230 }
1231
1232 tryagain = TRUE;
1233 timeo = backoff(timeo);
1234
1235 CLNT_GETERR(client, &rpcerr_tmp);
1236 if ((status == RPC_CANTSEND) &&
1237 (rpcerr_tmp.re_errno == ENOBUFS))
1238 msg = SRV_QFULL_MSG;
1239 else
1240 msg = SRV_NOTRESP_MSG;
1241
1242 mutex_enter(&mi->mi_lock);
1243 if (!(mi->mi_flags & MI_PRINTED)) {
1244 mi->mi_flags |= MI_PRINTED;
1245 mutex_exit(&mi->mi_lock);
1246 #ifdef DEBUG
1247 zprintf(zoneid, msg, mi->mi_vers,
1248 svp->sv_hostname);
1249 #else
1250 zprintf(zoneid, msg, svp->sv_hostname);
1251 #endif
1252 } else
1253 mutex_exit(&mi->mi_lock);
1254 if (*douprintf && nfs_has_ctty()) {
1255 *douprintf = 0;
1256 if (!(mi->mi_flags & MI_NOPRINT))
1257 #ifdef DEBUG
1258 uprintf(msg, mi->mi_vers,
1259 svp->sv_hostname);
1260 #else
1261 uprintf(msg, svp->sv_hostname);
1262 #endif
1263 }
1264
1265 /*
1266 * If doing dynamic adjustment of transfer
1267 * size and if it's a read or write call
1268 * and if the transfer size changed while
1269 * retransmitting or if the feedback routine
1270 * changed the transfer size,
1271 * then exit rfscall so that the transfer
1272 * size can be adjusted at the vnops level.
1273 */
1274 if ((mi->mi_flags & MI_DYNAMIC) &&
1275 mi->mi_timer_type[which] != 0 &&
1276 (mi->mi_curread != my_rsize ||
1277 mi->mi_curwrite != my_wsize ||
1278 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1279 /*
1280 * On read or write calls, return
1281 * back to the vnode ops level if
1282 * the transfer size changed.
1283 */
1284 clfree_impl(client, ch, nfscl);
1285 if (cred_cloned)
1286 crfree(cr);
1287 return (ENFS_TRYAGAIN);
1288 }
1289 }
1290 } while (tryagain);
1291
1292 if (status != RPC_SUCCESS) {
1293 /*
1294 * Let soft mounts use the timed out message.
1295 */
1296 if (status == RPC_INPROGRESS)
1297 status = RPC_TIMEDOUT;
1298 nfscl->nfscl_stat.badcalls.value.ui64++;
1299 if (status != RPC_INTR) {
1300 mutex_enter(&mi->mi_lock);
1301 mi->mi_flags |= MI_DOWN;
1302 mutex_exit(&mi->mi_lock);
1303 CLNT_GETERR(client, &rpcerr);
1304 #ifdef DEBUG
1305 bufp = clnt_sperror(client, svp->sv_hostname);
1306 zprintf(zoneid, "NFS%d %s failed for %s\n",
1307 mi->mi_vers, mi->mi_rfsnames[which], bufp);
1308 if (nfs_has_ctty()) {
1309 if (!(mi->mi_flags & MI_NOPRINT)) {
1310 uprintf("NFS%d %s failed for %s\n",
1311 mi->mi_vers, mi->mi_rfsnames[which],
1312 bufp);
1313 }
1314 }
1315 kmem_free(bufp, MAXPATHLEN);
1316 #else
1317 zprintf(zoneid,
1318 "NFS %s failed for server %s: error %d (%s)\n",
1319 mi->mi_rfsnames[which], svp->sv_hostname,
1320 status, clnt_sperrno(status));
1321 if (nfs_has_ctty()) {
1322 if (!(mi->mi_flags & MI_NOPRINT)) {
1323 uprintf(
1324 "NFS %s failed for server %s: error %d (%s)\n",
1325 mi->mi_rfsnames[which],
1326 svp->sv_hostname, status,
1327 clnt_sperrno(status));
1328 }
1329 }
1330 #endif
1331 /*
1332 * when CLNT_CALL() fails with RPC_AUTHERROR,
1333 * re_errno is set appropriately depending on
1334 * the authentication error
1335 */
1336 if (status == RPC_VERSMISMATCH ||
1337 status == RPC_PROGVERSMISMATCH)
1338 rpcerr.re_errno = EIO;
1339 }
1340 } else {
1341 /*
1342 * Test the value of mi_down and mi_printed without
1343 * holding the mi_lock mutex. If they are both zero,
1344 * then it is okay to skip the down and printed
1345 * processing. This saves on a mutex_enter and
1346 * mutex_exit pair for a normal, successful RPC.
1347 * This was just complete overhead.
1348 */
1349 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1350 mutex_enter(&mi->mi_lock);
1351 mi->mi_flags &= ~MI_DOWN;
1352 if (mi->mi_flags & MI_PRINTED) {
1353 mi->mi_flags &= ~MI_PRINTED;
1354 mutex_exit(&mi->mi_lock);
1355 #ifdef DEBUG
1356 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1357 zprintf(zoneid, "NFS%d server %s ok\n",
1358 mi->mi_vers, svp->sv_hostname);
1359 #else
1360 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1361 zprintf(zoneid, "NFS server %s ok\n",
1362 svp->sv_hostname);
1363 #endif
1364 } else
1365 mutex_exit(&mi->mi_lock);
1366 }
1367
1368 if (*douprintf == 0) {
1369 if (!(mi->mi_flags & MI_NOPRINT))
1370 #ifdef DEBUG
1371 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1372 uprintf("NFS%d server %s ok\n",
1373 mi->mi_vers, svp->sv_hostname);
1374 #else
1375 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1376 uprintf("NFS server %s ok\n", svp->sv_hostname);
1377 #endif
1378 *douprintf = 1;
1379 }
1380 }
1381
1382 clfree_impl(client, ch, nfscl);
1383 if (cred_cloned)
1384 crfree(cr);
1385
1386 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1387
1388 if (rpc_status != NULL)
1389 *rpc_status = rpcerr.re_status;
1390
1391 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1392 rpcerr.re_errno);
1393
1394 return (rpcerr.re_errno);
1395 }
1396
1397 #ifdef DEBUG
1398 static int acl2call_hits = 0;
1399 static int acl2call_misses = 0;
1400 #endif
1401
1402 int
1403 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1404 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1405 enum nfsstat *statusp, int flags, failinfo_t *fi)
1406 {
1407 int rpcerror;
1408
1409 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1410 cr, douprintf, flags, fi);
1411 if (!rpcerror) {
1412 /*
1413 * See comments with crnetadjust().
1414 */
1415 if (*statusp == NFSERR_ACCES &&
1416 (cr = crnetadjust(cr)) != NULL) {
1417 #ifdef DEBUG
1418 acl2call_hits++;
1419 #endif
1420 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1421 resp, cr, douprintf, flags, fi);
1422 crfree(cr);
1423 #ifdef DEBUG
1424 if (*statusp == NFSERR_ACCES)
1425 acl2call_misses++;
1426 #endif
1427 }
1428 }
1429
1430 return (rpcerror);
1431 }
1432
1433 #ifdef DEBUG
1434 static int acl3call_hits = 0;
1435 static int acl3call_misses = 0;
1436 #endif
1437
1438 int
1439 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1440 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1441 nfsstat3 *statusp, int flags, failinfo_t *fi)
1442 {
1443 int rpcerror;
1444 int user_informed;
1445
1446 user_informed = 0;
1447
1448 do {
1449 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1450 cr, douprintf, flags, fi);
1451 if (!rpcerror) {
1452 cred_t *crr;
1453 if (*statusp == NFS3ERR_JUKEBOX) {
1454 if (!user_informed) {
1455 user_informed = 1;
1456 uprintf(
1457 "file temporarily unavailable on the server, retrying...\n");
1458 }
1459 delay(nfs3_jukebox_delay);
1460 }
1461 /*
1462 * See crnetadjust() for comments.
1463 */
1464 else if (*statusp == NFS3ERR_ACCES &&
1465 (crr = crnetadjust(cr)) != NULL) {
1466 #ifdef DEBUG
1467 acl3call_hits++;
1468 #endif
1469 rpcerror = aclcall(mi, which, xdrargs, argsp,
1470 xdrres, resp, crr, douprintf, flags, fi);
1471
1472 crfree(crr);
1473 #ifdef DEBUG
1474 if (*statusp == NFS3ERR_ACCES)
1475 acl3call_misses++;
1476 #endif
1477 }
1478 }
1479 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1480
1481 return (rpcerror);
1482 }
1483
1484 static int
1485 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1486 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1487 int flags, failinfo_t *fi)
1488 {
1489 CLIENT *client;
1490 struct chtab *ch;
1491 cred_t *cr = icr;
1492 bool_t cred_cloned = FALSE;
1493 enum clnt_stat status;
1494 struct rpc_err rpcerr;
1495 struct timeval wait;
1496 int timeo; /* in units of hz */
1497 #if 0 /* notyet */
1498 int my_rsize, my_wsize;
1499 #endif
1500 bool_t tryagain;
1501 k_sigset_t smask;
1502 servinfo_t *svp;
1503 struct nfs_clnt *nfscl;
1504 zoneid_t zoneid = getzoneid();
1505 #ifdef DEBUG
1506 char *bufp;
1507 #endif
1508
1509 #if 0 /* notyet */
1510 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1511 "rfscall_start:which %d mi %p", which, mi);
1512 #endif
1513
1514 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1515 ASSERT(nfscl != NULL);
1516
1517 nfscl->nfscl_stat.calls.value.ui64++;
1518 mi->mi_aclreqs[which].value.ui64++;
1519
1520 rpcerr.re_status = RPC_SUCCESS;
1521
1522 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1523 rpcerr.re_status = RPC_FAILED;
1524 rpcerr.re_errno = EIO;
1525 return (rpcerr.re_errno);
1526 }
1527
1528 #if 0 /* notyet */
1529 /*
1530 * Remember the transfer sizes in case
1531 * nfs_feedback changes them underneath us.
1532 */
1533 my_rsize = mi->mi_curread;
1534 my_wsize = mi->mi_curwrite;
1535 #endif
1536
1537 /*
1538 * NFS client failover support
1539 *
1540 * If this rnode is not in sync with the current server (VALID_FH),
1541 * we'd like to do a remap to get in sync. We can be interrupted
1542 * in failover_remap(), and if so we'll bail. Otherwise, we'll
1543 * use the best info we have to try the RPC. Part of that is
1544 * unconditionally updating the filehandle copy kept for V3.
1545 *
1546 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1547 * rw_enter(); we're trying to keep the current server from being
1548 * changed on us until we're done with the remapping and have a
1549 * matching client handle. We don't want to send a filehandle
1550 * to the wrong host.
1551 */
1552 failoverretry:
1553 if (FAILOVER_MOUNT(mi)) {
1554 mutex_enter(&mi->mi_lock);
1555 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1556 if (failover_wait(mi)) {
1557 mutex_exit(&mi->mi_lock);
1558 return (EINTR);
1559 }
1560 }
1561 INC_READERS(mi);
1562 mutex_exit(&mi->mi_lock);
1563 if (fi) {
1564 if (!VALID_FH(fi) &&
1565 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1566 int remaperr;
1567
1568 svp = mi->mi_curr_serv;
1569 remaperr = failover_remap(fi);
1570 if (remaperr != 0) {
1571 #ifdef DEBUG
1572 if (remaperr != EINTR)
1573 nfs_cmn_err(remaperr, CE_WARN,
1574 "aclcall couldn't failover: %m");
1575 #endif
1576 mutex_enter(&mi->mi_lock);
1577 DEC_READERS(mi);
1578 mutex_exit(&mi->mi_lock);
1579
1580 /*
1581 * If failover_remap returns ETIMEDOUT
1582 * and the filesystem is hard mounted
1583 * we have to retry the call with a new
1584 * server.
1585 */
1586 if ((mi->mi_flags & MI_HARD) &&
1587 IS_RECOVERABLE_ERROR(remaperr)) {
1588 if (svp == mi->mi_curr_serv)
1589 failover_newserver(mi);
1590 rpcerr.re_status = RPC_SUCCESS;
1591 goto failoverretry;
1592 }
1593 return (remaperr);
1594 }
1595 }
1596 if (fi->fhp && fi->copyproc)
1597 (*fi->copyproc)(fi->fhp, fi->vp);
1598 }
1599 }
1600
1601 /* For TSOL, use a new cred which has net_mac_aware flag */
1602 if (!cred_cloned && is_system_labeled()) {
1603 cred_cloned = TRUE;
1604 cr = crdup(icr);
1605 (void) setpflags(NET_MAC_AWARE, 1, cr);
1606 }
1607
1608 /*
1609 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1610 * are guaranteed to reprocess the retry as a new request.
1611 */
1612 svp = mi->mi_curr_serv;
1613 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1614 if (FAILOVER_MOUNT(mi)) {
1615 mutex_enter(&mi->mi_lock);
1616 DEC_READERS(mi);
1617 mutex_exit(&mi->mi_lock);
1618
1619 if ((rpcerr.re_errno == ETIMEDOUT ||
1620 rpcerr.re_errno == ECONNRESET) &&
1621 failover_safe(fi)) {
1622 if (svp == mi->mi_curr_serv)
1623 failover_newserver(mi);
1624 goto failoverretry;
1625 }
1626 }
1627 if (rpcerr.re_errno != 0) {
1628 if (cred_cloned)
1629 crfree(cr);
1630 return (rpcerr.re_errno);
1631 }
1632
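/*
 * As in rfscall(): fixed mi_timeo-based timeout for connection-
 * oriented transports, adaptive CLNT_SETTIMERS timers otherwise.
 */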
1633 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1634 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1635 timeo = (mi->mi_timeo * hz) / 10;
1636 } else {
1637 mutex_enter(&mi->mi_lock);
1638 timeo = CLNT_SETTIMERS(client,
1639 &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1640 &(mi->mi_timers[NFS_CALLTYPES]),
1641 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1642 (void (*)()) 0, (caddr_t)mi, 0);
1643 mutex_exit(&mi->mi_lock);
1644 }
1645
1646 /*
1647 * If hard mounted fs, retry call forever unless hard error occurs.
1648 */
1649 do {
1650 tryagain = FALSE;
1651
1652 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1653 status = RPC_FAILED;
1654 rpcerr.re_status = RPC_FAILED;
1655 rpcerr.re_errno = EIO;
1656 break;
1657 }
1658
1659 TICK_TO_TIMEVAL(timeo, &wait);
1660
1661 /*
1662 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1663 * and SIGTERM. (Preserving the existing masks).
1664 * Mask out SIGINT if mount option nointr is specified.
1665 */
1666 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1667 if (!(mi->mi_flags & MI_INT))
1668 client->cl_nosignal = TRUE;
1669
1670 /*
1671 * If there is a current signal, then don't bother
1672 * even trying to send out the request because we
1673 * won't be able to block waiting for the response.
1674 * Simply assume RPC_INTR and get on with it.
1675 */
1676 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1677 status = RPC_INTR;
1678 else {
1679 status = CLNT_CALL(client, which, xdrargs, argsp,
1680 xdrres, resp, wait);
1681 }
1682
1683 if (!(mi->mi_flags & MI_INT))
1684 client->cl_nosignal = FALSE;
1685 /*
1686 * restore original signal mask
1687 */
1688 sigunintr(&smask);
1689
1690 switch (status) {
1691 case RPC_SUCCESS:
1692 #if 0 /* notyet */
1693 if ((mi->mi_flags & MI_DYNAMIC) &&
1694 mi->mi_timer_type[which] != 0 &&
1695 (mi->mi_curread != my_rsize ||
1696 mi->mi_curwrite != my_wsize))
1697 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1698 #endif
1699 break;
1700
1701 /*
1702 * Unfortunately, there are servers in the world which
1703 * are not coded correctly. They are not prepared to
1704 * handle RPC requests to the NFS port which are not
1705 * NFS requests. Thus, they may try to process the
1706 * NFS_ACL request as if it were an NFS request. This
1707 * does not work. Generally, an error will be generated
1708 * on the client because it will not be able to decode
1709 * the response from the server. However, it seems
1710 * possible that the server may not be able to decode
1711 * the arguments. Thus, the criteria for deciding
1712 * whether the server supports NFS_ACL or not is whether
1713 * the following RPC errors are returned from CLNT_CALL.
1714 */
1715 case RPC_CANTDECODERES:
1716 case RPC_PROGUNAVAIL:
1717 case RPC_CANTDECODEARGS:
1718 case RPC_PROGVERSMISMATCH:
1719 mutex_enter(&mi->mi_lock);
1720 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1721 mutex_exit(&mi->mi_lock);
1722 break;
1723
1724 /*
1725 * If the server supports NFS_ACL but not the new ops
1726 * for extended attributes, make sure we don't retry.
1727 */
1728 case RPC_PROCUNAVAIL:
1729 mutex_enter(&mi->mi_lock);
1730 mi->mi_flags &= ~MI_EXTATTR;
1731 mutex_exit(&mi->mi_lock);
1732 break;
1733
1734 case RPC_INTR:
1735 /*
1736 * There is no way to recover from this error,
1737 * even if mount option nointr is specified.
1738 * SIGKILL, for example, cannot be blocked.
1739 */
1740 rpcerr.re_status = RPC_INTR;
1741 rpcerr.re_errno = EINTR;
1742 break;
1743
1744 case RPC_UDERROR:
1745 /*
1746 * If the NFS server is local (vold) and
1747 * it goes away then we get RPC_UDERROR.
1748 * This is a retryable error, so we would
1749 * loop, so check to see if the specific
1750 * error was ECONNRESET, indicating that
1751 * the target did not exist at all. If so,
1752 * return with RPC_PROGUNAVAIL and
1753 * ECONNRESET to indicate why.
1754 */
1755 CLNT_GETERR(client, &rpcerr);
1756 if (rpcerr.re_errno == ECONNRESET) {
1757 rpcerr.re_status = RPC_PROGUNAVAIL;
1758 rpcerr.re_errno = ECONNRESET;
1759 break;
1760 }
1761 /*FALLTHROUGH*/
1762
1763 default: /* probably RPC_TIMEDOUT */
1764 if (IS_UNRECOVERABLE_RPC(status))
1765 break;
1766
1767 /*
1768 * increment server not responding count
1769 */
1770 mutex_enter(&mi->mi_lock);
1771 mi->mi_noresponse++;
1772 mutex_exit(&mi->mi_lock);
1773 #ifdef DEBUG
1774 nfscl->nfscl_stat.noresponse.value.ui64++;
1775 #endif
1776
1777 if (!(mi->mi_flags & MI_HARD)) {
1778 if (!(mi->mi_flags & MI_SEMISOFT) ||
1779 (mi->mi_acl_ss_call_type[which] == 0))
1780 break;
1781 }
1782
1783 /*
1784 * The call is in progress (over COTS).
1785 * Try the CLNT_CALL again, but don't
1786 * print a noisy error message.
1787 */
1788 if (status == RPC_INPROGRESS) {
1789 tryagain = TRUE;
1790 break;
1791 }
1792
1793 if (flags & RFSCALL_SOFT)
1794 break;
1795
1796 /*
1797 * On zone shutdown, just move on.
1798 */
1799 if (zone_status_get(curproc->p_zone) >=
1800 ZONE_IS_SHUTTING_DOWN) {
1801 rpcerr.re_status = RPC_FAILED;
1802 rpcerr.re_errno = EIO;
1803 break;
1804 }
1805
1806 /*
1807 * NFS client failover support
1808 *
1809 * If the current server just failed us, we'll
1810 * start the process of finding a new server.
1811 * After that, we can just retry.
1812 */
1813 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1814 if (svp == mi->mi_curr_serv)
1815 failover_newserver(mi);
1816 clfree_impl(client, ch, nfscl);
1817 goto failoverretry;
1818 }
1819
1820 tryagain = TRUE;
1821 timeo = backoff(timeo);
1822 mutex_enter(&mi->mi_lock);
1823 if (!(mi->mi_flags & MI_PRINTED)) {
1824 mi->mi_flags |= MI_PRINTED;
1825 mutex_exit(&mi->mi_lock);
1826 #ifdef DEBUG
1827 zprintf(zoneid,
1828 "NFS_ACL%d server %s not responding still trying\n",
1829 mi->mi_vers, svp->sv_hostname);
1830 #else
1831 zprintf(zoneid,
1832 "NFS server %s not responding still trying\n",
1833 svp->sv_hostname);
1834 #endif
1835 } else
1836 mutex_exit(&mi->mi_lock);
1837 if (*douprintf && nfs_has_ctty()) {
1838 *douprintf = 0;
1839 if (!(mi->mi_flags & MI_NOPRINT))
1840 #ifdef DEBUG
1841 uprintf(
1842 "NFS_ACL%d server %s not responding still trying\n",
1843 mi->mi_vers, svp->sv_hostname);
1844 #else
1845 uprintf(
1846 "NFS server %s not responding still trying\n",
1847 svp->sv_hostname);
1848 #endif
1849 }
1850
1851 #if 0 /* notyet */
1852 /*
1853 * If doing dynamic adjustment of transfer
1854 * size and if it's a read or write call
1855 * and if the transfer size changed while
1856 * retransmitting or if the feedback routine
1857 * changed the transfer size,
1858 * then exit rfscall so that the transfer
1859 * size can be adjusted at the vnops level.
1860 */
1861 if ((mi->mi_flags & MI_DYNAMIC) &&
1862 mi->mi_acl_timer_type[which] != 0 &&
1863 (mi->mi_curread != my_rsize ||
1864 mi->mi_curwrite != my_wsize ||
1865 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1866 /*
1867 * On read or write calls, return
1868 * back to the vnode ops level if
1869 * the transfer size changed.
1870 */
1871 clfree_impl(client, ch, nfscl);
1872 if (cred_cloned)
1873 crfree(cr);
1874 return (ENFS_TRYAGAIN);
1875 }
1876 #endif
1877 }
1878 } while (tryagain);
1879
1880 if (status != RPC_SUCCESS) {
1881 /*
1882 * Let soft mounts use the timed out message.
1883 */
1884 if (status == RPC_INPROGRESS)
1885 status = RPC_TIMEDOUT;
1886 nfscl->nfscl_stat.badcalls.value.ui64++;
1887 if (status == RPC_CANTDECODERES ||
1888 status == RPC_PROGUNAVAIL ||
1889 status == RPC_PROCUNAVAIL ||
1890 status == RPC_CANTDECODEARGS ||
1891 status == RPC_PROGVERSMISMATCH)
1892 CLNT_GETERR(client, &rpcerr);
1893 else if (status != RPC_INTR) {
1894 mutex_enter(&mi->mi_lock);
1895 mi->mi_flags |= MI_DOWN;
1896 mutex_exit(&mi->mi_lock);
1897 CLNT_GETERR(client, &rpcerr);
1898 #ifdef DEBUG
1899 bufp = clnt_sperror(client, svp->sv_hostname);
1900 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1901 mi->mi_vers, mi->mi_aclnames[which], bufp);
1902 if (nfs_has_ctty()) {
1903 if (!(mi->mi_flags & MI_NOPRINT)) {
1904 uprintf("NFS_ACL%d %s failed for %s\n",
1905 mi->mi_vers, mi->mi_aclnames[which],
1906 bufp);
1907 }
1908 }
1909 kmem_free(bufp, MAXPATHLEN);
1910 #else
1911 zprintf(zoneid,
1912 "NFS %s failed for server %s: error %d (%s)\n",
1913 mi->mi_aclnames[which], svp->sv_hostname,
1914 status, clnt_sperrno(status));
1915 if (nfs_has_ctty()) {
1916 if (!(mi->mi_flags & MI_NOPRINT))
1917 uprintf(
1918 "NFS %s failed for server %s: error %d (%s)\n",
1919 mi->mi_aclnames[which],
1920 svp->sv_hostname, status,
1921 clnt_sperrno(status));
1922 }
1923 #endif
1924 /*
1925 * when CLNT_CALL() fails with RPC_AUTHERROR,
1926 * re_errno is set appropriately depending on
1927 * the authentication error
1928 */
1929 if (status == RPC_VERSMISMATCH ||
1930 status == RPC_PROGVERSMISMATCH)
1931 rpcerr.re_errno = EIO;
1932 }
1933 } else {
1934 /*
1935 * Test the value of mi_down and mi_printed without
1936 * holding the mi_lock mutex. If they are both zero,
1937 * then it is okay to skip the down and printed
1938 * processing. This saves a mutex_enter and
1939 * mutex_exit pair for a normal, successful RPC,
1940 * which would otherwise be pure overhead.
1941 */
1942 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1943 mutex_enter(&mi->mi_lock);
1944 mi->mi_flags &= ~MI_DOWN;
1945 if (mi->mi_flags & MI_PRINTED) {
1946 mi->mi_flags &= ~MI_PRINTED;
1947 mutex_exit(&mi->mi_lock);
1948 #ifdef DEBUG
1949 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1950 mi->mi_vers, svp->sv_hostname);
1951 #else
1952 zprintf(zoneid, "NFS server %s ok\n",
1953 svp->sv_hostname);
1954 #endif
1955 } else
1956 mutex_exit(&mi->mi_lock);
1957 }
1958
1959 if (*douprintf == 0) {
1960 if (!(mi->mi_flags & MI_NOPRINT))
1961 #ifdef DEBUG
1962 uprintf("NFS_ACL%d server %s ok\n",
1963 mi->mi_vers, svp->sv_hostname);
1964 #else
1965 uprintf("NFS server %s ok\n", svp->sv_hostname);
1966 #endif
1967 *douprintf = 1;
1968 }
1969 }
1970
1971 clfree_impl(client, ch, nfscl);
1972 if (cred_cloned)
1973 crfree(cr);
1974
1975 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1976
1977 #if 0 /* notyet */
1978 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1979 rpcerr.re_errno);
1980 #endif
1981
1982 return (rpcerr.re_errno);
1983 }
1984
1985 int
1986 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1987 {
1988 uint_t mask = vap->va_mask;
1989
1990 if (!(mask & AT_MODE))
1991 sa->sa_mode = (uint32_t)-1;
1992 else
1993 sa->sa_mode = vap->va_mode;
1994 if (!(mask & AT_UID))
1995 sa->sa_uid = (uint32_t)-1;
1996 else
1997 sa->sa_uid = (uint32_t)vap->va_uid;
1998 if (!(mask & AT_GID))
1999 sa->sa_gid = (uint32_t)-1;
2000 else
2001 sa->sa_gid = (uint32_t)vap->va_gid;
2002 if (!(mask & AT_SIZE))
2003 sa->sa_size = (uint32_t)-1;
2004 else
2005 sa->sa_size = (uint32_t)vap->va_size;
2006 if (!(mask & AT_ATIME))
2007 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2008 else {
2009 /* check time validity */
2010 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2011 return (EOVERFLOW);
2012 }
2013 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2014 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2015 }
2016 if (!(mask & AT_MTIME))
2017 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2018 else {
2019 /* check time validity */
2020 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2021 return (EOVERFLOW);
2022 }
2023 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2024 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2025 }
2026 return (0);
2027 }
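
/*
 * Illustrative sketch (not part of the original code, kept under #if 0):
 * a caller that only wants to change the mode sets va_mask to AT_MODE
 * before calling vattr_to_sattr(); every field not selected by the mask
 * is filled with the (uint32_t)-1 "don't change" sentinel that the NFS
 * Version 2 protocol expects.  The local variables below are assumed.
 */
#if 0 /* illustrative example */
	struct vattr va;
	struct nfssattr sa;

	va.va_mask = AT_MODE;
	va.va_mode = 0644;
	if (vattr_to_sattr(&va, &sa) == 0) {
		/* sa.sa_mode is 0644; sa_uid, sa_gid and sa_size are all -1 */
	}
#endif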
2028
2029 int
2030 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2031 {
2032 uint_t mask = vap->va_mask;
2033
2034 if (!(mask & AT_MODE))
2035 sa->mode.set_it = FALSE;
2036 else {
2037 sa->mode.set_it = TRUE;
2038 sa->mode.mode = (mode3)vap->va_mode;
2039 }
2040 if (!(mask & AT_UID))
2041 sa->uid.set_it = FALSE;
2042 else {
2043 sa->uid.set_it = TRUE;
2044 sa->uid.uid = (uid3)vap->va_uid;
2045 }
2046 if (!(mask & AT_GID))
2047 sa->gid.set_it = FALSE;
2048 else {
2049 sa->gid.set_it = TRUE;
2050 sa->gid.gid = (gid3)vap->va_gid;
2051 }
2052 if (!(mask & AT_SIZE))
2053 sa->size.set_it = FALSE;
2054 else {
2055 sa->size.set_it = TRUE;
2056 sa->size.size = (size3)vap->va_size;
2057 }
2058 if (!(mask & AT_ATIME))
2059 sa->atime.set_it = DONT_CHANGE;
2060 else {
2061 /* check time validity */
2062 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2063 return (EOVERFLOW);
2064 }
2065 sa->atime.set_it = SET_TO_CLIENT_TIME;
2066 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2067 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2068 }
2069 if (!(mask & AT_MTIME))
2070 sa->mtime.set_it = DONT_CHANGE;
2071 else {
2072 /* check time validity */
2073 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2074 return (EOVERFLOW);
2075 }
2076 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2077 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2078 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2079 }
2080 return (0);
2081 }
2082
2083 void
2084 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2085 {
2086
2087 da->da_fhandle = VTOFH(dvp);
2088 da->da_name = nm;
2089 da->da_flags = 0;
2090 }
2091
2092 void
2093 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2094 {
2095
2096 da->dirp = VTOFH3(dvp);
2097 da->name = nm;
2098 }
2099
2100 int
2101 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2102 {
2103 int error;
2104 rnode_t *rp;
2105 struct vattr va;
2106
2107 va.va_mask = AT_MODE | AT_GID;
2108 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2109 if (error)
2110 return (error);
2111
2112 /*
2113 * To determine the expected group-id of the created file:
2114 * 1) If the filesystem was not mounted with the Old-BSD-compatible
2115 * GRPID option, and the directory's set-gid bit is clear,
2116 * then use the process's gid.
2117 * 2) Otherwise, set the group-id to the gid of the parent directory.
2118 */
2119 rp = VTOR(dvp);
2120 mutex_enter(&rp->r_statelock);
2121 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2122 *gidp = crgetgid(cr);
2123 else
2124 *gidp = va.va_gid;
2125 mutex_exit(&rp->r_statelock);
2126 return (0);
2127 }
2128
2129 int
2130 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2131 {
2132 int error;
2133 struct vattr va;
2134
2135 va.va_mask = AT_MODE;
2136 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2137 if (error)
2138 return (error);
2139
2140 /*
2141 * Modify the expected mode (omp) so that the set-gid bit matches
2142 * that of the parent directory (dvp).
2143 */
2144 if (va.va_mode & VSGID)
2145 *omp |= VSGID;
2146 else
2147 *omp &= ~VSGID;
2148 return (0);
2149 }
2150
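/*
 * Set or clear VSWAPLIKE on the vnode to match its new attributes: a
 * regular file whose mode has the sticky bit set but the execute bit
 * clear is flagged as swap-like; otherwise the flag is removed.
 */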
2151 void
2152 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2153 {
2154
2155 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2156 if (!(vp->v_flag & VSWAPLIKE)) {
2157 mutex_enter(&vp->v_lock);
2158 vp->v_flag |= VSWAPLIKE;
2159 mutex_exit(&vp->v_lock);
2160 }
2161 } else {
2162 if (vp->v_flag & VSWAPLIKE) {
2163 mutex_enter(&vp->v_lock);
2164 vp->v_flag &= ~VSWAPLIKE;
2165 mutex_exit(&vp->v_lock);
2166 }
2167 }
2168 }
2169
2170 /*
2171 * Free the resources associated with an rnode.
2172 */
2173 static void
2174 rinactive(rnode_t *rp, cred_t *cr)
2175 {
2176 vnode_t *vp;
2177 cred_t *cred;
2178 char *contents;
2179 int size;
2180 vsecattr_t *vsp;
2181 int error;
2182 nfs3_pathconf_info *info;
2183
2184 /*
2185 * Before freeing anything, wait until all asynchronous
2186 * activity is done on this rnode. This will allow all
2187 * asynchronous read ahead and write behind i/o's to
2188 * finish.
2189 */
2190 mutex_enter(&rp->r_statelock);
2191 while (rp->r_count > 0)
2192 cv_wait(&rp->r_cv, &rp->r_statelock);
2193 mutex_exit(&rp->r_statelock);
2194
2195 /*
2196 * Flush and invalidate all pages associated with the vnode.
2197 */
2198 vp = RTOV(rp);
2199 if (vn_has_cached_data(vp)) {
2200 ASSERT(vp->v_type != VCHR);
2201 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2202 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2203 if (error && (error == ENOSPC || error == EDQUOT)) {
2204 mutex_enter(&rp->r_statelock);
2205 if (!rp->r_error)
2206 rp->r_error = error;
2207 mutex_exit(&rp->r_statelock);
2208 }
2209 }
2210 nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2211 }
2212
2213 /*
2214 * Free any held credentials and caches which may be associated
2215 * with this rnode.
2216 */
2217 mutex_enter(&rp->r_statelock);
2218 cred = rp->r_cred;
2219 rp->r_cred = NULL;
2220 contents = rp->r_symlink.contents;
2221 size = rp->r_symlink.size;
2222 rp->r_symlink.contents = NULL;
2223 vsp = rp->r_secattr;
2224 rp->r_secattr = NULL;
2225 info = rp->r_pathconf;
2226 rp->r_pathconf = NULL;
2227 mutex_exit(&rp->r_statelock);
2228
2229 /*
2230 * Free the held credential.
2231 */
2232 if (cred != NULL)
2233 crfree(cred);
2234
2235 /*
2236 * Free the access cache entries.
2237 */
2238 (void) nfs_access_purge_rp(rp);
2239
2240 /*
2241 * Free the readdir cache entries.
2242 */
2243 if (HAVE_RDDIR_CACHE(rp))
2244 nfs_purge_rddir_cache(vp);
2245
2246 /*
2247 * Free the symbolic link cache.
2248 */
2249 if (contents != NULL) {
2250
2251 kmem_free((void *)contents, size);
2252 }
2253
2254 /*
2255 * Free any cached ACL.
2256 */
2257 if (vsp != NULL)
2258 nfs_acl_free(vsp);
2259
2260 /*
2261 * Free any cached pathconf information.
2262 */
2263 if (info != NULL)
2264 kmem_free(info, sizeof (*info));
2265 }
2266
2267 /*
2268 * Return a vnode for the given NFS Version 2 file handle.
2269 * If no rnode exists for this fhandle, create one and put it
2270 * into the hash queues. If the rnode for this fhandle
2271 * already exists, return it.
2272 *
2273 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2274 */
2275 vnode_t *
2276 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2277 hrtime_t t, cred_t *cr, char *dnm, char *nm)
2278 {
2279 int newnode;
2280 int index;
2281 vnode_t *vp;
2282 nfs_fhandle nfh;
2283 vattr_t va;
2284
2285 nfh.fh_len = NFS_FHSIZE;
2286 bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2287
2288 index = rtablehash(&nfh);
2289 rw_enter(&rtable[index].r_lock, RW_READER);
2290
2291 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2292 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2293
2294 if (attr != NULL) {
2295 if (!newnode) {
2296 rw_exit(&rtable[index].r_lock);
2297 (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2298 } else {
2299 if (attr->na_type < NFNON || attr->na_type > NFSOC)
2300 vp->v_type = VBAD;
2301 else
2302 vp->v_type = n2v_type(attr);
2303 /*
2304 * A translation here seems to be necessary
2305 * because this function can be called
2306 * with `attr' that has come from the wire,
2307 * and been operated on by vattr_to_nattr().
2308 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2309 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2310 * ->makenfsnode().
2311 */
2312 if ((attr->na_rdev & 0xffff0000) == 0)
2313 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2314 else
2315 vp->v_rdev = expldev(n2v_rdev(attr));
2316 nfs_attrcache(vp, attr, t);
2317 rw_exit(&rtable[index].r_lock);
2318 }
2319 } else {
2320 if (newnode) {
2321 PURGE_ATTRCACHE(vp);
2322 }
2323 rw_exit(&rtable[index].r_lock);
2324 }
2325
2326 return (vp);
2327 }
2328
2329 /*
2330 * Return a vnode for the given NFS Version 3 file handle.
2331 * If no rnode exists for this fhandle, create one and put it
2332 * into the hash queues. If the rnode for this fhandle
2333 * already exists, return it.
2334 *
2335 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2336 */
2337 vnode_t *
2338 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2339 cred_t *cr, char *dnm, char *nm)
2340 {
2341 int newnode;
2342 int index;
2343 vnode_t *vp;
2344
2345 index = rtablehash((nfs_fhandle *)fh);
2346 rw_enter(&rtable[index].r_lock, RW_READER);
2347
2348 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2349 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2350 dnm, nm);
2351
2352 if (vap == NULL) {
2353 if (newnode) {
2354 PURGE_ATTRCACHE(vp);
2355 }
2356 rw_exit(&rtable[index].r_lock);
2357 return (vp);
2358 }
2359
2360 if (!newnode) {
2361 rw_exit(&rtable[index].r_lock);
2362 nfs_attr_cache(vp, vap, t, cr);
2363 } else {
2364 rnode_t *rp = VTOR(vp);
2365
2366 vp->v_type = vap->va_type;
2367 vp->v_rdev = vap->va_rdev;
2368
2369 mutex_enter(&rp->r_statelock);
2370 if (rp->r_mtime <= t)
2371 nfs_attrcache_va(vp, vap);
2372 mutex_exit(&rp->r_statelock);
2373 rw_exit(&rtable[index].r_lock);
2374 }
2375
2376 return (vp);
2377 }
2378
2379 vnode_t *
2380 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2381 cred_t *cr, char *dnm, char *nm)
2382 {
2383 int newnode;
2384 int index;
2385 vnode_t *vp;
2386 vattr_t va;
2387
2388 index = rtablehash((nfs_fhandle *)fh);
2389 rw_enter(&rtable[index].r_lock, RW_READER);
2390
2391 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2392 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2393 dnm, nm);
2394
2395 if (attr == NULL) {
2396 if (newnode) {
2397 PURGE_ATTRCACHE(vp);
2398 }
2399 rw_exit(&rtable[index].r_lock);
2400 return (vp);
2401 }
2402
2403 if (!newnode) {
2404 rw_exit(&rtable[index].r_lock);
2405 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2406 } else {
2407 if (attr->type < NF3REG || attr->type > NF3FIFO)
2408 vp->v_type = VBAD;
2409 else
2410 vp->v_type = nf3_to_vt[attr->type];
2411 vp->v_rdev = makedevice(attr->rdev.specdata1,
2412 attr->rdev.specdata2);
2413 nfs3_attrcache(vp, attr, t);
2414 rw_exit(&rtable[index].r_lock);
2415 }
2416
2417 return (vp);
2418 }
2419
2420 /*
2421 * Read this comment before making changes to rtablehash()!
2422 * This is a hash function in which seemingly obvious and harmless
2423 * changes can cause escalations costing millions of dollars!
2424 * Know what you are doing.
2425 *
2426 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
2427 * algorithm is currently detailed here:
2428 *
2429 * http://burtleburtle.net/bob/hash/doobs.html
2430 *
2431 * Of course, the above link may not be valid by the time you are reading
2432 * this, but suffice it to say that the one-at-a-time algorithm works well in
2433 * almost all cases. If you are changing the algorithm, be sure to verify
2434 * that the hash still provides an even distribution in all cases and with
2435 * any server returning filehandles in whatever order (sequential or random).
2436 */
2437 static int
2438 rtablehash(nfs_fhandle *fh)
2439 {
2440 ulong_t hash, len, i;
2441 char *key;
2442
2443 key = fh->fh_buf;
2444 len = (ulong_t)fh->fh_len;
2445 for (hash = 0, i = 0; i < len; i++) {
2446 hash += key[i];
2447 hash += (hash << 10);
2448 hash ^= (hash >> 6);
2449 }
2450 hash += (hash << 3);
2451 hash ^= (hash >> 11);
2452 hash += (hash << 15);
2453 return (hash & rtablemask);
2454 }
2455
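/*
 * Common worker for makenfsnode() and makenfs3node*(): look the filehandle
 * up in the given hash bucket and return the existing rnode if one is
 * found; otherwise recycle an rnode from the freelist (or allocate a new
 * one), initialize it, and add it to the hash queue.  *newnode is set to
 * 1 only when a new rnode was created.  Called with the bucket's r_lock
 * held as reader; the lock may be dropped and reacquired (possibly as
 * writer) and is held on return.
 */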
2456 static vnode_t *
2457 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2458 struct vnodeops *vops,
2459 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2460 int (*compar)(const void *, const void *),
2461 int *newnode, cred_t *cr, char *dnm, char *nm)
2462 {
2463 rnode_t *rp;
2464 rnode_t *trp;
2465 vnode_t *vp;
2466 mntinfo_t *mi;
2467
2468 ASSERT(RW_READ_HELD(&rhtp->r_lock));
2469
2470 mi = VFTOMI(vfsp);
2471 start:
2472 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2473 vp = RTOV(rp);
2474 nfs_set_vroot(vp);
2475 *newnode = 0;
2476 return (vp);
2477 }
2478 rw_exit(&rhtp->r_lock);
2479
2480 mutex_enter(&rpfreelist_lock);
2481 if (rpfreelist != NULL && rnew >= nrnode) {
2482 rp = rpfreelist;
2483 rp_rmfree(rp);
2484 mutex_exit(&rpfreelist_lock);
2485
2486 vp = RTOV(rp);
2487
2488 if (rp->r_flags & RHASHED) {
2489 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2490 mutex_enter(&vp->v_lock);
2491 if (vp->v_count > 1) {
2492 vp->v_count--;
2493 mutex_exit(&vp->v_lock);
2494 rw_exit(&rp->r_hashq->r_lock);
2495 rw_enter(&rhtp->r_lock, RW_READER);
2496 goto start;
2497 }
2498 mutex_exit(&vp->v_lock);
2499 rp_rmhash_locked(rp);
2500 rw_exit(&rp->r_hashq->r_lock);
2501 }
2502
2503 rinactive(rp, cr);
2504
2505 mutex_enter(&vp->v_lock);
2506 if (vp->v_count > 1) {
2507 vp->v_count--;
2508 mutex_exit(&vp->v_lock);
2509 rw_enter(&rhtp->r_lock, RW_READER);
2510 goto start;
2511 }
2512 mutex_exit(&vp->v_lock);
2513 vn_invalid(vp);
2514 /*
2515 * destroy old locks before bzero'ing and
2516 * recreating the locks below.
2517 */
2518 nfs_rw_destroy(&rp->r_rwlock);
2519 nfs_rw_destroy(&rp->r_lkserlock);
2520 mutex_destroy(&rp->r_statelock);
2521 cv_destroy(&rp->r_cv);
2522 cv_destroy(&rp->r_commit.c_cv);
2523 nfs_free_r_path(rp);
2524 avl_destroy(&rp->r_dir);
2525 /*
2526 * Make sure that if rnode is recycled then
2527 * VFS count is decremented properly before
2528 * reuse.
2529 */
2530 VFS_RELE(vp->v_vfsp);
2531 vn_reinit(vp);
2532 } else {
2533 vnode_t *new_vp;
2534
2535 mutex_exit(&rpfreelist_lock);
2536
2537 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2538 new_vp = vn_alloc(KM_SLEEP);
2539
2540 atomic_inc_ulong((ulong_t *)&rnew);
2541 #ifdef DEBUG
2542 clstat_debug.nrnode.value.ui64++;
2543 #endif
2544 vp = new_vp;
2545 }
2546
2547 bzero(rp, sizeof (*rp));
2548 rp->r_vnode = vp;
2549 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2550 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2551 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2552 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2553 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2554 rp->r_fh.fh_len = fh->fh_len;
2555 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2556 rp->r_server = mi->mi_curr_serv;
2557 if (FAILOVER_MOUNT(mi)) {
2558 /*
2559 * If replicated servers, stash pathnames
2560 */
2561 if (dnm != NULL && nm != NULL) {
2562 char *s, *p;
2563 uint_t len;
2564
2565 len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2566 rp->r_path = kmem_alloc(len, KM_SLEEP);
2567 #ifdef DEBUG
2568 clstat_debug.rpath.value.ui64 += len;
2569 #endif
2570 s = rp->r_path;
2571 for (p = dnm; *p; p++)
2572 *s++ = *p;
2573 *s++ = '/';
2574 for (p = nm; *p; p++)
2575 *s++ = *p;
2576 *s = '\0';
2577 } else {
2578 /* special case for root */
2579 rp->r_path = kmem_alloc(2, KM_SLEEP);
2580 #ifdef DEBUG
2581 clstat_debug.rpath.value.ui64 += 2;
2582 #endif
2583 *rp->r_path = '.';
2584 *(rp->r_path + 1) = '\0';
2585 }
2586 }
2587 VFS_HOLD(vfsp);
2588 rp->r_putapage = putapage;
2589 rp->r_hashq = rhtp;
2590 rp->r_flags = RREADDIRPLUS;
2591 avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2592 offsetof(rddir_cache, tree));
2593 vn_setops(vp, vops);
2594 vp->v_data = (caddr_t)rp;
2595 vp->v_vfsp = vfsp;
2596 vp->v_type = VNON;
2597 vp->v_flag |= VMODSORT;
2598 nfs_set_vroot(vp);
2599
2600 /*
2601 * There is a race condition if someone else
2602 * allocates the rnode while no locks are held, so we
2603 * check again and recover if found.
2604 */
2605 rw_enter(&rhtp->r_lock, RW_WRITER);
2606 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2607 vp = RTOV(trp);
2608 nfs_set_vroot(vp);
2609 *newnode = 0;
2610 rw_exit(&rhtp->r_lock);
2611 rp_addfree(rp, cr);
2612 rw_enter(&rhtp->r_lock, RW_READER);
2613 return (vp);
2614 }
2615 rp_addhash(rp);
2616 *newnode = 1;
2617 return (vp);
2618 }
2619
2620 /*
2621 * Callback function to check if the page should be marked as
2622 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2623 */
2624 int
2625 nfs_setmod_check(page_t *pp)
2626 {
2627 if (pp->p_fsdata != C_NOCOMMIT) {
2628 pp->p_fsdata = C_NOCOMMIT;
2629 return (1);
2630 }
2631 return (0);
2632 }
2633
2634 static void
2635 nfs_set_vroot(vnode_t *vp)
2636 {
2637 rnode_t *rp;
2638 nfs_fhandle *rootfh;
2639
2640 rp = VTOR(vp);
2641 rootfh = &rp->r_server->sv_fhandle;
2642 if (rootfh->fh_len == rp->r_fh.fh_len &&
2643 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2644 if (!(vp->v_flag & VROOT)) {
2645 mutex_enter(&vp->v_lock);
2646 vp->v_flag |= VROOT;
2647 mutex_exit(&vp->v_lock);
2648 }
2649 }
2650 }
2651
2652 static void
2653 nfs_free_r_path(rnode_t *rp)
2654 {
2655 char *path;
2656 size_t len;
2657
2658 path = rp->r_path;
2659 if (path) {
2660 rp->r_path = NULL;
2661 len = strlen(path) + 1;
2662 kmem_free(path, len);
2663 #ifdef DEBUG
2664 clstat_debug.rpath.value.ui64 -= len;
2665 #endif
2666 }
2667 }
2668
2669 /*
2670 * Put an rnode on the free list.
2671 *
2672 * Rnodes which were allocated above and beyond the normal limit
2673 * are immediately freed.
2674 */
2675 void
2676 rp_addfree(rnode_t *rp, cred_t *cr)
2677 {
2678 vnode_t *vp;
2679 struct vfs *vfsp;
2680
2681 vp = RTOV(rp);
2682 ASSERT(vp->v_count >= 1);
2683 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2684
2685 /*
2686 * If we have too many rnodes allocated and there are no
2687 * references to this rnode, or if the rnode is no longer
2688 * accessible because it does not reside in the hash queues,
2689 * or if an i/o error occurred while writing to the file,
2690 * then just free it instead of putting it on the rnode
2691 * freelist.
2692 */
2693 vfsp = vp->v_vfsp;
2694 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2695 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2696 if (rp->r_flags & RHASHED) {
2697 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2698 mutex_enter(&vp->v_lock);
2699 if (vp->v_count > 1) {
2700 vp->v_count--;
2701 mutex_exit(&vp->v_lock);
2702 rw_exit(&rp->r_hashq->r_lock);
2703 return;
2704 }
2705 mutex_exit(&vp->v_lock);
2706 rp_rmhash_locked(rp);
2707 rw_exit(&rp->r_hashq->r_lock);
2708 }
2709
2710 rinactive(rp, cr);
2711
2712 /*
2713 * Recheck the vnode reference count. We need to
2714 * make sure that another reference has not been
2715 * acquired while we were not holding v_lock. The
2716 * rnode is not in the rnode hash queues, so the
2717 * only way for a reference to have been acquired
2718 * is for a VOP_PUTPAGE because the rnode was marked
2719 * with RDIRTY or for a modified page. This
2720 * reference may have been acquired before our call
2721 * to rinactive. The i/o may have been completed,
2722 * thus allowing rinactive to complete, but the
2723 * reference to the vnode may not have been released
2724 * yet. In any case, the rnode can not be destroyed
2725 * until the other references to this vnode have been
2726 * released. The other references will take care of
2727 * either destroying the rnode or placing it on the
2728 * rnode freelist. If there are no other references,
2729 * then the rnode may be safely destroyed.
2730 */
2731 mutex_enter(&vp->v_lock);
2732 if (vp->v_count > 1) {
2733 vp->v_count--;
2734 mutex_exit(&vp->v_lock);
2735 return;
2736 }
2737 mutex_exit(&vp->v_lock);
2738
2739 destroy_rnode(rp);
2740 return;
2741 }
2742
2743 /*
2744 * Lock the hash queue and then recheck the reference count
2745 * to ensure that no other threads have acquired a reference
2746 * to indicate that the rnode should not be placed on the
2747 * freelist. If another reference has been acquired, then
2748 * just release this one and let the other thread complete
2749 * the processing of adding this rnode to the freelist.
2750 */
2751 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2752
2753 mutex_enter(&vp->v_lock);
2754 if (vp->v_count > 1) {
2755 vp->v_count--;
2756 mutex_exit(&vp->v_lock);
2757 rw_exit(&rp->r_hashq->r_lock);
2758 return;
2759 }
2760 mutex_exit(&vp->v_lock);
2761
2762 /*
2763 * If there is no cached data or metadata for this file, then
2764 * put the rnode on the front of the freelist so that it will
2765 * be reused before other rnodes which may have cached data or
2766 * metadata associated with them.
2767 */
2768 mutex_enter(&rpfreelist_lock);
2769 if (rpfreelist == NULL) {
2770 rp->r_freef = rp;
2771 rp->r_freeb = rp;
2772 rpfreelist = rp;
2773 } else {
2774 rp->r_freef = rpfreelist;
2775 rp->r_freeb = rpfreelist->r_freeb;
2776 rpfreelist->r_freeb->r_freef = rp;
2777 rpfreelist->r_freeb = rp;
2778 if (!vn_has_cached_data(vp) &&
2779 !HAVE_RDDIR_CACHE(rp) &&
2780 rp->r_symlink.contents == NULL &&
2781 rp->r_secattr == NULL &&
2782 rp->r_pathconf == NULL)
2783 rpfreelist = rp;
2784 }
2785 mutex_exit(&rpfreelist_lock);
2786
2787 rw_exit(&rp->r_hashq->r_lock);
2788 }
2789
2790 /*
2791 * Remove an rnode from the free list.
2792 *
2793 * The caller must be holding rpfreelist_lock and the rnode
2794 * must be on the freelist.
2795 */
2796 static void
2797 rp_rmfree(rnode_t *rp)
2798 {
2799
2800 ASSERT(MUTEX_HELD(&rpfreelist_lock));
2801 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2802
2803 if (rp == rpfreelist) {
2804 rpfreelist = rp->r_freef;
2805 if (rp == rpfreelist)
2806 rpfreelist = NULL;
2807 }
2808
2809 rp->r_freeb->r_freef = rp->r_freef;
2810 rp->r_freef->r_freeb = rp->r_freeb;
2811
2812 rp->r_freef = rp->r_freeb = NULL;
2813 }
2814
2815 /*
2816 * Put a rnode in the hash table.
2817 *
2818 * The caller must be holding the exclusive hash queue lock.
2819 */
2820 static void
2821 rp_addhash(rnode_t *rp)
2822 {
2823 mntinfo_t *mi;
2824
2825 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2826 ASSERT(!(rp->r_flags & RHASHED));
2827
2828 rp->r_hashf = rp->r_hashq->r_hashf;
2829 rp->r_hashq->r_hashf = rp;
2830 rp->r_hashb = (rnode_t *)rp->r_hashq;
2831 rp->r_hashf->r_hashb = rp;
2832
2833 mutex_enter(&rp->r_statelock);
2834 rp->r_flags |= RHASHED;
2835 mutex_exit(&rp->r_statelock);
2836
2837 mi = VTOMI(RTOV(rp));
2838 mutex_enter(&mi->mi_rnodes_lock);
2839 list_insert_tail(&mi->mi_rnodes, rp);
2840 mutex_exit(&mi->mi_rnodes_lock);
2841 }
2842
2843 /*
2844 * Remove a rnode from the hash table.
2845 *
2846 * The caller must be holding the hash queue lock.
2847 */
2848 static void
2849 rp_rmhash_locked(rnode_t *rp)
2850 {
2851 mntinfo_t *mi;
2852
2853 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2854 ASSERT(rp->r_flags & RHASHED);
2855
2856 rp->r_hashb->r_hashf = rp->r_hashf;
2857 rp->r_hashf->r_hashb = rp->r_hashb;
2858
2859 mutex_enter(&rp->r_statelock);
2860 rp->r_flags &= ~RHASHED;
2861 mutex_exit(&rp->r_statelock);
2862
2863 mi = VTOMI(RTOV(rp));
2864 mutex_enter(&mi->mi_rnodes_lock);
2865 if (list_link_active(&rp->r_mi_link))
2866 list_remove(&mi->mi_rnodes, rp);
2867 mutex_exit(&mi->mi_rnodes_lock);
2868 }
2869
2870 /*
2871 * Remove a rnode from the hash table.
2872 *
2873 * The caller must not be holding the hash queue lock.
2874 */
2875 void
2876 rp_rmhash(rnode_t *rp)
2877 {
2878
2879 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2880 rp_rmhash_locked(rp);
2881 rw_exit(&rp->r_hashq->r_lock);
2882 }
2883
2884 /*
2885 * Lookup a rnode by fhandle.
2886 *
2887 * The caller must be holding the hash queue lock, either shared or exclusive.
2888 */
2889 static rnode_t *
2890 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2891 {
2892 rnode_t *rp;
2893 vnode_t *vp;
2894
2895 ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2896
2897 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2898 vp = RTOV(rp);
2899 if (vp->v_vfsp == vfsp &&
2900 rp->r_fh.fh_len == fh->fh_len &&
2901 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2902 /*
2903 * remove rnode from free list, if necessary.
2904 */
2905 if (rp->r_freef != NULL) {
2906 mutex_enter(&rpfreelist_lock);
2907 /*
2908 * If the rnode is on the freelist,
2909 * then remove it and use that reference
2910 * as the new reference. Otherwise, we
2911 * need to increment the reference count.
2912 */
2913 if (rp->r_freef != NULL) {
2914 rp_rmfree(rp);
2915 mutex_exit(&rpfreelist_lock);
2916 } else {
2917 mutex_exit(&rpfreelist_lock);
2918 VN_HOLD(vp);
2919 }
2920 } else
2921 VN_HOLD(vp);
2922 return (rp);
2923 }
2924 }
2925 return (NULL);
2926 }
2927
2928 /*
2929 * Return 1 if there is an active vnode belonging to this vfs in the
2930 * rtable cache.
2931 *
2932 * Several of these checks are done without holding the usual
2933 * locks. This is safe because destroy_rtable(), rp_addfree(),
2934 * etc. will redo the necessary checks before actually destroying
2935 * any rnodes.
2936 */
2937 int
2938 check_rtable(struct vfs *vfsp)
2939 {
2940 rnode_t *rp;
2941 vnode_t *vp;
2942 mntinfo_t *mi;
2943
2944 ASSERT(vfsp != NULL);
2945 mi = VFTOMI(vfsp);
2946
2947 mutex_enter(&mi->mi_rnodes_lock);
2948 for (rp = list_head(&mi->mi_rnodes); rp != NULL;
2949 rp = list_next(&mi->mi_rnodes, rp)) {
2950 vp = RTOV(rp);
2951
2952 if (rp->r_freef == NULL ||
2953 (vn_has_cached_data(vp) && (rp->r_flags & RDIRTY)) ||
2954 rp->r_count > 0) {
2955 mutex_exit(&mi->mi_rnodes_lock);
2956 return (1);
2957 }
2958 }
2959 mutex_exit(&mi->mi_rnodes_lock);
2960
2961 return (0);
2962 }
2963
2964 /*
2965 * Destroy inactive vnodes from the hash queues which belong to this
2966 * vfs. It is essential that we destroy all inactive vnodes during a
2967 * forced unmount as well as during a normal unmount.
2968 */
2969 void
2970 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2971 {
2972 rnode_t *rp;
2973 mntinfo_t *mi;
2974
2975 ASSERT(vfsp != NULL);
2976
2977 mi = VFTOMI(vfsp);
2978
2979 mutex_enter(&rpfreelist_lock);
2980 mutex_enter(&mi->mi_rnodes_lock);
2981 while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
2982 /*
2983 * If the rnode is no longer on the freelist it is not
2984 * ours and it will be handled by some other thread, so
2985 * skip it.
2986 */
2987 if (rp->r_freef == NULL)
2988 continue;
2989 mutex_exit(&mi->mi_rnodes_lock);
2990
2991 rp_rmfree(rp);
2992 mutex_exit(&rpfreelist_lock);
2993
2994 rp_rmhash(rp);
2995
2996 /*
2997 * This call to rp_addfree will end up destroying the
2998 * rnode, but in a safe way with the appropriate set
2999 * of checks done.
3000 */
3001 rp_addfree(rp, cr);
3002
3003 mutex_enter(&rpfreelist_lock);
3004 mutex_enter(&mi->mi_rnodes_lock);
3005 }
3006 mutex_exit(&mi->mi_rnodes_lock);
3007 mutex_exit(&rpfreelist_lock);
3008 }
3009
3010 /*
3011 * This routine destroys all the resources associated with the rnode
3012 * and then the rnode itself.
3013 */
3014 static void
3015 destroy_rnode(rnode_t *rp)
3016 {
3017 vnode_t *vp;
3018 vfs_t *vfsp;
3019
3020 vp = RTOV(rp);
3021 vfsp = vp->v_vfsp;
3022
3023 ASSERT(vp->v_count == 1);
3024 ASSERT(rp->r_count == 0);
3025 ASSERT(rp->r_lmpl == NULL);
3026 ASSERT(rp->r_mapcnt == 0);
3027 ASSERT(!(rp->r_flags & RHASHED));
3028 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3029 atomic_dec_ulong((ulong_t *)&rnew);
3030 #ifdef DEBUG
3031 clstat_debug.nrnode.value.ui64--;
3032 #endif
3033 nfs_rw_destroy(&rp->r_rwlock);
3034 nfs_rw_destroy(&rp->r_lkserlock);
3035 mutex_destroy(&rp->r_statelock);
3036 cv_destroy(&rp->r_cv);
3037 cv_destroy(&rp->r_commit.c_cv);
3038 if (rp->r_flags & RDELMAPLIST)
3039 list_destroy(&rp->r_indelmap);
3040 nfs_free_r_path(rp);
3041 avl_destroy(&rp->r_dir);
3042 vn_invalid(vp);
3043 vn_free(vp);
3044 kmem_cache_free(rnode_cache, rp);
3045 VFS_RELE(vfsp);
3046 }
3047
3048 /*
3049 * Flush all vnodes in this (or every) vfs.
3050 * Used by nfs_sync and by nfs_unmount.
3051 */
3052 void
3053 rflush(struct vfs *vfsp, cred_t *cr)
3054 {
3055 int index;
3056 rnode_t *rp;
3057 vnode_t *vp, **vplist;
3058 long num, cnt;
3059
3060 /*
3061 * Check to see whether there is anything to do.
3062 */
3063 num = rnew;
3064 if (num == 0)
3065 return;
3066
3067 /*
3068 * Allocate a slot for all currently active rnodes on the
3069 * supposition that they all may need flushing.
3070 */
3071 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3072 cnt = 0;
3073
3074 /*
3075 * If the vfs is known we can take a fast path and iterate only the
3076 * rnodes that belong to this vfs. This is much faster than the
3077 * traditional way of walking rtable (below) when there are many
3078 * rnodes that do not belong to our vfs.
3079 */
3080 if (vfsp != NULL) {
3081 mntinfo_t *mi = VFTOMI(vfsp);
3082
3083 mutex_enter(&mi->mi_rnodes_lock);
3084 for (rp = list_head(&mi->mi_rnodes); rp != NULL;
3085 rp = list_next(&mi->mi_rnodes, rp)) {
3086 vp = RTOV(rp);
3087 /*
3088 * Don't bother sync'ing a vp if it
3089 * is part of virtual swap device or
3090 * if VFS is read-only
3091 */
3092 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3093 continue;
3094 /*
3095 * If the vnode has pages and is marked as either dirty
3096 * or mmap'd, hold and add this vnode to the list of
3097 * vnodes to flush.
3098 */
3099 ASSERT(vp->v_vfsp == vfsp);
3100 if (vn_has_cached_data(vp) &&
3101 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3102 VN_HOLD(vp);
3103 vplist[cnt++] = vp;
3104 if (cnt == num) {
3105 /*
3106 * The vplist is full because there are
3107 * too many rnodes. We are done for
3108 * now.
3109 */
3110 break;
3111 }
3112 }
3113 }
3114 mutex_exit(&mi->mi_rnodes_lock);
3115
3116 goto done;
3117 }
3118
3119 ASSERT(vfsp == NULL);
3120
3121 /*
3122 * Walk the hash queues looking for rnodes with page
3123 * lists associated with them. Make a list of these
3124 * files.
3125 */
3126 for (index = 0; index < rtablesize; index++) {
3127 rw_enter(&rtable[index].r_lock, RW_READER);
3128 for (rp = rtable[index].r_hashf;
3129 rp != (rnode_t *)(&rtable[index]);
3130 rp = rp->r_hashf) {
3131 vp = RTOV(rp);
3132 /*
3133 * Don't bother sync'ing a vp if it
3134 * is part of virtual swap device or
3135 * if VFS is read-only
3136 */
3137 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3138 continue;
3139 /*
3140 * If the vnode has pages and is marked as either dirty
3141 * or mmap'd, hold and add this vnode to the list of
3142 * vnodes to flush.
3143 */
3144 if (vn_has_cached_data(vp) &&
3145 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3146 VN_HOLD(vp);
3147 vplist[cnt++] = vp;
3148 if (cnt == num) {
3149 rw_exit(&rtable[index].r_lock);
3150 /*
3151 * The vplist is full because there are
3152 * too many rnodes. We are done for
3153 * now.
3154 */
3155 goto done;
3156 }
3157 }
3158 }
3159 rw_exit(&rtable[index].r_lock);
3160 }
3161
3162 done:
3163
3164 /*
3165 * Flush and release all of the files on the list.
3166 */
3167 while (cnt-- > 0) {
3168 vp = vplist[cnt];
3169 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3170 VN_RELE(vp);
3171 }
3172
3173 /*
3174 * Free the space allocated to hold the list.
3175 */
3176 kmem_free(vplist, num * sizeof (*vplist));
3177 }
3178
3179 /*
3180 * This probably needs to be larger than or equal to
3181 * log2(sizeof (struct rnode)) due to the way that rnodes are
3182 * allocated.
3183 */
3184 #define ACACHE_SHIFT_BITS 9
3185
3186 static int
3187 acachehash(rnode_t *rp, cred_t *cr)
3188 {
3189
3190 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3191 acachemask);
3192 }
3193
3194 #ifdef DEBUG
3195 static long nfs_access_cache_hits = 0;
3196 static long nfs_access_cache_misses = 0;
3197 #endif
3198
3199 nfs_access_type_t
3200 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3201 {
3202 vnode_t *vp;
3203 acache_t *ap;
3204 acache_hash_t *hp;
3205 nfs_access_type_t all;
3206
3207 vp = RTOV(rp);
3208 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3209 return (NFS_ACCESS_UNKNOWN);
3210
3211 if (rp->r_acache != NULL) {
3212 hp = &acache[acachehash(rp, cr)];
3213 rw_enter(&hp->lock, RW_READER);
3214 ap = hp->next;
3215 while (ap != (acache_t *)hp) {
3216 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3217 if ((ap->known & acc) == acc) {
3218 #ifdef DEBUG
3219 nfs_access_cache_hits++;
3220 #endif
3221 if ((ap->allowed & acc) == acc)
3222 all = NFS_ACCESS_ALLOWED;
3223 else
3224 all = NFS_ACCESS_DENIED;
3225 } else {
3226 #ifdef DEBUG
3227 nfs_access_cache_misses++;
3228 #endif
3229 all = NFS_ACCESS_UNKNOWN;
3230 }
3231 rw_exit(&hp->lock);
3232 return (all);
3233 }
3234 ap = ap->next;
3235 }
3236 rw_exit(&hp->lock);
3237 }
3238
3239 #ifdef DEBUG
3240 nfs_access_cache_misses++;
3241 #endif
3242 return (NFS_ACCESS_UNKNOWN);
3243 }
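
/*
 * Illustrative sketch (not part of the original code, kept under #if 0):
 * the intended calling pattern is to consult the cache first and go over
 * the wire only on a miss, then record the server's answer with
 * nfs_access_cache().  The do_access_otw() helper and the surrounding
 * locals (vp, rp, acc, cr, resacc, error) are hypothetical.
 */
#if 0 /* illustrative example */
	switch (nfs_access_check(rp, acc, cr)) {
	case NFS_ACCESS_ALLOWED:
		return (0);
	case NFS_ACCESS_DENIED:
		return (EACCES);
	case NFS_ACCESS_UNKNOWN:
	default:
		error = do_access_otw(vp, acc, cr, &resacc); /* hypothetical */
		if (error == 0)
			nfs_access_cache(rp, acc, resacc, cr);
		break;
	}
#endif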
3244
3245 void
3246 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3247 {
3248 acache_t *ap;
3249 acache_t *nap;
3250 acache_hash_t *hp;
3251
3252 hp = &acache[acachehash(rp, cr)];
3253
3254 /*
3255 * Allocate now assuming that mostly an allocation will be
3256 * required. This allows the allocation to happen without
3257 * holding the hash bucket locked.
3258 */
3259 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3260 if (nap != NULL) {
3261 nap->known = acc;
3262 nap->allowed = resacc;
3263 nap->rnode = rp;
3264 crhold(cr);
3265 nap->cred = cr;
3266 nap->hashq = hp;
3267 }
3268
3269 rw_enter(&hp->lock, RW_WRITER);
3270
3271 if (rp->r_acache != NULL) {
3272 ap = hp->next;
3273 while (ap != (acache_t *)hp) {
3274 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3275 ap->known |= acc;
3276 ap->allowed &= ~acc;
3277 ap->allowed |= resacc;
3278 rw_exit(&hp->lock);
3279 if (nap != NULL) {
3280 crfree(nap->cred);
3281 kmem_cache_free(acache_cache, nap);
3282 }
3283 return;
3284 }
3285 ap = ap->next;
3286 }
3287 }
3288
3289 if (nap != NULL) {
3290 #ifdef DEBUG
3291 clstat_debug.access.value.ui64++;
3292 #endif
3293 nap->next = hp->next;
3294 hp->next = nap;
3295 nap->next->prev = nap;
3296 nap->prev = (acache_t *)hp;
3297
3298 mutex_enter(&rp->r_statelock);
3299 nap->list = rp->r_acache;
3300 rp->r_acache = nap;
3301 mutex_exit(&rp->r_statelock);
3302 }
3303
3304 rw_exit(&hp->lock);
3305 }
3306
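/*
 * Throw away all of the cached access information for this rnode.
 * Returns 1 if any entries were freed, 0 if the cache was already empty.
 */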
3307 int
3308 nfs_access_purge_rp(rnode_t *rp)
3309 {
3310 acache_t *ap;
3311 acache_t *tmpap;
3312 acache_t *rplist;
3313
3314 /*
3315 * If there aren't any cached entries, then there is nothing
3316 * to free.
3317 */
3318 if (rp->r_acache == NULL)
3319 return (0);
3320
3321 mutex_enter(&rp->r_statelock);
3322 rplist = rp->r_acache;
3323 rp->r_acache = NULL;
3324 mutex_exit(&rp->r_statelock);
3325
3326 /*
3327 * Loop through each entry in the list pointed to in the
3328 * rnode. Remove each of these entries from the hash
3329 * queue that it is on and remove it from the list in
3330 * the rnode.
3331 */
3332 for (ap = rplist; ap != NULL; ap = tmpap) {
3333 rw_enter(&ap->hashq->lock, RW_WRITER);
3334 ap->prev->next = ap->next;
3335 ap->next->prev = ap->prev;
3336 rw_exit(&ap->hashq->lock);
3337
3338 tmpap = ap->list;
3339 crfree(ap->cred);
3340 kmem_cache_free(acache_cache, ap);
3341 #ifdef DEBUG
3342 clstat_debug.access.value.ui64--;
3343 #endif
3344 }
3345
3346 return (1);
3347 }
3348
3349 static const char prefix[] = ".nfs";
3350
3351 static kmutex_t newnum_lock;
3352
3353 int
3354 newnum(void)
3355 {
3356 static uint_t newnum = 0;
3357 uint_t id;
3358
3359 mutex_enter(&newnum_lock);
3360 if (newnum == 0)
3361 newnum = gethrestime_sec() & 0xffff;
3362 id = newnum++;
3363 mutex_exit(&newnum_lock);
3364 return (id);
3365 }
3366
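/*
 * Generate a unique ".nfs"-prefixed name (used, for example, when an open
 * file is removed and has to be renamed rather than unlinked).  The id's
 * hex digits are emitted least-significant nibble first, so an id of
 * 0x1a2b produces ".nfsB2A1".
 */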
3367 char *
3368 newname(void)
3369 {
3370 char *news;
3371 char *s;
3372 const char *p;
3373 uint_t id;
3374
3375 id = newnum();
3376 news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3377 s = news;
3378 p = prefix;
3379 while (*p != '\0')
3380 *s++ = *p++;
3381 while (id != 0) {
3382 *s++ = "0123456789ABCDEF"[id & 0x0f];
3383 id >>= 4;
3384 }
3385 *s = '\0';
3386 return (news);
3387 }
3388
3389 /*
3390 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3391 * framework.
3392 */
3393 static int
3394 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3395 {
3396 ksp->ks_snaptime = gethrtime();
3397 if (rw == KSTAT_WRITE) {
3398 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3399 #ifdef DEBUG
3400 /*
3401 * Currently only the global zone can write to kstats, but we
3402 * add the check just for paranoia.
3403 */
3404 if (INGLOBALZONE(curproc))
3405 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3406 sizeof (clstat_debug));
3407 #endif
3408 } else {
3409 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3410 #ifdef DEBUG
3411 /*
3412 * If we're displaying the "global" debug kstat values, we
3413 * display them as-is to all zones since in fact they apply to
3414 * the system as a whole.
3415 */
3416 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3417 sizeof (clstat_debug));
3418 #endif
3419 }
3420 return (0);
3421 }
3422
3423 static void *
3424 clinit_zone(zoneid_t zoneid)
3425 {
3426 kstat_t *nfs_client_kstat;
3427 struct nfs_clnt *nfscl;
3428 uint_t ndata;
3429
3430 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3431 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3432 nfscl->nfscl_chtable = NULL;
3433 nfscl->nfscl_zoneid = zoneid;
3434
3435 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3436 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3437 #ifdef DEBUG
3438 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3439 #endif
3440 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3441 "misc", KSTAT_TYPE_NAMED, ndata,
3442 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3443 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3444 nfs_client_kstat->ks_snapshot = cl_snapshot;
3445 kstat_install(nfs_client_kstat);
3446 }
3447 mutex_enter(&nfs_clnt_list_lock);
3448 list_insert_head(&nfs_clnt_list, nfscl);
3449 mutex_exit(&nfs_clnt_list_lock);
3450 return (nfscl);
3451 }
3452
3453 /*ARGSUSED*/
3454 static void
3455 clfini_zone(zoneid_t zoneid, void *arg)
3456 {
3457 struct nfs_clnt *nfscl = arg;
3458 chhead_t *chp, *next;
3459
3460 if (nfscl == NULL)
3461 return;
3462 mutex_enter(&nfs_clnt_list_lock);
3463 list_remove(&nfs_clnt_list, nfscl);
3464 mutex_exit(&nfs_clnt_list_lock);
3465 clreclaim_zone(nfscl, 0);
3466 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3467 ASSERT(chp->ch_list == NULL);
3468 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3469 next = chp->ch_next;
3470 kmem_free(chp, sizeof (*chp));
3471 }
3472 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3473 mutex_destroy(&nfscl->nfscl_chtable_lock);
3474 kmem_free(nfscl, sizeof (*nfscl));
3475 }
3476
3477 /*
3478 * Called by endpnt_destructor to make sure the client handles are
3479 * cleaned up before the RPC endpoints. This becomes a no-op if
3480 * clfini_zone (above) is called first. This function is needed
3481 * (rather than relying on clfini_zone to clean up) because the ZSD
3482 * callbacks have no ordering mechanism, so we have no way to ensure
3483 * that clfini_zone is called before endpnt_destructor.
3484 */
3485 void
3486 clcleanup_zone(zoneid_t zoneid)
3487 {
3488 struct nfs_clnt *nfscl;
3489
3490 mutex_enter(&nfs_clnt_list_lock);
3491 nfscl = list_head(&nfs_clnt_list);
3492 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3493 if (nfscl->nfscl_zoneid == zoneid) {
3494 clreclaim_zone(nfscl, 0);
3495 break;
3496 }
3497 }
3498 mutex_exit(&nfs_clnt_list_lock);
3499 }
3500
3501 int
3502 nfs_subrinit(void)
3503 {
3504 int i;
3505 ulong_t nrnode_max;
3506
3507 /*
3508 * Allocate and initialize the rnode hash queues
3509 */
3510 if (nrnode <= 0)
3511 nrnode = ncsize;
3512 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3513 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3514 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3515 "!setting nrnode to max value of %ld", nrnode_max);
3516 nrnode = nrnode_max;
3517 }
3518
3519 rtablesize = 1 << highbit(nrnode / hashlen);
3520 rtablemask = rtablesize - 1;
3521 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3522 for (i = 0; i < rtablesize; i++) {
3523 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3524 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3525 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3526 }
3527 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3528 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3529
3530 /*
3531 * Allocate and initialize the access cache
3532 */
3533
3534 /*
3535 * The initial guess is one access cache entry per rnode, unless
3536 * nacache is set to a non-zero value, in which case nacache is
3537 * used as the guess at the number of access cache entries.
3538 */
3539 if (nacache > 0)
3540 acachesize = 1 << highbit(nacache / hashlen);
3541 else
3542 acachesize = rtablesize;
3543 acachemask = acachesize - 1;
3544 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3545 for (i = 0; i < acachesize; i++) {
3546 acache[i].next = (acache_t *)&acache[i];
3547 acache[i].prev = (acache_t *)&acache[i];
3548 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3549 }
3550 acache_cache = kmem_cache_create("nfs_access_cache",
3551 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3552 /*
3553 * Allocate and initialize the client handle cache
3554 */
3555 chtab_cache = kmem_cache_create("client_handle_cache",
3556 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3557 /*
3558 * Initialize the list of per-zone client handles (and associated data).
3559 * This needs to be done before we call zone_key_create().
3560 */
3561 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3562 offsetof(struct nfs_clnt, nfscl_node));
3563 /*
3564 * Initialize the zone_key for per-zone client handle lists.
3565 */
3566 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3567 /*
3568 * Initialize the various mutexes and reader/writer locks
3569 */
3570 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3571 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3572 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3573
3574 /*
3575 * Assign unique major number for all nfs mounts
3576 */
3577 if ((nfs_major = getudev()) == -1) {
3578 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3579 "nfs: init: can't get unique device number");
3580 nfs_major = 0;
3581 }
3582 nfs_minor = 0;
3583
3584 if (nfs3_jukebox_delay == 0)
3585 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3586
3587 return (0);
3588 }
3589
3590 void
3591 nfs_subrfini(void)
3592 {
3593 int i;
3594
3595 /*
3596 * Deallocate the rnode hash queues
3597 */
3598 kmem_cache_destroy(rnode_cache);
3599
3600 for (i = 0; i < rtablesize; i++)
3601 rw_destroy(&rtable[i].r_lock);
3602 kmem_free(rtable, rtablesize * sizeof (*rtable));
3603
3604 /*
3605 * Deallocate the access cache
3606 */
3607 kmem_cache_destroy(acache_cache);
3608
3609 for (i = 0; i < acachesize; i++)
3610 rw_destroy(&acache[i].lock);
3611 kmem_free(acache, acachesize * sizeof (*acache));
3612
3613 /*
3614 * Deallocate the client handle cache
3615 */
3616 kmem_cache_destroy(chtab_cache);
3617
3618 /*
3619 * Destroy the various mutexes and reader/writer locks
3620 */
3621 mutex_destroy(&rpfreelist_lock);
3622 mutex_destroy(&newnum_lock);
3623 mutex_destroy(&nfs_minor_lock);
3624 (void) zone_key_delete(nfsclnt_zone_key);
3625 }
3626
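/*
 * Map a local errno value to the equivalent NFS Version 2 status.  Most
 * errno values map directly onto the corresponding nfsstat values; the
 * explicit cases handle those that do not.
 */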
3627 enum nfsstat
3628 puterrno(int error)
3629 {
3630
3631 switch (error) {
3632 case EOPNOTSUPP:
3633 return (NFSERR_OPNOTSUPP);
3634 case ENAMETOOLONG:
3635 return (NFSERR_NAMETOOLONG);
3636 case ENOTEMPTY:
3637 return (NFSERR_NOTEMPTY);
3638 case EDQUOT:
3639 return (NFSERR_DQUOT);
3640 case ESTALE:
3641 return (NFSERR_STALE);
3642 case EREMOTE:
3643 return (NFSERR_REMOTE);
3644 case ENOSYS:
3645 return (NFSERR_OPNOTSUPP);
3646 case EOVERFLOW:
3647 return (NFSERR_INVAL);
3648 default:
3649 return ((enum nfsstat)error);
3650 }
3651 /* NOTREACHED */
3652 }
3653
3654 int
3655 geterrno(enum nfsstat status)
3656 {
3657
3658 switch (status) {
3659 case NFSERR_OPNOTSUPP:
3660 return (EOPNOTSUPP);
3661 case NFSERR_NAMETOOLONG:
3662 return (ENAMETOOLONG);
3663 case NFSERR_NOTEMPTY:
3664 return (ENOTEMPTY);
3665 case NFSERR_DQUOT:
3666 return (EDQUOT);
3667 case NFSERR_STALE:
3668 return (ESTALE);
3669 case NFSERR_REMOTE:
3670 return (EREMOTE);
3671 case NFSERR_WFLUSH:
3672 return (EIO);
3673 default:
3674 return ((int)status);
3675 }
3676 /* NOTREACHED */
3677 }
3678
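/*
 * Map a local errno value to the equivalent NFS Version 3 status.  DEBUG
 * kernels list every expected errno and warn about anything unexpected;
 * non-DEBUG kernels translate only the values that need it and pass the
 * rest through unchanged.
 */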
3679 enum nfsstat3
3680 puterrno3(int error)
3681 {
3682
3683 #ifdef DEBUG
3684 switch (error) {
3685 case 0:
3686 return (NFS3_OK);
3687 case EPERM:
3688 return (NFS3ERR_PERM);
3689 case ENOENT:
3690 return (NFS3ERR_NOENT);
3691 case EIO:
3692 return (NFS3ERR_IO);
3693 case ENXIO:
3694 return (NFS3ERR_NXIO);
3695 case EACCES:
3696 return (NFS3ERR_ACCES);
3697 case EEXIST:
3698 return (NFS3ERR_EXIST);
3699 case EXDEV:
3700 return (NFS3ERR_XDEV);
3701 case ENODEV:
3702 return (NFS3ERR_NODEV);
3703 case ENOTDIR:
3704 return (NFS3ERR_NOTDIR);
3705 case EISDIR:
3706 return (NFS3ERR_ISDIR);
3707 case EINVAL:
3708 return (NFS3ERR_INVAL);
3709 case EFBIG:
3710 return (NFS3ERR_FBIG);
3711 case ENOSPC:
3712 return (NFS3ERR_NOSPC);
3713 case EROFS:
3714 return (NFS3ERR_ROFS);
3715 case EMLINK:
3716 return (NFS3ERR_MLINK);
3717 case ENAMETOOLONG:
3718 return (NFS3ERR_NAMETOOLONG);
3719 case ENOTEMPTY:
3720 return (NFS3ERR_NOTEMPTY);
3721 case EDQUOT:
3722 return (NFS3ERR_DQUOT);
3723 case ESTALE:
3724 return (NFS3ERR_STALE);
3725 case EREMOTE:
3726 return (NFS3ERR_REMOTE);
3727 case ENOSYS:
3728 case EOPNOTSUPP:
3729 return (NFS3ERR_NOTSUPP);
3730 case EOVERFLOW:
3731 return (NFS3ERR_INVAL);
3732 default:
3733 zcmn_err(getzoneid(), CE_WARN,
3734 "puterrno3: got error %d", error);
3735 return ((enum nfsstat3)error);
3736 }
3737 #else
3738 switch (error) {
3739 case ENAMETOOLONG:
3740 return (NFS3ERR_NAMETOOLONG);
3741 case ENOTEMPTY:
3742 return (NFS3ERR_NOTEMPTY);
3743 case EDQUOT:
3744 return (NFS3ERR_DQUOT);
3745 case ESTALE:
3746 return (NFS3ERR_STALE);
3747 case ENOSYS:
3748 case EOPNOTSUPP:
3749 return (NFS3ERR_NOTSUPP);
3750 case EREMOTE:
3751 return (NFS3ERR_REMOTE);
3752 case EOVERFLOW:
3753 return (NFS3ERR_INVAL);
3754 default:
3755 return ((enum nfsstat3)error);
3756 }
3757 #endif
3758 }
3759
3760 int
3761 geterrno3(enum nfsstat3 status)
3762 {
3763
3764 #ifdef DEBUG
3765 switch (status) {
3766 case NFS3_OK:
3767 return (0);
3768 case NFS3ERR_PERM:
3769 return (EPERM);
3770 case NFS3ERR_NOENT:
3771 return (ENOENT);
3772 case NFS3ERR_IO:
3773 return (EIO);
3774 case NFS3ERR_NXIO:
3775 return (ENXIO);
3776 case NFS3ERR_ACCES:
3777 return (EACCES);
3778 case NFS3ERR_EXIST:
3779 return (EEXIST);
3780 case NFS3ERR_XDEV:
3781 return (EXDEV);
3782 case NFS3ERR_NODEV:
3783 return (ENODEV);
3784 case NFS3ERR_NOTDIR:
3785 return (ENOTDIR);
3786 case NFS3ERR_ISDIR:
3787 return (EISDIR);
3788 case NFS3ERR_INVAL:
3789 return (EINVAL);
3790 case NFS3ERR_FBIG:
3791 return (EFBIG);
3792 case NFS3ERR_NOSPC:
3793 return (ENOSPC);
3794 case NFS3ERR_ROFS:
3795 return (EROFS);
3796 case NFS3ERR_MLINK:
3797 return (EMLINK);
3798 case NFS3ERR_NAMETOOLONG:
3799 return (ENAMETOOLONG);
3800 case NFS3ERR_NOTEMPTY:
3801 return (ENOTEMPTY);
3802 case NFS3ERR_DQUOT:
3803 return (EDQUOT);
3804 case NFS3ERR_STALE:
3805 return (ESTALE);
3806 case NFS3ERR_REMOTE:
3807 return (EREMOTE);
3808 case NFS3ERR_BADHANDLE:
3809 return (ESTALE);
3810 case NFS3ERR_NOT_SYNC:
3811 return (EINVAL);
3812 case NFS3ERR_BAD_COOKIE:
3813 return (ENOENT);
3814 case NFS3ERR_NOTSUPP:
3815 return (EOPNOTSUPP);
3816 case NFS3ERR_TOOSMALL:
3817 return (EINVAL);
3818 case NFS3ERR_SERVERFAULT:
3819 return (EIO);
3820 case NFS3ERR_BADTYPE:
3821 return (EINVAL);
3822 case NFS3ERR_JUKEBOX:
3823 return (ENXIO);
3824 default:
3825 zcmn_err(getzoneid(), CE_WARN,
3826 "geterrno3: got status %d", status);
3827 return ((int)status);
3828 }
3829 #else
3830 switch (status) {
3831 case NFS3ERR_NAMETOOLONG:
3832 return (ENAMETOOLONG);
3833 case NFS3ERR_NOTEMPTY:
3834 return (ENOTEMPTY);
3835 case NFS3ERR_DQUOT:
3836 return (EDQUOT);
3837 case NFS3ERR_STALE:
3838 case NFS3ERR_BADHANDLE:
3839 return (ESTALE);
3840 case NFS3ERR_NOTSUPP:
3841 return (EOPNOTSUPP);
3842 case NFS3ERR_REMOTE:
3843 return (EREMOTE);
3844 case NFS3ERR_NOT_SYNC:
3845 case NFS3ERR_TOOSMALL:
3846 case NFS3ERR_BADTYPE:
3847 return (EINVAL);
3848 case NFS3ERR_BAD_COOKIE:
3849 return (ENOENT);
3850 case NFS3ERR_SERVERFAULT:
3851 return (EIO);
3852 case NFS3ERR_JUKEBOX:
3853 return (ENXIO);
3854 default:
3855 return ((int)status);
3856 }
3857 #endif
3858 }
3859
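/*
 * Reference-counted readdir cache entries: rddir_cache_alloc() returns an
 * entry with a count of one, additional references are taken with
 * rddir_cache_hold(), and rddir_cache_rele() frees the entry when the
 * last reference is dropped.
 */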
3860 rddir_cache *
3861 rddir_cache_alloc(int flags)
3862 {
3863 rddir_cache *rc;
3864
3865 rc = kmem_alloc(sizeof (*rc), flags);
3866 if (rc != NULL) {
3867 rc->entries = NULL;
3868 rc->flags = RDDIR;
3869 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3870 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3871 rc->count = 1;
3872 #ifdef DEBUG
3873 atomic_inc_64(&clstat_debug.dirent.value.ui64);
3874 #endif
3875 }
3876 return (rc);
3877 }
3878
3879 static void
3880 rddir_cache_free(rddir_cache *rc)
3881 {
3882
3883 #ifdef DEBUG
3884 atomic_dec_64(&clstat_debug.dirent.value.ui64);
3885 #endif
3886 if (rc->entries != NULL) {
3887 #ifdef DEBUG
3888 rddir_cache_buf_free(rc->entries, rc->buflen);
3889 #else
3890 kmem_free(rc->entries, rc->buflen);
3891 #endif
3892 }
3893 cv_destroy(&rc->cv);
3894 mutex_destroy(&rc->lock);
3895 kmem_free(rc, sizeof (*rc));
3896 }
3897
3898 void
3899 rddir_cache_hold(rddir_cache *rc)
3900 {
3901
3902 mutex_enter(&rc->lock);
3903 rc->count++;
3904 mutex_exit(&rc->lock);
3905 }
3906
3907 void
3908 rddir_cache_rele(rddir_cache *rc)
3909 {
3910
3911 mutex_enter(&rc->lock);
3912 ASSERT(rc->count > 0);
3913 if (--rc->count == 0) {
3914 mutex_exit(&rc->lock);
3915 rddir_cache_free(rc);
3916 } else
3917 mutex_exit(&rc->lock);
3918 }
3919
3920 #ifdef DEBUG
3921 char *
3922 rddir_cache_buf_alloc(size_t size, int flags)
3923 {
3924 char *rc;
3925
3926 rc = kmem_alloc(size, flags);
3927 if (rc != NULL)
3928 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3929 return (rc);
3930 }
3931
3932 void
3933 rddir_cache_buf_free(void *addr, size_t size)
3934 {
3935
3936 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3937 kmem_free(addr, size);
3938 }
3939 #endif
3940
3941 static int
3942 nfs_free_data_reclaim(rnode_t *rp)
3943 {
3944 char *contents;
3945 int size;
3946 vsecattr_t *vsp;
3947 nfs3_pathconf_info *info;
3948 int freed;
3949 cred_t *cred;
3950
3951 /*
3952 * Free any held credentials and caches which
3953 * may be associated with this rnode.
3954 */
3955 mutex_enter(&rp->r_statelock);
3956 cred = rp->r_cred;
3957 rp->r_cred = NULL;
3958 contents = rp->r_symlink.contents;
3959 size = rp->r_symlink.size;
3960 rp->r_symlink.contents = NULL;
3961 vsp = rp->r_secattr;
3962 rp->r_secattr = NULL;
3963 info = rp->r_pathconf;
3964 rp->r_pathconf = NULL;
3965 mutex_exit(&rp->r_statelock);
3966
3967 if (cred != NULL)
3968 crfree(cred);
3969
3970 /*
3971 * Free the access cache entries.
3972 */
3973 freed = nfs_access_purge_rp(rp);
3974
3975 if (!HAVE_RDDIR_CACHE(rp) &&
3976 contents == NULL &&
3977 vsp == NULL &&
3978 info == NULL)
3979 return (freed);
3980
3981 /*
3982 * Free the readdir cache entries
3983 */
3984 if (HAVE_RDDIR_CACHE(rp))
3985 nfs_purge_rddir_cache(RTOV(rp));
3986
3987 /*
3988 * Free the symbolic link cache.
3989 */
3990 if (contents != NULL) {
3991
3992 kmem_free((void *)contents, size);
3993 }
3994
3995 /*
3996 * Free any cached ACL.
3997 */
3998 if (vsp != NULL)
3999 nfs_acl_free(vsp);
4000
4001 /*
4002 * Free any cached pathconf information.
4003 */
4004 if (info != NULL)
4005 kmem_free(info, sizeof (*info));
4006
4007 return (1);
4008 }
4009
4010 static int
4011 nfs_active_data_reclaim(rnode_t *rp)
4012 {
4013 char *contents;
4014 int size;
4015 vsecattr_t *vsp;
4016 nfs3_pathconf_info *info;
4017 int freed;
4018
4019 /*
4020 * Free any held credentials and caches which
4021 * may be associated with this rnode.
4022 */
4023 if (!mutex_tryenter(&rp->r_statelock))
4024 return (0);
4025 contents = rp->r_symlink.contents;
4026 size = rp->r_symlink.size;
4027 rp->r_symlink.contents = NULL;
4028 vsp = rp->r_secattr;
4029 rp->r_secattr = NULL;
4030 info = rp->r_pathconf;
4031 rp->r_pathconf = NULL;
4032 mutex_exit(&rp->r_statelock);
4033
4034 /*
4035 * Free the access cache entries.
4036 */
4037 freed = nfs_access_purge_rp(rp);
4038
4039 if (!HAVE_RDDIR_CACHE(rp) &&
4040 contents == NULL &&
4041 vsp == NULL &&
4042 info == NULL)
4043 return (freed);
4044
4045 /*
4046 * Free the readdir cache entries
4047 */
4048 if (HAVE_RDDIR_CACHE(rp))
4049 nfs_purge_rddir_cache(RTOV(rp));
4050
4051 /*
4052 * Free the symbolic link cache.
4053 */
4054 if (contents != NULL) {
4055
4056 kmem_free((void *)contents, size);
4057 }
4058
4059 /*
4060 * Free any cached ACL.
4061 */
4062 if (vsp != NULL)
4063 nfs_acl_free(vsp);
4064
4065 /*
4066 * Free any cached pathconf information.
4067 */
4068 if (info != NULL)
4069 kmem_free(info, sizeof (*info));
4070
4071 return (1);
4072 }
4073
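/*
 * Walk the rnode freelist and reclaim the cached data attached to
 * each entry.  Returns non-zero if any memory was freed.
 */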
4074 static int
4075 nfs_free_reclaim(void)
4076 {
4077 int freed;
4078 rnode_t *rp;
4079
4080 #ifdef DEBUG
4081 clstat_debug.f_reclaim.value.ui64++;
4082 #endif
4083 freed = 0;
4084 mutex_enter(&rpfreelist_lock);
4085 rp = rpfreelist;
4086 if (rp != NULL) {
4087 do {
4088 if (nfs_free_data_reclaim(rp))
4089 freed = 1;
4090 } while ((rp = rp->r_freef) != rpfreelist);
4091 }
4092 mutex_exit(&rpfreelist_lock);
4093 return (freed);
4094 }
4095
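/*
 * Walk every bucket of the rnode hash table and reclaim the cached
 * data attached to each active rnode.  Returns non-zero if any
 * memory was freed.
 */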
4096 static int
4097 nfs_active_reclaim(void)
4098 {
4099 int freed;
4100 int index;
4101 rnode_t *rp;
4102
4103 #ifdef DEBUG
4104 clstat_debug.a_reclaim.value.ui64++;
4105 #endif
4106 freed = 0;
4107 for (index = 0; index < rtablesize; index++) {
4108 rw_enter(&rtable[index].r_lock, RW_READER);
4109 for (rp = rtable[index].r_hashf;
4110 rp != (rnode_t *)(&rtable[index]);
4111 rp = rp->r_hashf) {
4112 if (nfs_active_data_reclaim(rp))
4113 freed = 1;
4114 }
4115 rw_exit(&rtable[index].r_lock);
4116 }
4117 return (freed);
4118 }
4119
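/*
 * Pull rnodes off the freelist, remove them from the hash queues,
 * and let rp_addfree() destroy them (and their vnodes) so that the
 * memory can be returned to the system.
 */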
4120 static int
4121 nfs_rnode_reclaim(void)
4122 {
4123 int freed;
4124 rnode_t *rp;
4125 vnode_t *vp;
4126
4127 #ifdef DEBUG
4128 clstat_debug.r_reclaim.value.ui64++;
4129 #endif
4130 freed = 0;
4131 mutex_enter(&rpfreelist_lock);
4132 while ((rp = rpfreelist) != NULL) {
4133 rp_rmfree(rp);
4134 mutex_exit(&rpfreelist_lock);
4135 if (rp->r_flags & RHASHED) {
4136 vp = RTOV(rp);
4137 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4138 mutex_enter(&vp->v_lock);
4139 if (vp->v_count > 1) {
4140 vp->v_count--;
4141 mutex_exit(&vp->v_lock);
4142 rw_exit(&rp->r_hashq->r_lock);
4143 mutex_enter(&rpfreelist_lock);
4144 continue;
4145 }
4146 mutex_exit(&vp->v_lock);
4147 rp_rmhash_locked(rp);
4148 rw_exit(&rp->r_hashq->r_lock);
4149 }
4150 /*
4151 * This call to rp_addfree will end up destroying the
4152 * rnode, but in a safe way with the appropriate set
4153 * of checks done.
4154 */
4155 rp_addfree(rp, CRED());
4156 mutex_enter(&rpfreelist_lock);
4157 }
4158 mutex_exit(&rpfreelist_lock);
4159 return (freed);
4160 }
4161
4162 /*ARGSUSED*/
4163 static void
4164 nfs_reclaim(void *cdrarg)
4165 {
4166
4167 #ifdef DEBUG
4168 clstat_debug.reclaim.value.ui64++;
4169 #endif
4170 if (nfs_free_reclaim())
4171 return;
4172
4173 if (nfs_active_reclaim())
4174 return;
4175
4176 (void) nfs_rnode_reclaim();
4177 }
4178
4179 /*
4180 * NFS client failover support
4181 *
4182 * Routines to copy filehandles
4183 */
4184 void
4185 nfscopyfh(caddr_t fhp, vnode_t *vp)
4186 {
4187 fhandle_t *dest = (fhandle_t *)fhp;
4188
4189 if (dest != NULL)
4190 *dest = *VTOFH(vp);
4191 }
4192
4193 void
4194 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4195 {
4196 nfs_fh3 *dest = (nfs_fh3 *)fhp;
4197
4198 if (dest != NULL)
4199 *dest = *VTOFH3(vp);
4200 }
4201
4202 /*
4203 * NFS client failover support
4204 *
4205 * failover_safe() tests various conditions to ensure that
4206 * failover is permitted for this vnode. It will be denied if:
4207 * 1) the operation in progress does not support failover (NULL fi)
4208 * 2) there are no available replicas (NULL mi_servers->sv_next)
4209 * 3) there is no partial pathname (r_path) stored to remap with
4210 * (The check for outstanding locks is disabled; local locking is forced.)
4211 */
4212 static int
4213 failover_safe(failinfo_t *fi)
4214 {
4215
4216 /*
4217 * Does this op permit failover?
4218 */
4219 if (fi == NULL || fi->vp == NULL)
4220 return (0);
4221
4222 /*
4223 * Are there any alternates to failover to?
4224 */
4225 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4226 return (0);
4227
4228 /*
4229 * Disable check; we've forced local locking
4230 *
4231 * if (flk_has_remote_locks(fi->vp))
4232 * return (0);
4233 */
4234
4235 /*
4236 * If we have no partial path, we can't do anything
4237 */
4238 if (VTOR(fi->vp)->r_path == NULL)
4239 return (0);
4240
4241 return (1);
4242 }
4243
4244 #include <sys/thread.h>
4245
4246 /*
4247 * NFS client failover support
4248 *
4249 * failover_newserver() will start a search for a new server,
4250 * preferably by starting an async thread to do the work. If
4251 * someone is already doing this (recognizable by MI_BINDINPROG
4252 * being set), it will simply return and the calling thread
4253 * will queue on the mi_failover_cv condition variable.
4254 */
4255 static void
4256 failover_newserver(mntinfo_t *mi)
4257 {
4258 /*
4259 * Check if someone else is doing this already
4260 */
4261 mutex_enter(&mi->mi_lock);
4262 if (mi->mi_flags & MI_BINDINPROG) {
4263 mutex_exit(&mi->mi_lock);
4264 return;
4265 }
4266 mi->mi_flags |= MI_BINDINPROG;
4267
4268 /*
4269 * Need to hold the vfs struct so that it can't be released
4270 * while the failover thread is selecting a new server.
4271 */
4272 VFS_HOLD(mi->mi_vfsp);
4273
4274 /*
4275 * Start a thread to do the real searching.
4276 */
4277 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4278
4279 mutex_exit(&mi->mi_lock);
4280 }
4281
4282 /*
4283 * NFS client failover support
4284 *
4285 * failover_thread() will find a new server to replace the one
4286 * currently in use, wake up other threads waiting on this mount
4287 * point, and die. It will start at the head of the server list
4288 * and poll servers until it finds one with an NFS server which is
4289 * registered and responds to a NULL procedure ping.
4290 *
4291 * XXX failover_thread is unsafe within the scope of the
4292 * present model defined for cpr to suspend the system.
4293 * Specifically, over-the-wire calls made by the thread
4294 * are unsafe. The thread needs to be reevaluated in case of
4295 * future updates to the cpr suspend model.
4296 */
4297 static void
4298 failover_thread(mntinfo_t *mi)
4299 {
4300 servinfo_t *svp = NULL;
4301 CLIENT *cl;
4302 enum clnt_stat status;
4303 struct timeval tv;
4304 int error;
4305 int oncethru = 0;
4306 callb_cpr_t cprinfo;
4307 rnode_t *rp;
4308 int index;
4309 char *srvnames;
4310 size_t srvnames_len;
4311 struct nfs_clnt *nfscl = NULL;
4312 zoneid_t zoneid = getzoneid();
4313
4314 #ifdef DEBUG
4315 /*
4316 * This is currently only needed to access counters which exist on
4317 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4318 * on non-DEBUG kernels.
4319 */
4320 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4321 ASSERT(nfscl != NULL);
4322 #endif
4323
4324 /*
4325 * It's safe to piggyback on the mi_lock since failover_newserver()
4326 * code guarantees that there will be only one failover thread
4327 * per mountinfo at any instance.
4328 */
4329 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4330 "failover_thread");
4331
4332 mutex_enter(&mi->mi_lock);
4333 while (mi->mi_readers) {
4334 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4335 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4336 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4337 }
4338 mutex_exit(&mi->mi_lock);
4339
4340 tv.tv_sec = 2;
4341 tv.tv_usec = 0;
4342
4343 /*
4344 * Ping the null NFS procedure of every server in
4345 * the list until one responds. We always start
4346 * at the head of the list and always skip the one
4347 * that is current, since it's caused us a problem.
4348 */
4349 while (svp == NULL) {
4350 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4351 if (!oncethru && svp == mi->mi_curr_serv)
4352 continue;
4353
4354 /*
4355 * If the file system was forcibly umounted
4356 * while trying to do a failover, then just
4357 * give up on the failover. It won't matter
4358 * what the server is.
4359 */
4360 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4361 svp = NULL;
4362 goto done;
4363 }
4364
4365 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4366 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4367 if (error)
4368 continue;
4369
4370 if (!(mi->mi_flags & MI_INT))
4371 cl->cl_nosignal = TRUE;
4372 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4373 xdr_void, NULL, tv);
4374 if (!(mi->mi_flags & MI_INT))
4375 cl->cl_nosignal = FALSE;
4376 AUTH_DESTROY(cl->cl_auth);
4377 CLNT_DESTROY(cl);
4378 if (status == RPC_SUCCESS) {
4379 if (svp == mi->mi_curr_serv) {
4380 #ifdef DEBUG
4381 zcmn_err(zoneid, CE_NOTE,
4382 "NFS%d: failing over: selecting original server %s",
4383 mi->mi_vers, svp->sv_hostname);
4384 #else
4385 zcmn_err(zoneid, CE_NOTE,
4386 "NFS: failing over: selecting original server %s",
4387 svp->sv_hostname);
4388 #endif
4389 } else {
4390 #ifdef DEBUG
4391 zcmn_err(zoneid, CE_NOTE,
4392 "NFS%d: failing over from %s to %s",
4393 mi->mi_vers,
4394 mi->mi_curr_serv->sv_hostname,
4395 svp->sv_hostname);
4396 #else
4397 zcmn_err(zoneid, CE_NOTE,
4398 "NFS: failing over from %s to %s",
4399 mi->mi_curr_serv->sv_hostname,
4400 svp->sv_hostname);
4401 #endif
4402 }
4403 break;
4404 }
4405 }
4406
4407 if (svp == NULL) {
4408 if (!oncethru) {
4409 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4410 #ifdef DEBUG
4411 zprintf(zoneid,
4412 "NFS%d servers %s not responding "
4413 "still trying\n", mi->mi_vers, srvnames);
4414 #else
4415 zprintf(zoneid, "NFS servers %s not responding "
4416 "still trying\n", srvnames);
4417 #endif
4418 oncethru = 1;
4419 }
4420 mutex_enter(&mi->mi_lock);
4421 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4422 mutex_exit(&mi->mi_lock);
4423 delay(hz);
4424 mutex_enter(&mi->mi_lock);
4425 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4426 mutex_exit(&mi->mi_lock);
4427 }
4428 }
4429
4430 if (oncethru) {
4431 #ifdef DEBUG
4432 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4433 #else
4434 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4435 #endif
4436 }
4437
4438 if (svp != mi->mi_curr_serv) {
4439 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4440 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4441 rw_enter(&rtable[index].r_lock, RW_WRITER);
4442 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4443 mi->mi_vfsp);
4444 if (rp != NULL) {
4445 if (rp->r_flags & RHASHED)
4446 rp_rmhash_locked(rp);
4447 rw_exit(&rtable[index].r_lock);
4448 rp->r_server = svp;
4449 rp->r_fh = svp->sv_fhandle;
4450 (void) nfs_free_data_reclaim(rp);
4451 index = rtablehash(&rp->r_fh);
4452 rp->r_hashq = &rtable[index];
4453 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4454 vn_exists(RTOV(rp));
4455 rp_addhash(rp);
4456 rw_exit(&rp->r_hashq->r_lock);
4457 VN_RELE(RTOV(rp));
4458 } else
4459 rw_exit(&rtable[index].r_lock);
4460 }
4461
4462 done:
4463 if (oncethru)
4464 kmem_free(srvnames, srvnames_len);
4465 mutex_enter(&mi->mi_lock);
4466 mi->mi_flags &= ~MI_BINDINPROG;
4467 if (svp != NULL) {
4468 mi->mi_curr_serv = svp;
4469 mi->mi_failover++;
4470 #ifdef DEBUG
4471 nfscl->nfscl_stat.failover.value.ui64++;
4472 #endif
4473 }
4474 cv_broadcast(&mi->mi_failover_cv);
4475 CALLB_CPR_EXIT(&cprinfo);
4476 VFS_RELE(mi->mi_vfsp);
4477 zthread_exit();
4478 /* NOTREACHED */
4479 }
4480
4481 /*
4482 * NFS client failover support
4483 *
4484 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4485 * is cleared, meaning that failover is complete. Called with
4486 * mi_lock mutex held.
4487 */
4488 static int
4489 failover_wait(mntinfo_t *mi)
4490 {
4491 k_sigset_t smask;
4492
4493 /*
4494 * If someone else is hunting for a living server,
4495 * sleep until it's done. After our sleep, we may
4496 * be bound to the right server and get off cheaply.
4497 */
4498 while (mi->mi_flags & MI_BINDINPROG) {
4499 /*
4500 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4501 * and SIGTERM. (Preserving the existing masks).
4502 * Mask out SIGINT if mount option nointr is specified.
4503 */
4504 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4505 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4506 /*
4507 * restore original signal mask
4508 */
4509 sigunintr(&smask);
4510 return (EINTR);
4511 }
4512 /*
4513 * restore original signal mask
4514 */
4515 sigunintr(&smask);
4516 }
4517 return (0);
4518 }
4519
4520 /*
4521 * NFS client failover support
4522 *
4523 * failover_remap() will do a partial pathname lookup and find the
4524 * desired vnode on the current server. The interim vnode will be
4525 * discarded after we pilfer the new filehandle.
4526 *
4527 * Side effects:
4528 * - This routine will also update the filehandle in the args structure
4529 * pointed to by the fi->fhp pointer if it is non-NULL.
4530 */
4531
4532 static int
4533 failover_remap(failinfo_t *fi)
4534 {
4535 vnode_t *vp, *nvp, *rootvp;
4536 rnode_t *rp, *nrp;
4537 mntinfo_t *mi;
4538 int error;
4539 #ifdef DEBUG
4540 struct nfs_clnt *nfscl;
4541
4542 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4543 ASSERT(nfscl != NULL);
4544 #endif
4545 /*
4546 * Sanity check
4547 */
4548 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4549 return (EINVAL);
4550 vp = fi->vp;
4551 rp = VTOR(vp);
4552 mi = VTOMI(vp);
4553
4554 if (!(vp->v_flag & VROOT)) {
4555 /*
4556 * Given the root fh, use the path stored in
4557 * the rnode to find the fh for the new server.
4558 */
4559 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4560 if (error)
4561 return (error);
4562
4563 error = failover_lookup(rp->r_path, rootvp,
4564 fi->lookupproc, fi->xattrdirproc, &nvp);
4565
4566 VN_RELE(rootvp);
4567
4568 if (error)
4569 return (error);
4570
4571 /*
4572 * If we found the same rnode, we're done now
4573 */
4574 if (nvp == vp) {
4575 /*
4576 * The failed server and the new server may physically be the
4577 * same machine, or they may share the same disk subsystem.  In
4578 * that case the file handle for a given file path does not
4579 * change, so a lookup with the same filehandle will always
4580 * locate the same rnode as the existing one.  All we may
4581 * need to do is update r_server with the current
4582 * servinfo.
4583 */
4584 if (!VALID_FH(fi)) {
4585 rp->r_server = mi->mi_curr_serv;
4586 }
4587 VN_RELE(nvp);
4588 return (0);
4589 }
4590
4591 /*
4592 * Try to make it so that no one else will find this
4593 * vnode because it is just a temporary to hold the
4594 * new file handle until that file handle can be
4595 * copied to the original vnode/rnode.
4596 */
4597 nrp = VTOR(nvp);
4598 mutex_enter(&mi->mi_remap_lock);
4599 /*
4600 * Some other thread could have raced in here and could
4601 * have done the remap for this particular rnode before
4602 * this thread here. Check for rp->r_server and
4603 * mi->mi_curr_serv and return if they are same.
4604 */
4605 if (VALID_FH(fi)) {
4606 mutex_exit(&mi->mi_remap_lock);
4607 VN_RELE(nvp);
4608 return (0);
4609 }
4610
4611 if (nrp->r_flags & RHASHED)
4612 rp_rmhash(nrp);
4613
4614 /*
4615 * As a heuristic check on the validity of the new
4616 * file, check that the size and type match against
4617 * what we remember from the old version.
4618 */
4619 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4620 mutex_exit(&mi->mi_remap_lock);
4621 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4622 "NFS replicas %s and %s: file %s not same.",
4623 rp->r_server->sv_hostname,
4624 nrp->r_server->sv_hostname, rp->r_path);
4625 VN_RELE(nvp);
4626 return (EINVAL);
4627 }
4628
4629 /*
4630 * snarf the filehandle from the new rnode
4631 * then release it, again while updating the
4632 * hash queues for the rnode.
4633 */
4634 if (rp->r_flags & RHASHED)
4635 rp_rmhash(rp);
4636 rp->r_server = mi->mi_curr_serv;
4637 rp->r_fh = nrp->r_fh;
4638 rp->r_hashq = nrp->r_hashq;
4639 /*
4640 * Copy the attributes from the new rnode to the old
4641 * rnode. This will help to reduce unnecessary page
4642 * cache flushes.
4643 */
4644 rp->r_attr = nrp->r_attr;
4645 rp->r_attrtime = nrp->r_attrtime;
4646 rp->r_mtime = nrp->r_mtime;
4647 (void) nfs_free_data_reclaim(rp);
4648 nfs_setswaplike(vp, &rp->r_attr);
4649 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4650 rp_addhash(rp);
4651 rw_exit(&rp->r_hashq->r_lock);
4652 mutex_exit(&mi->mi_remap_lock);
4653 VN_RELE(nvp);
4654 }
4655
4656 /*
4657 * Update successful failover remap count
4658 */
4659 mutex_enter(&mi->mi_lock);
4660 mi->mi_remap++;
4661 mutex_exit(&mi->mi_lock);
4662 #ifdef DEBUG
4663 nfscl->nfscl_stat.remap.value.ui64++;
4664 #endif
4665
4666 /*
4667 * If we have a copied filehandle to update, do it now.
4668 */
4669 if (fi->fhp != NULL && fi->copyproc != NULL)
4670 (*fi->copyproc)(fi->fhp, vp);
4671
4672 return (0);
4673 }
4674
4675 /*
4676 * NFS client failover support
4677 *
4678 * We want a simple pathname lookup routine to parse the pieces
4679 * of path in rp->r_path. We know that the path was created
4680 * as rnodes were made, so we know we have only to deal with
4681 * paths that look like:
4682 * dir1/dir2/dir3/file
4683 * Any evidence of anything like .., symlinks, or ENOTDIR
4684 * is a hard error, because it means something in this filesystem
4685 * is different from the one we came from, or has changed under
4686 * us in some way. If this is true, we want the failure.
4687 *
4688 * Extended attributes: if the filesystem is mounted with extended
4689 * attributes enabled (-o xattr), the attribute directory will be
4690 * represented in the r_path as the magic name XATTR_RPATH. So if
4691 * we see that name in the pathname, is must be because this node
4692 * is an extended attribute. Therefore, look it up that way.
4693 */
4694 static int
4695 failover_lookup(char *path, vnode_t *root,
4696 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4697 vnode_t *, cred_t *, int),
4698 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4699 vnode_t **new)
4700 {
4701 vnode_t *dvp, *nvp;
4702 int error = EINVAL;
4703 char *s, *p, *tmppath;
4704 size_t len;
4705 mntinfo_t *mi;
4706 bool_t xattr;
4707
4708 /* Make local copy of path */
4709 len = strlen(path) + 1;
4710 tmppath = kmem_alloc(len, KM_SLEEP);
4711 (void) strcpy(tmppath, path);
4712 s = tmppath;
4713
4714 dvp = root;
4715 VN_HOLD(dvp);
4716 mi = VTOMI(root);
4717 xattr = mi->mi_flags & MI_EXTATTR;
4718
4719 do {
4720 p = strchr(s, '/');
4721 if (p != NULL)
4722 *p = '\0';
4723 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4724 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4725 RFSCALL_SOFT);
4726 } else {
4727 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4728 CRED(), RFSCALL_SOFT);
4729 }
4730 if (p != NULL)
4731 *p++ = '/';
4732 if (error) {
4733 VN_RELE(dvp);
4734 kmem_free(tmppath, len);
4735 return (error);
4736 }
4737 s = p;
4738 VN_RELE(dvp);
4739 dvp = nvp;
4740 } while (p != NULL);
4741
4742 if (nvp != NULL && new != NULL)
4743 *new = nvp;
4744 kmem_free(tmppath, len);
4745 return (0);
4746 }
4747
4748 /*
4749 * NFS client failover support
4750 *
4751 * sv_free() frees the malloc'd portion of a "servinfo_t".
4752 */
4753 void
4754 sv_free(servinfo_t *svp)
4755 {
4756 servinfo_t *next;
4757 struct knetconfig *knconf;
4758
4759 while (svp != NULL) {
4760 next = svp->sv_next;
4761 if (svp->sv_secdata)
4762 sec_clnt_freeinfo(svp->sv_secdata);
4763 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4764 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4765 knconf = svp->sv_knconf;
4766 if (knconf != NULL) {
4767 if (knconf->knc_protofmly != NULL)
4768 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4769 if (knconf->knc_proto != NULL)
4770 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4771 kmem_free(knconf, sizeof (*knconf));
4772 }
4773 knconf = svp->sv_origknconf;
4774 if (knconf != NULL) {
4775 if (knconf->knc_protofmly != NULL)
4776 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4777 if (knconf->knc_proto != NULL)
4778 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4779 kmem_free(knconf, sizeof (*knconf));
4780 }
4781 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4782 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4783 mutex_destroy(&svp->sv_lock);
4784 kmem_free(svp, sizeof (*svp));
4785 svp = next;
4786 }
4787 }
4788
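/*
 * nfs_rwlock_t is a reader/writer lock that allows the writer to
 * re-enter the lock recursively and allows waiting to be interrupted
 * by a signal.  A minimal usage sketch for an interruptible reader
 * hold follows; the r_rwlock field and the INTR() test are only
 * assumptions about a typical caller, not part of this interface:
 *
 *	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, INTR(vp)))
 *		return (EINTR);
 *	... access the state protected by the lock ...
 *	nfs_rw_exit(&rp->r_rwlock);
 */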
4789 /*
4790 * Only can return non-zero if intr != 0.
4791 */
4792 int
4793 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4794 {
4795
4796 mutex_enter(&l->lock);
4797
4798 /*
4799 * If this is a nested enter, then allow it. There
4800 * must be as many exits as there are enters.
4801 */
4802 if (l->owner == curthread) {
4803 /* lock is held for writing by current thread */
4804 ASSERT(rw == RW_READER || rw == RW_WRITER);
4805 l->count--;
4806 } else if (rw == RW_READER) {
4807 /*
4808 * While there is a writer active or writers waiting,
4809 * then wait for them to finish up and move on. Then,
4810 * increment the count to indicate that a reader is
4811 * active.
4812 */
4813 while (l->count < 0 || l->waiters > 0) {
4814 if (intr) {
4815 klwp_t *lwp = ttolwp(curthread);
4816
4817 if (lwp != NULL)
4818 lwp->lwp_nostop++;
4819 if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4820 if (lwp != NULL)
4821 lwp->lwp_nostop--;
4822 mutex_exit(&l->lock);
4823 return (EINTR);
4824 }
4825 if (lwp != NULL)
4826 lwp->lwp_nostop--;
4827 } else
4828 cv_wait(&l->cv_rd, &l->lock);
4829 }
4830 ASSERT(l->count < INT_MAX);
4831 #ifdef DEBUG
4832 if ((l->count % 10000) == 9999)
4833 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4834 "rwlock @ %p\n", l->count, (void *)l);
4835 #endif
4836 l->count++;
4837 } else {
4838 ASSERT(rw == RW_WRITER);
4839 /*
4840 * While there are readers active or a writer
4841 * active, then wait for all of the readers
4842 * to finish or for the writer to finish.
4843 * Then, set the owner field to curthread and
4844 * decrement count to indicate that a writer
4845 * is active.
4846 */
4847 while (l->count != 0) {
4848 l->waiters++;
4849 if (intr) {
4850 klwp_t *lwp = ttolwp(curthread);
4851
4852 if (lwp != NULL)
4853 lwp->lwp_nostop++;
4854 if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4855 if (lwp != NULL)
4856 lwp->lwp_nostop--;
4857 l->waiters--;
4858 /*
4859 * If there are readers active and no
4860 * writers waiting then wake up all of
4861 * the waiting readers (if any).
4862 */
4863 if (l->count > 0 && l->waiters == 0)
4864 cv_broadcast(&l->cv_rd);
4865 mutex_exit(&l->lock);
4866 return (EINTR);
4867 }
4868 if (lwp != NULL)
4869 lwp->lwp_nostop--;
4870 } else
4871 cv_wait(&l->cv, &l->lock);
4872 l->waiters--;
4873 }
4874 ASSERT(l->owner == NULL);
4875 l->owner = curthread;
4876 l->count--;
4877 }
4878
4879 mutex_exit(&l->lock);
4880
4881 return (0);
4882 }
4883
4884 /*
4885 * If the lock is available, obtain it and return non-zero. If there is
4886 * already a conflicting lock, return 0 immediately.
4887 */
4888
4889 int
4890 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4891 {
4892 mutex_enter(&l->lock);
4893
4894 /*
4895 * If this is a nested enter, then allow it. There
4896 * must be as many exits as there are enters.
4897 */
4898 if (l->owner == curthread) {
4899 /* lock is held for writing by current thread */
4900 ASSERT(rw == RW_READER || rw == RW_WRITER);
4901 l->count--;
4902 } else if (rw == RW_READER) {
4903 /*
4904 * If there is a writer active or writers waiting, deny the
4905 * lock. Otherwise, bump the count of readers.
4906 */
4907 if (l->count < 0 || l->waiters > 0) {
4908 mutex_exit(&l->lock);
4909 return (0);
4910 }
4911 l->count++;
4912 } else {
4913 ASSERT(rw == RW_WRITER);
4914 /*
4915 * If there are readers active or a writer active, deny the
4916 * lock. Otherwise, set the owner field to curthread and
4917 * decrement count to indicate that a writer is active.
4918 */
4919 if (l->count != 0) {
4920 mutex_exit(&l->lock);
4921 return (0);
4922 }
4923 ASSERT(l->owner == NULL);
4924 l->owner = curthread;
4925 l->count--;
4926 }
4927
4928 mutex_exit(&l->lock);
4929
4930 return (1);
4931 }
4932
4933 void
4934 nfs_rw_exit(nfs_rwlock_t *l)
4935 {
4936
4937 mutex_enter(&l->lock);
4938
4939 if (l->owner != NULL) {
4940 ASSERT(l->owner == curthread);
4941
4942 /*
4943 * To release a writer lock increment count to indicate that
4944 * there is one less writer active. If this was the last of
4945 * possibly nested writer locks, then clear the owner field as
4946 * well to indicate that there is no writer active.
4947 */
4948 ASSERT(l->count < 0);
4949 l->count++;
4950 if (l->count == 0) {
4951 l->owner = NULL;
4952
4953 /*
4954 * If there are no writers waiting then wakeup all of
4955 * the waiting readers (if any).
4956 */
4957 if (l->waiters == 0)
4958 cv_broadcast(&l->cv_rd);
4959 }
4960 } else {
4961 /*
4962 * To release a reader lock just decrement count to indicate
4963 * that there is one less reader active.
4964 */
4965 ASSERT(l->count > 0);
4966 l->count--;
4967 }
4968
4969 /*
4970 * If there is neither a reader nor a writer active and there is a
4971 * writer waiting, we need to wake it up.
4972 */
4973 if (l->count == 0 && l->waiters > 0)
4974 cv_signal(&l->cv);
4975 mutex_exit(&l->lock);
4976 }
4977
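/*
 * Return non-zero if the lock is currently held in the given mode: a
 * positive count means one or more readers, a negative count means a
 * writer.  Best suited to ASSERT-style checks, since the answer can
 * change as soon as it is returned.
 */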
4978 int
4979 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4980 {
4981
4982 if (rw == RW_READER)
4983 return (l->count > 0);
4984 ASSERT(rw == RW_WRITER);
4985 return (l->count < 0);
4986 }
4987
4988 /* ARGSUSED */
4989 void
4990 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4991 {
4992
4993 l->count = 0;
4994 l->waiters = 0;
4995 l->owner = NULL;
4996 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4997 cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4998 cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
4999 }
5000
5001 void
5002 nfs_rw_destroy(nfs_rwlock_t *l)
5003 {
5004
5005 mutex_destroy(&l->lock);
5006 cv_destroy(&l->cv);
5007 cv_destroy(&l->cv_rd);
5008 }
5009
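/*
 * Comparison routines used to keep readdir cache entries ordered,
 * first by cookie and then by buffer length; they return -1, 0, or 1
 * in the usual comparator style.  One compares the nfs3_cookie field,
 * the other the nfs_cookie field.
 */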
5010 int
5011 nfs3_rddir_compar(const void *x, const void *y)
5012 {
5013 rddir_cache *a = (rddir_cache *)x;
5014 rddir_cache *b = (rddir_cache *)y;
5015
5016 if (a->nfs3_cookie == b->nfs3_cookie) {
5017 if (a->buflen == b->buflen)
5018 return (0);
5019 if (a->buflen < b->buflen)
5020 return (-1);
5021 return (1);
5022 }
5023
5024 if (a->nfs3_cookie < b->nfs3_cookie)
5025 return (-1);
5026
5027 return (1);
5028 }
5029
5030 int
5031 nfs_rddir_compar(const void *x, const void *y)
5032 {
5033 rddir_cache *a = (rddir_cache *)x;
5034 rddir_cache *b = (rddir_cache *)y;
5035
5036 if (a->nfs_cookie == b->nfs_cookie) {
5037 if (a->buflen == b->buflen)
5038 return (0);
5039 if (a->buflen < b->buflen)
5040 return (-1);
5041 return (1);
5042 }
5043
5044 if (a->nfs_cookie < b->nfs_cookie)
5045 return (-1);
5046
5047 return (1);
5048 }
5049
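/*
 * Build a single comma-separated string containing the hostnames of
 * all servers for this mount.  The string is kmem_alloc()ed; the
 * caller is responsible for freeing it using the length returned
 * through *len.
 */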
5050 static char *
5051 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
5052 {
5053 servinfo_t *s;
5054 char *srvnames;
5055 char *namep;
5056 size_t length;
5057
5058 /*
5059 * Calculate the length of the string required to hold all
5060 * of the server names plus either a comma or a null
5061 * character following each individual one.
5062 */
5063 length = 0;
5064 for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5065 length += s->sv_hostnamelen;
5066
5067 srvnames = kmem_alloc(length, KM_SLEEP);
5068
5069 namep = srvnames;
5070 for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5071 (void) strcpy(namep, s->sv_hostname);
5072 namep += s->sv_hostnamelen - 1;
5073 *namep++ = ',';
5074 }
5075 *--namep = '\0';
5076
5077 *len = length;
5078
5079 return (srvnames);
5080 }
5081
5082 /*
5083 * These two functions are temporary and designed for the upgrade-workaround
5084 * only. They cannot be used for general zone-crossing NFS client support, and
5085 * will be removed shortly.
5086 *
5087 * When the workaround is enabled, all NFS traffic is forced into the global
5088 * zone. These functions are called when the code needs to refer to the state
5089 * of the underlying network connection. They're not called when the function
5090 * needs to refer to the state of the process that invoked the system call.
5091 * (E.g., when checking whether the zone is shutting down during the mount()
5092 * call.)
5093 */
5094
5095 struct zone *
5096 nfs_zone(void)
5097 {
5098 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5099 }
5100
5101 zoneid_t
5102 nfs_zoneid(void)
5103 {
5104 return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5105 }
5106
5107 /*
5108 * nfs_mount_label_policy:
5109 * Determine whether the mount is allowed according to MAC check,
5110 * by comparing (where appropriate) label of the remote server
5111 * against the label of the zone being mounted into.
5112 *
5113 * Returns:
5114 * 0 : access allowed
5115 * -1 : read-only access allowed (i.e., read-down)
5116 * >0 : error code, such as EACCES
5117 */
5118 int
5119 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5120 struct knetconfig *knconf, cred_t *cr)
5121 {
5122 int addr_type;
5123 void *ipaddr;
5124 bslabel_t *server_sl, *mntlabel;
5125 zone_t *mntzone = NULL;
5126 ts_label_t *zlabel;
5127 tsol_tpc_t *tp;
5128 ts_label_t *tsl = NULL;
5129 int retv;
5130
5131 /*
5132 * Get the zone's label. Each zone on a labeled system has a label.
5133 */
5134 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5135 zlabel = mntzone->zone_slabel;
5136 ASSERT(zlabel != NULL);
5137 label_hold(zlabel);
5138
5139 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5140 addr_type = IPV4_VERSION;
5141 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5142 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5143 addr_type = IPV6_VERSION;
5144 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5145 } else {
5146 retv = 0;
5147 goto out;
5148 }
5149
5150 retv = EACCES; /* assume the worst */
5151
5152 /*
5153 * Next, get the assigned label of the remote server.
5154 */
5155 tp = find_tpc(ipaddr, addr_type, B_FALSE);
5156 if (tp == NULL)
5157 goto out; /* error getting host entry */
5158
5159 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5160 goto rel_tpc; /* invalid domain */
5161 if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5162 (tp->tpc_tp.host_type != UNLABELED))
5163 goto rel_tpc; /* invalid hosttype */
5164
5165 if (tp->tpc_tp.host_type == SUN_CIPSO) {
5166 tsl = getflabel_cipso(vfsp);
5167 if (tsl == NULL)
5168 goto rel_tpc; /* error getting server lbl */
5169
5170 server_sl = label2bslabel(tsl);
5171 } else { /* UNLABELED */
5172 server_sl = &tp->tpc_tp.tp_def_label;
5173 }
5174
5175 mntlabel = label2bslabel(zlabel);
5176
5177 /*
5178 * Now compare labels to complete the MAC check. If the labels
5179 * are equal or if the requestor is in the global zone and has
5180 * NET_MAC_AWARE, then allow read-write access. (Except for
5181 * mounts into the global zone itself; restrict these to
5182 * read-only.)
5183 *
5184 * If the requestor is in some other zone, but his label
5185 * dominates the server, then allow read-down.
5186 *
5187 * Otherwise, access is denied.
5188 */
5189 if (blequal(mntlabel, server_sl) ||
5190 (crgetzoneid(cr) == GLOBAL_ZONEID &&
5191 getpflags(NET_MAC_AWARE, cr) != 0)) {
5192 if ((mntzone == global_zone) ||
5193 !blequal(mntlabel, server_sl))
5194 retv = -1; /* read-only */
5195 else
5196 retv = 0; /* access OK */
5197 } else if (bldominates(mntlabel, server_sl)) {
5198 retv = -1; /* read-only */
5199 } else {
5200 retv = EACCES;
5201 }
5202
5203 if (tsl != NULL)
5204 label_rele(tsl);
5205
5206 rel_tpc:
5207 TPC_RELE(tp);
5208 out:
5209 if (mntzone)
5210 zone_rele(mntzone);
5211 label_rele(zlabel);
5212 return (retv);
5213 }
5214
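/*
 * Return B_TRUE if the calling process has a controlling terminal.
 */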
5215 boolean_t
5216 nfs_has_ctty(void)
5217 {
5218 boolean_t rv;
5219 mutex_enter(&curproc->p_splock);
5220 rv = (curproc->p_sessp->s_vp != NULL);
5221 mutex_exit(&curproc->p_splock);
5222 return (rv);
5223 }
5224
5225 /*
5226 * Look in the xattr directory to see whether it has any generic user attributes.
5227 */
5228 int
5229 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5230 {
5231 struct uio uio;
5232 struct iovec iov;
5233 char *dbuf;
5234 struct dirent64 *dp;
5235 size_t dlen = 8 * 1024;
5236 size_t dbuflen;
5237 int eof = 0;
5238 int error;
5239
5240 *valp = 0;
5241 dbuf = kmem_alloc(dlen, KM_SLEEP);
5242 uio.uio_iov = &iov;
5243 uio.uio_iovcnt = 1;
5244 uio.uio_segflg = UIO_SYSSPACE;
5245 uio.uio_fmode = 0;
5246 uio.uio_extflg = UIO_COPY_CACHED;
5247 uio.uio_loffset = 0;
5248 uio.uio_resid = dlen;
5249 iov.iov_base = dbuf;
5250 iov.iov_len = dlen;
5251 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5252 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5253 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5254
5255 dbuflen = dlen - uio.uio_resid;
5256
5257 if (error || dbuflen == 0) {
5258 kmem_free(dbuf, dlen);
5259 return (error);
5260 }
5261
5262 dp = (dirent64_t *)dbuf;
5263
5264 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5265 if (strcmp(dp->d_name, ".") == 0 ||
5266 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5267 VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5268 VIEW_READONLY) == 0) {
5269 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5270 continue;
5271 }
5272
5273 *valp = 1;
5274 break;
5275 }
5276 kmem_free(dbuf, dlen);
5277 return (0);
5278 }
5279