1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 * All rights reserved.
29 */
30
31 /*
32 * Copyright 2018 Nexenta Systems, Inc.
33 */
34
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/systm.h>
38 #include <sys/thread.h>
39 #include <sys/t_lock.h>
40 #include <sys/time.h>
41 #include <sys/vnode.h>
42 #include <sys/vfs.h>
43 #include <sys/errno.h>
44 #include <sys/buf.h>
45 #include <sys/stat.h>
46 #include <sys/cred.h>
47 #include <sys/kmem.h>
48 #include <sys/debug.h>
49 #include <sys/dnlc.h>
50 #include <sys/vmsystm.h>
51 #include <sys/flock.h>
52 #include <sys/share.h>
53 #include <sys/cmn_err.h>
54 #include <sys/tiuser.h>
55 #include <sys/sysmacros.h>
56 #include <sys/callb.h>
57 #include <sys/acl.h>
58 #include <sys/kstat.h>
59 #include <sys/signal.h>
60 #include <sys/list.h>
61 #include <sys/zone.h>
62
63 #include <rpc/types.h>
64 #include <rpc/xdr.h>
65 #include <rpc/auth.h>
66 #include <rpc/clnt.h>
67
68 #include <nfs/nfs.h>
69 #include <nfs/nfs_clnt.h>
70 #include <nfs/nfs_cmd.h>
71
72 #include <nfs/rnode.h>
73 #include <nfs/nfs_acl.h>
74 #include <nfs/lm.h>
75
76 #include <vm/hat.h>
77 #include <vm/as.h>
78 #include <vm/page.h>
79 #include <vm/pvn.h>
80 #include <vm/seg.h>
81 #include <vm/seg_map.h>
82 #include <vm/seg_vn.h>
83
84 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
85 cred_t *);
86 static int nfs_getattr_cache(vnode_t *, struct vattr *);
87 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
88
89 struct mi_globals {
90 kmutex_t mig_lock; /* lock protecting mig_list */
91 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */
92 boolean_t mig_destructor_called;
93 };
94
95 static zone_key_t mi_list_key;
96
97 /* Debugging flag for PC file shares. */
98 extern int share_debug;
99
100 /*
101 * Attributes caching:
102 *
103 * Attributes are cached in the rnode in struct vattr form.
104 * There is a time associated with the cached attributes (r_attrtime)
105 * which tells whether the attributes are valid. The time is initialized
106 * to the difference between current time and the modify time of the vnode
107 * when new attributes are cached. This allows the attributes for
108 * files that have changed recently to be timed out sooner than for files
109 * that have not changed for a long time. There are minimum and maximum
110 * timeout values that can be set per mount point.
111 */
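/*
 * As a rough illustration of the policy implemented in nfs_attrcache_va()
 * below: assuming the common mount defaults of acregmin = 3 sec and
 * acregmax = 60 sec, a regular file whose last detected change was 10
 * seconds ago gets roughly a 10 second attribute timeout, while one that
 * has not changed for several minutes is clamped to the 60 second maximum.
 */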
112
113 int
114 nfs_waitfor_purge_complete(vnode_t *vp)
115 {
116 rnode_t *rp;
117 k_sigset_t smask;
118
119 rp = VTOR(vp);
120 if (rp->r_serial != NULL && rp->r_serial != curthread) {
121 mutex_enter(&rp->r_statelock);
122 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
123 while (rp->r_serial != NULL) {
124 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
125 sigunintr(&smask);
126 mutex_exit(&rp->r_statelock);
127 return (EINTR);
128 }
129 }
130 sigunintr(&smask);
131 mutex_exit(&rp->r_statelock);
132 }
133 return (0);
134 }
135
136 /*
137 * Validate caches by checking cached attributes. If the cached
138 * attributes have timed out, then get new attributes from the server.
139 * As a side effect, this will do cache invalidation if the attributes
140 * have changed.
141 *
142 * If the attributes have not timed out and if there is a cache
143 * invalidation being done by some other thread, then wait until that
144 * thread has completed the cache invalidation.
145 */
146 int
147 nfs_validate_caches(vnode_t *vp, cred_t *cr)
148 {
149 int error;
150 struct vattr va;
151
152 if (ATTRCACHE_VALID(vp)) {
153 error = nfs_waitfor_purge_complete(vp);
154 if (error)
155 return (error);
156 return (0);
157 }
158
159 va.va_mask = AT_ALL;
160 return (nfs_getattr_otw(vp, &va, cr));
161 }
162
163 /*
164 * Validate caches by checking cached attributes. If the cached
165 * attributes have timed out, then get new attributes from the server.
166 * As a side effect, this will do cache invalidation if the attributes
167 * have changed.
168 *
169 * If the attributes have not timed out and if there is a cache
170 * invalidation being done by some other thread, then wait until that
171 * thread has completed the cache invalidation.
172 */
173 int
174 nfs3_validate_caches(vnode_t *vp, cred_t *cr)
175 {
176 int error;
177 struct vattr va;
178
179 if (ATTRCACHE_VALID(vp)) {
180 error = nfs_waitfor_purge_complete(vp);
181 if (error)
182 return (error);
183 return (0);
184 }
185
186 va.va_mask = AT_ALL;
187 return (nfs3_getattr_otw(vp, &va, cr));
188 }
189
190 /*
191 * Purge all of the various NFS `data' caches.
192 */
193 void
194 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
195 {
196 rnode_t *rp;
197 char *contents;
198 int size;
199 int error;
200
201 /*
202 * Purge the DNLC for any entries which refer to this file.
203 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
204 */
205 rp = VTOR(vp);
206 mutex_enter(&rp->r_statelock);
207 if (vp->v_count > 1 &&
208 (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
209 !(rp->r_flags & RINDNLCPURGE)) {
210 /*
211 * Set the RINDNLCPURGE flag to prevent recursive entry
212 * into dnlc_purge_vp()
213 */
214 if (vp->v_type == VDIR)
215 rp->r_flags |= RINDNLCPURGE;
216 mutex_exit(&rp->r_statelock);
217 dnlc_purge_vp(vp);
218 mutex_enter(&rp->r_statelock);
219 if (rp->r_flags & RINDNLCPURGE)
220 rp->r_flags &= ~RINDNLCPURGE;
221 }
222
223 /*
224 * Clear any readdir state bits and purge the readlink response cache.
225 */
226 contents = rp->r_symlink.contents;
227 size = rp->r_symlink.size;
228 rp->r_symlink.contents = NULL;
229 mutex_exit(&rp->r_statelock);
230
231 if (contents != NULL) {
232
233 kmem_free((void *)contents, size);
234 }
235
236 /*
237 * Flush the page cache.
238 */
239 if (vn_has_cached_data(vp)) {
240 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
241 if (error && (error == ENOSPC || error == EDQUOT)) {
242 mutex_enter(&rp->r_statelock);
243 if (!rp->r_error)
244 rp->r_error = error;
245 mutex_exit(&rp->r_statelock);
246 }
247 }
248
249 /*
250 * Flush the readdir response cache.
251 */
252 if (HAVE_RDDIR_CACHE(rp))
253 nfs_purge_rddir_cache(vp);
254 }
255
256 /*
257 * Purge the readdir cache of all entries
258 */
259 void
260 nfs_purge_rddir_cache(vnode_t *vp)
261 {
262 rnode_t *rp;
263 rddir_cache *rdc;
264 rddir_cache *nrdc;
265
266 rp = VTOR(vp);
267 top:
268 mutex_enter(&rp->r_statelock);
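/*
 * Clearing RLOOKUP and setting RREADDIRPLUS below nudges the next
 * readdir of this directory toward READDIRPLUS (on v3 mounts) so the
 * name and attribute caches can be repopulated.
 */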
269 rp->r_direof = NULL;
270 rp->r_flags &= ~RLOOKUP;
271 rp->r_flags |= RREADDIRPLUS;
272 rdc = avl_first(&rp->r_dir);
273 while (rdc != NULL) {
274 nrdc = AVL_NEXT(&rp->r_dir, rdc);
275 avl_remove(&rp->r_dir, rdc);
276 rddir_cache_rele(rdc);
277 rdc = nrdc;
278 }
279 mutex_exit(&rp->r_statelock);
280 }
281
282 /*
283 * Do a cache check based on the post-operation attributes.
284 * Then make them the new cached attributes. If no attributes
285 * were returned, then mark the attributes as timed out.
286 */
287 void
288 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
289 {
290 vattr_t attr;
291
292 if (!poap->attributes) {
293 PURGE_ATTRCACHE(vp);
294 return;
295 }
296 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
297 }
298
299 /*
300 * Same as above, but using a vattr
301 */
302 void
303 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
304 cred_t *cr)
305 {
306 if (!poap->attributes) {
307 PURGE_ATTRCACHE(vp);
308 return;
309 }
310 nfs_attr_cache(vp, poap->fres.vap, t, cr);
311 }
312
313 /*
314 * Do a cache check based on the weak cache consistency attributes.
315 * These consist of a small set of pre-operation attributes and the
316 * full set of post-operation attributes.
317 *
318 * If we are given the pre-operation attributes, then use them to
319 * check the validity of the various caches. Then, if we got the
320 * post-operation attributes, make them the new cached attributes.
321 * If we didn't get the post-operation attributes, then mark the
322 * attribute cache as timed out so that the next reference will
323 * cause a GETATTR to the server to refresh with the current
324 * attributes.
325 *
326 * Otherwise, if we didn't get the pre-operation attributes, but
327 * we did get the post-operation attributes, then use these
328 * attributes to check the validity of the various caches. This
329 * will probably cause a flush of the caches because if the
330 * operation succeeded, the attributes of the object were changed
331 * in some way from the old post-operation attributes. This
332 * should be okay because it is the safe thing to do. After
333 * checking the data caches, then we make these the new cached
334 * attributes.
335 *
336 * Otherwise, we didn't get either the pre- or post-operation
337 * attributes. Simply mark the attribute cache as timed out so
338 * the next reference will cause a GETATTR to the server to
339 * refresh with the current attributes.
340 *
341 * If an error occurred trying to convert the over the wire
342 * attributes to a vattr, then simply mark the attribute cache as
343 * timed out.
344 */
345 void
346 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
347 {
348 vattr_t bva;
349 vattr_t ava;
350
351 if (wccp->after.attributes) {
352 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
353 PURGE_ATTRCACHE(vp);
354 return;
355 }
356 if (wccp->before.attributes) {
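/*
 * The weak cache consistency "before" attributes carry only size,
 * mtime and ctime, so build a partial vattr with just those fields
 * for the validity check.
 */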
357 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
358 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
359 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
360 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
361 bva.va_size = wccp->before.attr.size;
362 nfs3_attr_cache(vp, &bva, &ava, t, cr);
363 } else
364 nfs_attr_cache(vp, &ava, t, cr);
365 } else {
366 PURGE_ATTRCACHE(vp);
367 }
368 }
369
370 /*
371 * Set attributes cache for given vnode using nfsattr.
372 *
373 * This routine does not do cache validation with the attributes.
374 *
375 * If an error occurred trying to convert the over the wire
376 * attributes to a vattr, then simply mark the attribute cache as
377 * timed out.
378 */
379 void
380 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
381 {
382 rnode_t *rp;
383 struct vattr va;
384
385 if (!nattr_to_vattr(vp, na, &va)) {
386 rp = VTOR(vp);
387 mutex_enter(&rp->r_statelock);
388 if (rp->r_mtime <= t)
389 nfs_attrcache_va(vp, &va);
390 mutex_exit(&rp->r_statelock);
391 } else {
392 PURGE_ATTRCACHE(vp);
393 }
394 }
395
396 /*
397 * Set attributes cache for given vnode using fattr3.
398 *
399 * This routine does not do cache validation with the attributes.
400 *
401 * If an error occurred trying to convert the over the wire
402 * attributes to a vattr, then simply mark the attribute cache as
403 * timed out.
404 */
405 void
406 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
407 {
408 rnode_t *rp;
409 struct vattr va;
410
411 if (!fattr3_to_vattr(vp, na, &va)) {
412 rp = VTOR(vp);
413 mutex_enter(&rp->r_statelock);
414 if (rp->r_mtime <= t)
415 nfs_attrcache_va(vp, &va);
416 mutex_exit(&rp->r_statelock);
417 } else {
418 PURGE_ATTRCACHE(vp);
419 }
420 }
421
422 /*
423 * Do a cache check based on attributes returned over the wire. The
424 * new attributes are cached.
425 *
426 * If an error occurred trying to convert the over the wire attributes
427 * to a vattr, then just return that error.
428 *
429 * As a side effect, the vattr argument is filled in with the converted
430 * attributes.
431 */
432 int
433 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
434 cred_t *cr)
435 {
436 int error;
437
438 error = nattr_to_vattr(vp, na, vap);
439 if (error)
440 return (error);
441 nfs_attr_cache(vp, vap, t, cr);
442 return (0);
443 }
444
445 /*
446 * Do a cache check based on attributes returned over the wire. The
447 * new attributes are cached.
448 *
449 * If an error occurred trying to convert the over the wire attributes
450 * to a vattr, then just return that error.
451 *
452 * As a side effect, the vattr argument is filled in with the converted
453 * attributes.
454 */
455 int
456 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
457 {
458 int error;
459
460 error = fattr3_to_vattr(vp, na, vap);
461 if (error)
462 return (error);
463 nfs_attr_cache(vp, vap, t, cr);
464 return (0);
465 }
466
467 /*
468 * Use the passed in virtual attributes to check to see whether the
469 * data and metadata caches are valid, cache the new attributes, and
470 * then do the cache invalidation if required.
471 *
472 * The cache validation and caching of the new attributes is done
473 * atomically via the use of the mutex, r_statelock. If required,
474 * the cache invalidation is done atomically w.r.t. the cache
475 * validation and caching of the attributes via the pseudo lock,
476 * r_serial.
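 *
 * In outline: the thread that will purge caches sets rp->r_serial to
 * curthread (while holding r_statelock) before doing the purges, and
 * clears it and broadcasts r_cv when done; any other thread arriving
 * here, or in nfs_waitfor_purge_complete(), waits on r_cv while
 * r_serial is non-NULL.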
477 *
478 * This routine is used to do cache validation and attributes caching
479 * for operations with a single set of post operation attributes.
480 */
481 void
482 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
483 {
484 rnode_t *rp;
485 int mtime_changed = 0;
486 int ctime_changed = 0;
487 vsecattr_t *vsp;
488 int was_serial;
489 len_t preattr_rsize;
490 boolean_t writeattr_set = B_FALSE;
491 boolean_t cachepurge_set = B_FALSE;
492
493 rp = VTOR(vp);
494
495 mutex_enter(&rp->r_statelock);
496
497 if (rp->r_serial != curthread) {
498 klwp_t *lwp = ttolwp(curthread);
499
500 was_serial = 0;
501 if (lwp != NULL)
502 lwp->lwp_nostop++;
503 while (rp->r_serial != NULL) {
504 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
505 mutex_exit(&rp->r_statelock);
506 if (lwp != NULL)
507 lwp->lwp_nostop--;
508 return;
509 }
510 }
511 if (lwp != NULL)
512 lwp->lwp_nostop--;
513 } else
514 was_serial = 1;
515
516 if (rp->r_mtime > t) {
517 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
518 PURGE_ATTRCACHE_LOCKED(rp);
519 mutex_exit(&rp->r_statelock);
520 return;
521 }
522
523 /*
524 * The write thread, after writing data to the file on the remote
525 * server, will always set RWRITEATTR to indicate that the file on the
526 * remote server was modified with a WRITE operation and will have
527 * marked the attribute cache as timed out. If RWRITEATTR
528 * is set, then do not check for mtime and ctime change.
529 */
530 if (!(rp->r_flags & RWRITEATTR)) {
531 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
532 mtime_changed = 1;
533
534 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
535 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
536 ctime_changed = 1;
537 } else {
538 writeattr_set = B_TRUE;
539 }
540
541 preattr_rsize = rp->r_size;
542
543 nfs_attrcache_va(vp, vap);
544
545 /*
546 * If we have updated the file size in nfs_attrcache_va, then as soon
547 * as we drop r_statelock we will be in the midst of purging all of
548 * our caches and updating them. It is possible for another
549 * thread to pick up this new file size and read in zeroed data.
550 * Stall other threads until the cache purge is complete.
551 */
552 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
553 /*
554 * If RWRITEATTR was set and we have updated the file
555 * size, the server's returned file size is not necessarily
556 * due to this client's WRITE. We need to purge
557 * all caches.
558 */
559 if (writeattr_set)
560 mtime_changed = 1;
561
562 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
563 rp->r_flags |= RINCACHEPURGE;
564 cachepurge_set = B_TRUE;
565 }
566 }
567
568 if (!mtime_changed && !ctime_changed) {
569 mutex_exit(&rp->r_statelock);
570 return;
571 }
572
573 rp->r_serial = curthread;
574
575 mutex_exit(&rp->r_statelock);
576
577 if (mtime_changed)
578 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
579
580 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
581 mutex_enter(&rp->r_statelock);
582 rp->r_flags &= ~RINCACHEPURGE;
583 cv_broadcast(&rp->r_cv);
584 mutex_exit(&rp->r_statelock);
585 cachepurge_set = B_FALSE;
586 }
587
588 if (ctime_changed) {
589 (void) nfs_access_purge_rp(rp);
590 if (rp->r_secattr != NULL) {
591 mutex_enter(&rp->r_statelock);
592 vsp = rp->r_secattr;
593 rp->r_secattr = NULL;
594 mutex_exit(&rp->r_statelock);
595 if (vsp != NULL)
596 nfs_acl_free(vsp);
597 }
598 }
599
600 if (!was_serial) {
601 mutex_enter(&rp->r_statelock);
602 rp->r_serial = NULL;
603 cv_broadcast(&rp->r_cv);
604 mutex_exit(&rp->r_statelock);
605 }
606 }
607
608 /*
609 * Use the passed in "before" virtual attributes to check to see
610 * whether the data and metadata caches are valid, cache the "after"
611 * new attributes, and then do the cache invalidation if required.
612 *
613 * The cache validation and caching of the new attributes is done
614 * atomically via the use of the mutex, r_statelock. If required,
615 * the cache invalidation is done atomically w.r.t. the cache
616 * validation and caching of the attributes via the pseudo lock,
617 * r_serial.
618 *
619 * This routine is used to do cache validation and attributes caching
620 * for operations with both pre operation attributes and post operation
621 * attributes.
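 *
 * The flow mirrors nfs_attr_cache() above; the difference is that the
 * cache validity check is done against the pre-operation attributes
 * (bvap), while the post-operation attributes (avap) are what end up
 * cached.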
622 */
623 static void
624 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
625 cred_t *cr)
626 {
627 rnode_t *rp;
628 int mtime_changed = 0;
629 int ctime_changed = 0;
630 vsecattr_t *vsp;
631 int was_serial;
632 len_t preattr_rsize;
633 boolean_t writeattr_set = B_FALSE;
634 boolean_t cachepurge_set = B_FALSE;
635
636 rp = VTOR(vp);
637
638 mutex_enter(&rp->r_statelock);
639
640 if (rp->r_serial != curthread) {
641 klwp_t *lwp = ttolwp(curthread);
642
643 was_serial = 0;
644 if (lwp != NULL)
645 lwp->lwp_nostop++;
646 while (rp->r_serial != NULL) {
647 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
648 mutex_exit(&rp->r_statelock);
649 if (lwp != NULL)
650 lwp->lwp_nostop--;
651 return;
652 }
653 }
654 if (lwp != NULL)
655 lwp->lwp_nostop--;
656 } else
657 was_serial = 1;
658
659 if (rp->r_mtime > t) {
660 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size))
661 PURGE_ATTRCACHE_LOCKED(rp);
662 mutex_exit(&rp->r_statelock);
663 return;
664 }
665
666 /*
667 * The write thread, after writing data to the file on the remote
668 * server, will always set RWRITEATTR to indicate that the file on the
669 * remote server was modified with a WRITE operation and will have
670 * marked the attribute cache as timed out. If RWRITEATTR
671 * is set, then do not check for mtime and ctime change.
672 */
673 if (!(rp->r_flags & RWRITEATTR)) {
674 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
675 mtime_changed = 1;
676
677 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
678 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
679 ctime_changed = 1;
680 } else {
681 writeattr_set = B_TRUE;
682 }
683
684 preattr_rsize = rp->r_size;
685
686 nfs_attrcache_va(vp, avap);
687
688 /*
689 * If we have updated the file size in nfs_attrcache_va, then as soon
690 * as we drop r_statelock we will be in the midst of purging all of
691 * our caches and updating them. It is possible for another
692 * thread to pick up this new file size and read in zeroed data.
693 * Stall other threads until the cache purge is complete.
694 */
695 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
696 /*
697 * If RWRITEATTR was set and we have updated the file
698 * size, the server's returned file size is not necessarily
699 * due to this client's WRITE. We need to purge
700 * all caches.
701 */
702 if (writeattr_set)
703 mtime_changed = 1;
704
705 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
706 rp->r_flags |= RINCACHEPURGE;
707 cachepurge_set = B_TRUE;
708 }
709 }
710
711 if (!mtime_changed && !ctime_changed) {
712 mutex_exit(&rp->r_statelock);
713 return;
714 }
715
716 rp->r_serial = curthread;
717
718 mutex_exit(&rp->r_statelock);
719
720 if (mtime_changed)
721 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
722
723 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
724 mutex_enter(&rp->r_statelock);
725 rp->r_flags &= ~RINCACHEPURGE;
726 cv_broadcast(&rp->r_cv);
727 mutex_exit(&rp->r_statelock);
728 cachepurge_set = B_FALSE;
729 }
730
731 if (ctime_changed) {
732 (void) nfs_access_purge_rp(rp);
733 if (rp->r_secattr != NULL) {
734 mutex_enter(&rp->r_statelock);
735 vsp = rp->r_secattr;
736 rp->r_secattr = NULL;
737 mutex_exit(&rp->r_statelock);
738 if (vsp != NULL)
739 nfs_acl_free(vsp);
740 }
741 }
742
743 if (!was_serial) {
744 mutex_enter(&rp->r_statelock);
745 rp->r_serial = NULL;
746 cv_broadcast(&rp->r_cv);
747 mutex_exit(&rp->r_statelock);
748 }
749 }
750
751 /*
752 * Set attributes cache for given vnode using virtual attributes.
753 *
754 * Set the timeout value on the attribute cache and fill it
755 * with the passed in attributes.
756 *
757 * The caller must be holding r_statelock.
758 */
759 void
760 nfs_attrcache_va(vnode_t *vp, struct vattr *va)
761 {
762 rnode_t *rp;
763 mntinfo_t *mi;
764 hrtime_t delta;
765 hrtime_t now;
766
767 rp = VTOR(vp);
768
769 ASSERT(MUTEX_HELD(&rp->r_statelock));
770
771 now = gethrtime();
772
773 mi = VTOMI(vp);
774
775 /*
776 * Delta is the number of nanoseconds that we will
777 * cache the attributes of the file. It is based on
778 * the number of nanoseconds since the last time that
779 * we detected a change. The assumption is that files
780 * that changed recently are likely to change again.
781 * There are, however, minimum and maximum values for regular files
782 * and for directories (settable per mount) which are enforced.
783 *
784 * Using the time since last change was detected
785 * eliminates direct comparison or calculation
786 * using mixed client and server times. NFS does
787 * not make any assumptions regarding the client
788 * and server clocks being synchronized.
789 */
790 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
791 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
792 va->va_size != rp->r_attr.va_size)
793 rp->r_mtime = now;
794
795 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
796 delta = 0;
797 else {
798 delta = now - rp->r_mtime;
799 if (vp->v_type == VDIR) {
800 if (delta < mi->mi_acdirmin)
801 delta = mi->mi_acdirmin;
802 else if (delta > mi->mi_acdirmax)
803 delta = mi->mi_acdirmax;
804 } else {
805 if (delta < mi->mi_acregmin)
806 delta = mi->mi_acregmin;
807 else if (delta > mi->mi_acregmax)
808 delta = mi->mi_acregmax;
809 }
810 }
811 rp->r_attrtime = now + delta;
812 rp->r_attr = *va;
813 /*
814 * Update the size of the file if there is no cached data or if
815 * the cached data is clean and there is no data being written
816 * out.
817 */
818 if (rp->r_size != va->va_size &&
819 (!vn_has_cached_data(vp) ||
820 (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
821 rp->r_size = va->va_size;
822 nfs_setswaplike(vp, va);
823 rp->r_flags &= ~RWRITEATTR;
824 }
825
826 /*
827 * Fill in attribute from the cache.
828 * If valid, then return 0 to indicate that no error occurred,
829 * otherwise return 1 to indicate that an error occurred.
830 */
831 static int
832 nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
833 {
834 rnode_t *rp;
835 uint_t mask = vap->va_mask;
836
837 rp = VTOR(vp);
838 mutex_enter(&rp->r_statelock);
839 if (ATTRCACHE_VALID(vp)) {
840 /*
841 * Cached attributes are valid
842 */
843 *vap = rp->r_attr;
844 /*
845 * Set the caller's va_mask to the set of attributes
846 * that were requested ANDed with the attributes that
847 * are available. If attributes were requested that
848 * are not available, those bits must be turned off
849 * in the caller's va_mask.
850 */
851 vap->va_mask &= mask;
852 mutex_exit(&rp->r_statelock);
853 return (0);
854 }
855 mutex_exit(&rp->r_statelock);
856 return (1);
857 }
858
859 /*
860 * Get attributes over-the-wire and update attributes cache
861 * if no error occurred in the over-the-wire operation.
862 * Return 0 if successful, otherwise error.
863 */
864 int
865 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
866 {
867 int error;
868 struct nfsattrstat ns;
869 int douprintf;
870 mntinfo_t *mi;
871 failinfo_t fi;
872 hrtime_t t;
873
874 mi = VTOMI(vp);
875 fi.vp = vp;
876 fi.fhp = NULL; /* no need to update, filehandle not copied */
877 fi.copyproc = nfscopyfh;
878 fi.lookupproc = nfslookup;
879 fi.xattrdirproc = acl_getxattrdir2;
880
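/*
 * Try the ACL protocol's GETATTR first.  acl_getattr2_otw() may clear
 * MI_ACL (for example, if the server turns out not to support the
 * NFS_ACL protocol), in which case we fall through to the plain
 * RFS_GETATTR call below.
 */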
881 if (mi->mi_flags & MI_ACL) {
882 error = acl_getattr2_otw(vp, vap, cr);
883 if (mi->mi_flags & MI_ACL)
884 return (error);
885 }
886
887 douprintf = 1;
888
889 t = gethrtime();
890
891 error = rfs2call(mi, RFS_GETATTR,
892 xdr_fhandle, (caddr_t)VTOFH(vp),
893 xdr_attrstat, (caddr_t)&ns, cr,
894 &douprintf, &ns.ns_status, 0, &fi);
895
896 if (!error) {
897 error = geterrno(ns.ns_status);
898 if (!error)
899 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
900 else {
901 PURGE_STALE_FH(error, vp, cr);
902 }
903 }
904
905 return (error);
906 }
907
908 /*
909 * Return either cached or remote attributes. If we get remote attributes,
910 * use them to check and invalidate caches, then cache the new attributes.
911 */
912 int
913 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
914 {
915 int error;
916 rnode_t *rp;
917
918 /*
919 * If we've got cached attributes, we're done, otherwise go
920 * to the server to get attributes, which will update the cache
921 * in the process.
922 */
923 error = nfs_getattr_cache(vp, vap);
924 if (error)
925 error = nfs_getattr_otw(vp, vap, cr);
926
927 /* Return the client's view of file size */
928 rp = VTOR(vp);
929 mutex_enter(&rp->r_statelock);
930 vap->va_size = rp->r_size;
931 mutex_exit(&rp->r_statelock);
932
933 return (error);
934 }
935
936 /*
937 * Get attributes over-the-wire and update attributes cache
938 * if no error occurred in the over-the-wire operation.
939 * Return 0 if successful, otherwise error.
940 */
941 int
942 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
943 {
944 int error;
945 GETATTR3args args;
946 GETATTR3vres res;
947 int douprintf;
948 failinfo_t fi;
949 hrtime_t t;
950
951 args.object = *VTOFH3(vp);
952 fi.vp = vp;
953 fi.fhp = (caddr_t)&args.object;
954 fi.copyproc = nfs3copyfh;
955 fi.lookupproc = nfs3lookup;
956 fi.xattrdirproc = acl_getxattrdir3;
957 res.fres.vp = vp;
958 res.fres.vap = vap;
959
960 douprintf = 1;
961
962 t = gethrtime();
963
964 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
965 xdr_nfs_fh3, (caddr_t)&args,
966 xdr_GETATTR3vres, (caddr_t)&res, cr,
967 &douprintf, &res.status, 0, &fi);
968
969 if (error)
970 return (error);
971
972 error = geterrno3(res.status);
973 if (error) {
974 PURGE_STALE_FH(error, vp, cr);
975 return (error);
976 }
977
978 /*
979 * Catch status codes that indicate fattr3 to vattr translation failure
980 */
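/*
 * (The conversion is performed while decoding the reply, which is why
 * res.fres.vp and res.fres.vap were filled in before the call; any
 * conversion error, e.g. EOVERFLOW or EFBIG, comes back in
 * res.fres.status.)
 */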
981 if (res.fres.status)
982 return (res.fres.status);
983
984 nfs_attr_cache(vp, vap, t, cr);
985 return (0);
986 }
987
988 /*
989 * Return either cached or remote attributes. If we get remote attributes,
990 * use them to check and invalidate caches, then cache the new attributes.
991 */
992 int
993 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
994 {
995 int error;
996 rnode_t *rp;
997
998 /*
999 * If we've got cached attributes, we're done, otherwise go
1000 * to the server to get attributes, which will update the cache
1001 * in the process.
1002 */
1003 error = nfs_getattr_cache(vp, vap);
1004 if (error)
1005 error = nfs3_getattr_otw(vp, vap, cr);
1006
1007 /* Return the client's view of file size */
1008 rp = VTOR(vp);
1009 mutex_enter(&rp->r_statelock);
1010 vap->va_size = rp->r_size;
1011 mutex_exit(&rp->r_statelock);
1012
1013 return (error);
1014 }
1015
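/*
 * Map NFS Version 2 over-the-wire file types to vnode types.
 */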
1016 vtype_t nf_to_vt[] = {
1017 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
1018 };
1019 /*
1020 * Convert NFS Version 2 over the network attributes to the local
1021 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
1022 * network representation and the local representation is done here.
1023 * Returns 0 for success, error if failed due to overflow.
1024 */
1025 int
1026 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
1027 {
1028 /* overflow in time attributes? */
1029 #ifndef _LP64
1030 if (!NFS2_FATTR_TIME_OK(na))
1031 return (EOVERFLOW);
1032 #endif
1033
1034 vap->va_mask = AT_ALL;
1035
1036 if (na->na_type < NFNON || na->na_type > NFSOC)
1037 vap->va_type = VBAD;
1038 else
1039 vap->va_type = nf_to_vt[na->na_type];
1040 vap->va_mode = na->na_mode;
1041 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
1042 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
1043 vap->va_fsid = vp->v_vfsp->vfs_dev;
1044 vap->va_nodeid = na->na_nodeid;
1045 vap->va_nlink = na->na_nlink;
1046 vap->va_size = na->na_size; /* keep for cache validation */
1047 /*
1048 * nfs protocol defines times as unsigned so don't extend sign,
1049 * unless sysadmin set nfs_allow_preepoch_time.
1050 */
1051 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
1052 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
1053 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
1054 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
1055 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
1056 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
1057 /*
1058 * Shannon's law - uncompress the received dev_t
1059 * if the top half of it is zero, indicating a response
1060 * from an `older style' OS. The exception is when a
1061 * `new style' OS sends a major device of zero,
1062 * in which case the algorithm still works because the
1063 * fact that it is a new style server
1064 * is hidden by the minor device not being greater
1065 * than 255 (a requirement in this case).
1066 */
1067 if ((na->na_rdev & 0xffff0000) == 0)
1068 vap->va_rdev = nfsv2_expdev(na->na_rdev);
1069 else
1070 vap->va_rdev = expldev(na->na_rdev);
1071
1072 vap->va_nblocks = na->na_blocks;
1073 switch (na->na_type) {
1074 case NFBLK:
1075 vap->va_blksize = DEV_BSIZE;
1076 break;
1077
1078 case NFCHR:
1079 vap->va_blksize = MAXBSIZE;
1080 break;
1081
1082 case NFSOC:
1083 default:
1084 vap->va_blksize = na->na_blocksize;
1085 break;
1086 }
1087 /*
1088 * This bit of ugliness is a hack to preserve the
1089 * over-the-wire protocols for named-pipe vnodes.
1090 * It remaps the special over-the-wire type to the
1091 * VFIFO type. (see note in nfs.h)
1092 */
1093 if (NA_ISFIFO(na)) {
1094 vap->va_type = VFIFO;
1095 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
1096 vap->va_rdev = 0;
1097 vap->va_blksize = na->na_blocksize;
1098 }
1099 vap->va_seq = 0;
1100 return (0);
1101 }
1102
1103 /*
1104 * Convert NFS Version 3 over the network attributes to the local
1105 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
1106 * network representation and the local representation is done here.
1107 */
1108 vtype_t nf3_to_vt[] = {
1109 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
1110 };
1111
1112 int
1113 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
1114 {
1115
1116 #ifndef _LP64
1117 /* overflow in time attributes? */
1118 if (!NFS3_FATTR_TIME_OK(na))
1119 return (EOVERFLOW);
1120 #endif
1121 if (!NFS3_SIZE_OK(na->size))
1122 /* file too big */
1123 return (EFBIG);
1124
1125 vap->va_mask = AT_ALL;
1126
1127 if (na->type < NF3REG || na->type > NF3FIFO)
1128 vap->va_type = VBAD;
1129 else
1130 vap->va_type = nf3_to_vt[na->type];
1131 vap->va_mode = na->mode;
1132 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
1133 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
1134 vap->va_fsid = vp->v_vfsp->vfs_dev;
1135 vap->va_nodeid = na->fileid;
1136 vap->va_nlink = na->nlink;
1137 vap->va_size = na->size;
1138
1139 /*
1140 * nfs protocol defines times as unsigned so don't extend sign,
1141 * unless sysadmin set nfs_allow_preepoch_time.
1142 */
1143 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
1144 vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
1145 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
1146 vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
1147 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
1148 vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
1149
1150 switch (na->type) {
1151 case NF3BLK:
1152 vap->va_rdev = makedevice(na->rdev.specdata1,
1153 na->rdev.specdata2);
1154 vap->va_blksize = DEV_BSIZE;
1155 vap->va_nblocks = 0;
1156 break;
1157 case NF3CHR:
1158 vap->va_rdev = makedevice(na->rdev.specdata1,
1159 na->rdev.specdata2);
1160 vap->va_blksize = MAXBSIZE;
1161 vap->va_nblocks = 0;
1162 break;
1163 case NF3REG:
1164 case NF3DIR:
1165 case NF3LNK:
1166 vap->va_rdev = 0;
1167 vap->va_blksize = MAXBSIZE;
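/* convert bytes used on the server into DEV_BSIZE blocks, rounding up */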
1168 vap->va_nblocks = (u_longlong_t)
1169 ((na->used + (size3)DEV_BSIZE - (size3)1) /
1170 (size3)DEV_BSIZE);
1171 break;
1172 case NF3SOCK:
1173 case NF3FIFO:
1174 default:
1175 vap->va_rdev = 0;
1176 vap->va_blksize = MAXBSIZE;
1177 vap->va_nblocks = 0;
1178 break;
1179 }
1180 vap->va_seq = 0;
1181 return (0);
1182 }
1183
1184 /*
1185 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1186 * for the demand-based allocation of async threads per-mount. The
1187 * nfs_async_timeout is the amount of time a thread will live after it
1188 * becomes idle, unless new I/O requests are received before the thread
1189 * dies. See nfs_async_putpage and nfs_async_start.
1190 */
1191
1192 int nfs_async_timeout = -1; /* uninitialized */
1193
1194 static void nfs_async_start(struct vfs *);
1195 static void nfs_async_pgops_start(struct vfs *);
1196 static void nfs_async_common_start(struct vfs *, int);
1197
1198 static void
1199 free_async_args(struct nfs_async_reqs *args)
1200 {
1201 rnode_t *rp;
1202
1203 if (args->a_io != NFS_INACTIVE) {
1204 rp = VTOR(args->a_vp);
1205 mutex_enter(&rp->r_statelock);
1206 rp->r_count--;
1207 if (args->a_io == NFS_PUTAPAGE ||
1208 args->a_io == NFS_PAGEIO)
1209 rp->r_awcount--;
1210 cv_broadcast(&rp->r_cv);
1211 mutex_exit(&rp->r_statelock);
1212 VN_RELE(args->a_vp);
1213 }
1214 crfree(args->a_cred);
1215 kmem_free(args, sizeof (*args));
1216 }
1217
1218 /*
1219 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1220 * pageout(), running in the global zone, have legitimate reasons to do
1221 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1222 * use of a per-mount "asynchronous requests manager thread" which is
1223 * signaled by the various asynchronous work routines when there is
1224 * asynchronous work to be done. It is responsible for creating new
1225 * worker threads if necessary, and notifying existing worker threads
1226 * that there is work to be done.
1227 *
1228 * In other words, it will "take the specifications from the customers and
1229 * give them to the engineers."
1230 *
1231 * Worker threads die off of their own accord if they are no longer
1232 * needed.
1233 *
1234 * This thread is killed when the zone is going away or the filesystem
1235 * is being unmounted.
1236 */
1237 void
1238 nfs_async_manager(vfs_t *vfsp)
1239 {
1240 callb_cpr_t cprinfo;
1241 mntinfo_t *mi;
1242 uint_t max_threads;
1243
1244 mi = VFTOMI(vfsp);
1245
1246 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1247 "nfs_async_manager");
1248
1249 mutex_enter(&mi->mi_async_lock);
1250 /*
1251 * We want to stash the max number of threads that this mount was
1252 * allowed so we can use it later when the variable is set to zero as
1253 * part of the zone/mount going away.
1254 *
1255 * We want to be able to create at least one thread to handle
1256 * asynchronous inactive calls.
1257 */
1258 max_threads = MAX(mi->mi_max_threads, 1);
1259 /*
1260 * We don't want to wait for mi_max_threads to go to zero, since that
1261 * happens as part of a failed unmount, but this thread should only
1262 * exit when the mount/zone is really going away.
1263 *
1264 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
1265 * attempted: the various _async_*() functions know to do things
1266 * inline if mi_max_threads == 0. Henceforth we just drain out the
1267 * outstanding requests.
1268 *
1269 * Note that we still create zthreads even if we notice the zone is
1270 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
1271 * shutdown sequence to take slightly longer in some cases, but
1272 * doesn't violate the protocol, as all threads will exit as soon as
1273 * they're done processing the remaining requests.
1274 */
1275 for (;;) {
1276 while (mi->mi_async_req_count > 0) {
1277 /*
1278 * Paranoia: If the mount started out having
1279 * (mi->mi_max_threads == 0), and the value was
1280 * later changed (via a debugger or somesuch),
1281 * we could be confused since we will think we
1282 * can't create any threads, and the calling
1283 * code (which looks at the current value of
1284 * mi->mi_max_threads, now non-zero) thinks we
1285 * can.
1286 *
1287 * So, because we're paranoid, we create threads
1288 * up to the maximum of the original and the
1289 * current value. This means that future
1290 * (debugger-induced) lowerings of
1291 * mi->mi_max_threads are ignored for our
1292 * purposes, but who told them they could change
1293 * random values on a live kernel anyhow?
1294 */
1295 if (mi->mi_threads[NFS_ASYNC_QUEUE] <
1296 MAX(mi->mi_max_threads, max_threads)) {
1297 mi->mi_threads[NFS_ASYNC_QUEUE]++;
1298 mutex_exit(&mi->mi_async_lock);
1299 VFS_HOLD(vfsp); /* hold for new thread */
1300 (void) zthread_create(NULL, 0, nfs_async_start,
1301 vfsp, 0, minclsyspri);
1302 mutex_enter(&mi->mi_async_lock);
1303 } else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] <
1304 NUM_ASYNC_PGOPS_THREADS) {
1305 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++;
1306 mutex_exit(&mi->mi_async_lock);
1307 VFS_HOLD(vfsp); /* hold for new thread */
1308 (void) zthread_create(NULL, 0,
1309 nfs_async_pgops_start, vfsp, 0,
1310 minclsyspri);
1311 mutex_enter(&mi->mi_async_lock);
1312 }
1313 NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1314 ASSERT(mi->mi_async_req_count != 0);
1315 mi->mi_async_req_count--;
1316 }
1317
1318 mutex_enter(&mi->mi_lock);
1319 if (mi->mi_flags & MI_ASYNC_MGR_STOP) {
1320 mutex_exit(&mi->mi_lock);
1321 break;
1322 }
1323 mutex_exit(&mi->mi_lock);
1324
1325 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1326 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1327 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1328 }
1329 /*
1330 * Let everyone know we're done.
1331 */
1332 mi->mi_manager_thread = NULL;
1333 cv_broadcast(&mi->mi_async_cv);
1334
1335 /*
1336 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1337 * since CALLB_CPR_EXIT is actually responsible for releasing
1338 * 'mi_async_lock'.
1339 */
1340 CALLB_CPR_EXIT(&cprinfo);
1341 VFS_RELE(vfsp); /* release thread's hold */
1342 zthread_exit();
1343 }
1344
1345 /*
1346 * Signal (and wait for) the async manager thread to clean up and go away.
1347 */
1348 void
1349 nfs_async_manager_stop(vfs_t *vfsp)
1350 {
1351 mntinfo_t *mi = VFTOMI(vfsp);
1352
1353 mutex_enter(&mi->mi_async_lock);
1354 mutex_enter(&mi->mi_lock);
1355 mi->mi_flags |= MI_ASYNC_MGR_STOP;
1356 mutex_exit(&mi->mi_lock);
1357 cv_broadcast(&mi->mi_async_reqs_cv);
1358 while (mi->mi_manager_thread != NULL)
1359 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1360 mutex_exit(&mi->mi_async_lock);
1361 }
1362
1363 int
1364 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1365 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1366 u_offset_t, caddr_t, struct seg *, cred_t *))
1367 {
1368 rnode_t *rp;
1369 mntinfo_t *mi;
1370 struct nfs_async_reqs *args;
1371
1372 rp = VTOR(vp);
1373 ASSERT(rp->r_freef == NULL);
1374
1375 mi = VTOMI(vp);
1376
1377 /*
1378 * If addr falls in a different segment, don't bother doing readahead.
1379 */
1380 if (addr >= seg->s_base + seg->s_size)
1381 return (-1);
1382
1383 /*
1384 * If we can't allocate a request structure, punt on the readahead.
1385 */
1386 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1387 return (-1);
1388
1389 /*
1390 * If a lock operation is pending, don't initiate any new
1391 * readaheads. Otherwise, bump r_count to indicate the new
1392 * asynchronous I/O.
1393 */
1394 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1395 kmem_free(args, sizeof (*args));
1396 return (-1);
1397 }
1398 mutex_enter(&rp->r_statelock);
1399 rp->r_count++;
1400 mutex_exit(&rp->r_statelock);
1401 nfs_rw_exit(&rp->r_lkserlock);
1402
1403 args->a_next = NULL;
1404 #ifdef DEBUG
1405 args->a_queuer = curthread;
1406 #endif
1407 VN_HOLD(vp);
1408 args->a_vp = vp;
1409 ASSERT(cr != NULL);
1410 crhold(cr);
1411 args->a_cred = cr;
1412 args->a_io = NFS_READ_AHEAD;
1413 args->a_nfs_readahead = readahead;
1414 args->a_nfs_blkoff = blkoff;
1415 args->a_nfs_seg = seg;
1416 args->a_nfs_addr = addr;
1417
1418 mutex_enter(&mi->mi_async_lock);
1419
1420 /*
1421 * If asyncio has been disabled, don't bother readahead.
1422 */
1423 if (mi->mi_max_threads == 0) {
1424 mutex_exit(&mi->mi_async_lock);
1425 goto noasync;
1426 }
1427
1428 /*
1429 * Link request structure into the async list and
1430 * wakeup async thread to do the i/o.
1431 */
1432 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
1433 mi->mi_async_reqs[NFS_READ_AHEAD] = args;
1434 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1435 } else {
1436 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
1437 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1438 }
1439
1440 if (mi->mi_io_kstats) {
1441 mutex_enter(&mi->mi_lock);
1442 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1443 mutex_exit(&mi->mi_lock);
1444 }
1445
1446 mi->mi_async_req_count++;
1447 ASSERT(mi->mi_async_req_count != 0);
1448 cv_signal(&mi->mi_async_reqs_cv);
1449 mutex_exit(&mi->mi_async_lock);
1450 return (0);
1451
1452 noasync:
1453 mutex_enter(&rp->r_statelock);
1454 rp->r_count--;
1455 cv_broadcast(&rp->r_cv);
1456 mutex_exit(&rp->r_statelock);
1457 VN_RELE(vp);
1458 crfree(cr);
1459 kmem_free(args, sizeof (*args));
1460 return (-1);
1461 }
1462
1463 int
1464 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1465 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1466 u_offset_t, size_t, int, cred_t *))
1467 {
1468 rnode_t *rp;
1469 mntinfo_t *mi;
1470 struct nfs_async_reqs *args;
1471
1472 ASSERT(flags & B_ASYNC);
1473 ASSERT(vp->v_vfsp != NULL);
1474
1475 rp = VTOR(vp);
1476 ASSERT(rp->r_count > 0);
1477
1478 mi = VTOMI(vp);
1479
1480 /*
1481 * If we can't allocate a request structure, do the putpage
1482 * operation synchronously in this thread's context.
1483 */
1484 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1485 goto noasync;
1486
1487 args->a_next = NULL;
1488 #ifdef DEBUG
1489 args->a_queuer = curthread;
1490 #endif
1491 VN_HOLD(vp);
1492 args->a_vp = vp;
1493 ASSERT(cr != NULL);
1494 crhold(cr);
1495 args->a_cred = cr;
1496 args->a_io = NFS_PUTAPAGE;
1497 args->a_nfs_putapage = putapage;
1498 args->a_nfs_pp = pp;
1499 args->a_nfs_off = off;
1500 args->a_nfs_len = (uint_t)len;
1501 args->a_nfs_flags = flags;
1502
1503 mutex_enter(&mi->mi_async_lock);
1504
1505 /*
1506 * If asyncio has been disabled, then make a synchronous request.
1507 * This check is done a second time in case async I/O was disabled
1508 * while this thread was blocked waiting for memory pressure to
1509 * reduce or for the queue to drain.
1510 */
1511 if (mi->mi_max_threads == 0) {
1512 mutex_exit(&mi->mi_async_lock);
1513 goto noasync;
1514 }
1515
1516 /*
1517 * Link request structure into the async list and
1518 * wakeup async thread to do the i/o.
1519 */
1520 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
1521 mi->mi_async_reqs[NFS_PUTAPAGE] = args;
1522 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1523 } else {
1524 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
1525 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1526 }
1527
1528 mutex_enter(&rp->r_statelock);
1529 rp->r_count++;
1530 rp->r_awcount++;
1531 mutex_exit(&rp->r_statelock);
1532
1533 if (mi->mi_io_kstats) {
1534 mutex_enter(&mi->mi_lock);
1535 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1536 mutex_exit(&mi->mi_lock);
1537 }
1538
1539 mi->mi_async_req_count++;
1540 ASSERT(mi->mi_async_req_count != 0);
1541 cv_signal(&mi->mi_async_reqs_cv);
1542 mutex_exit(&mi->mi_async_lock);
1543 return (0);
1544
1545 noasync:
1546 if (args != NULL) {
1547 VN_RELE(vp);
1548 crfree(cr);
1549 kmem_free(args, sizeof (*args));
1550 }
1551
1552 if (curproc == proc_pageout || curproc == proc_fsflush) {
1553 /*
1554 * If we get here in the context of the pageout/fsflush,
1555 * we refuse to do a sync write, because this may hang
1556 * pageout (and the machine). In this case, we just
1557 * re-mark the page as dirty and punt on the page.
1558 *
1559 * Make sure B_FORCE isn't set. We can re-mark the
1560 * pages as dirty and unlock the pages in one swoop by
1561 * passing in B_ERROR to pvn_write_done(). However,
1562 * we should make sure B_FORCE isn't set - we don't
1563 * want the page tossed before it gets written out.
1564 */
1565 if (flags & B_FORCE)
1566 flags &= ~(B_INVAL | B_FORCE);
1567 pvn_write_done(pp, flags | B_ERROR);
1568 return (0);
1569 }
1570 if (nfs_zone() != mi->mi_zone) {
1571 /*
1572 * So this was a cross-zone sync putpage. We pass in B_ERROR
1573 * to pvn_write_done() to re-mark the pages as dirty and unlock
1574 * them.
1575 *
1576 * We don't want to clear B_FORCE here as the caller presumably
1577 * knows what they're doing if they set it.
1578 */
1579 pvn_write_done(pp, flags | B_ERROR);
1580 return (EPERM);
1581 }
1582 return ((*putapage)(vp, pp, off, len, flags, cr));
1583 }
1584
1585 int
1586 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1587 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1588 size_t, int, cred_t *))
1589 {
1590 rnode_t *rp;
1591 mntinfo_t *mi;
1592 struct nfs_async_reqs *args;
1593
1594 ASSERT(flags & B_ASYNC);
1595 ASSERT(vp->v_vfsp != NULL);
1596
1597 rp = VTOR(vp);
1598 ASSERT(rp->r_count > 0);
1599
1600 mi = VTOMI(vp);
1601
1602 /*
1603 * If we can't allocate a request structure, do the pageio
1604 * request synchronously in this thread's context.
1605 */
1606 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1607 goto noasync;
1608
1609 args->a_next = NULL;
1610 #ifdef DEBUG
1611 args->a_queuer = curthread;
1612 #endif
1613 VN_HOLD(vp);
1614 args->a_vp = vp;
1615 ASSERT(cr != NULL);
1616 crhold(cr);
1617 args->a_cred = cr;
1618 args->a_io = NFS_PAGEIO;
1619 args->a_nfs_pageio = pageio;
1620 args->a_nfs_pp = pp;
1621 args->a_nfs_off = io_off;
1622 args->a_nfs_len = (uint_t)io_len;
1623 args->a_nfs_flags = flags;
1624
1625 mutex_enter(&mi->mi_async_lock);
1626
1627 /*
1628 * If asyncio has been disabled, then make a synchronous request.
1629 * This check is done a second time in case async I/O was disabled
1630 * while this thread was blocked waiting for memory pressure to
1631 * reduce or for the queue to drain.
1632 */
1633 if (mi->mi_max_threads == 0) {
1634 mutex_exit(&mi->mi_async_lock);
1635 goto noasync;
1636 }
1637
1638 /*
1639 * Link request structure into the async list and
1640 * wakeup async thread to do the i/o.
1641 */
1642 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1643 mi->mi_async_reqs[NFS_PAGEIO] = args;
1644 mi->mi_async_tail[NFS_PAGEIO] = args;
1645 } else {
1646 mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1647 mi->mi_async_tail[NFS_PAGEIO] = args;
1648 }
1649
1650 mutex_enter(&rp->r_statelock);
1651 rp->r_count++;
1652 rp->r_awcount++;
1653 mutex_exit(&rp->r_statelock);
1654
1655 if (mi->mi_io_kstats) {
1656 mutex_enter(&mi->mi_lock);
1657 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1658 mutex_exit(&mi->mi_lock);
1659 }
1660
1661 mi->mi_async_req_count++;
1662 ASSERT(mi->mi_async_req_count != 0);
1663 cv_signal(&mi->mi_async_reqs_cv);
1664 mutex_exit(&mi->mi_async_lock);
1665 return (0);
1666
1667 noasync:
1668 if (args != NULL) {
1669 VN_RELE(vp);
1670 crfree(cr);
1671 kmem_free(args, sizeof (*args));
1672 }
1673
1674 /*
1675 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1676 * the page list), for writes we do it synchronously, except for
1677 * proc_pageout/proc_fsflush as described below.
1678 */
1679 if (flags & B_READ) {
1680 pvn_read_done(pp, flags | B_ERROR);
1681 return (0);
1682 }
1683
1684 if (curproc == proc_pageout || curproc == proc_fsflush) {
1685 /*
1686 * If we get here in the context of the pageout/fsflush,
1687 * we refuse to do a sync write, because this may hang
1688 * pageout/fsflush (and the machine). In this case, we just
1689 * re-mark the page as dirty and punt on the page.
1690 *
1691 * Make sure B_FORCE isn't set. We can re-mark the
1692 * pages as dirty and unlock the pages in one swoop by
1693 * passing in B_ERROR to pvn_write_done(). However,
1694 * we should make sure B_FORCE isn't set - we don't
1695 * want the page tossed before it gets written out.
1696 */
1697 if (flags & B_FORCE)
1698 flags &= ~(B_INVAL | B_FORCE);
1699 pvn_write_done(pp, flags | B_ERROR);
1700 return (0);
1701 }
1702
1703 if (nfs_zone() != mi->mi_zone) {
1704 /*
1705 * So this was a cross-zone sync pageio. We pass in B_ERROR
1706 * to pvn_write_done() to re-mark the pages as dirty and unlock
1707 * them.
1708 *
1709 * We don't want to clear B_FORCE here as the caller presumably
1710 * knows what they're doing if they set it.
1711 */
1712 pvn_write_done(pp, flags | B_ERROR);
1713 return (EPERM);
1714 }
1715 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1716 }
1717
1718 void
1719 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1720 int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1721 {
1722 rnode_t *rp;
1723 mntinfo_t *mi;
1724 struct nfs_async_reqs *args;
1725
1726 rp = VTOR(vp);
1727 ASSERT(rp->r_freef == NULL);
1728
1729 mi = VTOMI(vp);
1730
1731 /*
1732 * If we can't allocate a request structure, do the readdir
1733 * operation synchronously in this thread's context.
1734 */
1735 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1736 goto noasync;
1737
1738 args->a_next = NULL;
1739 #ifdef DEBUG
1740 args->a_queuer = curthread;
1741 #endif
1742 VN_HOLD(vp);
1743 args->a_vp = vp;
1744 ASSERT(cr != NULL);
1745 crhold(cr);
1746 args->a_cred = cr;
1747 args->a_io = NFS_READDIR;
1748 args->a_nfs_readdir = readdir;
1749 args->a_nfs_rdc = rdc;
1750
1751 mutex_enter(&mi->mi_async_lock);
1752
1753 /*
1754 * If asyncio has been disabled, then make a synchronous request.
1755 */
1756 if (mi->mi_max_threads == 0) {
1757 mutex_exit(&mi->mi_async_lock);
1758 goto noasync;
1759 }
1760
1761 /*
1762 * Link request structure into the async list and
1763 * wakeup async thread to do the i/o.
1764 */
1765 if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1766 mi->mi_async_reqs[NFS_READDIR] = args;
1767 mi->mi_async_tail[NFS_READDIR] = args;
1768 } else {
1769 mi->mi_async_tail[NFS_READDIR]->a_next = args;
1770 mi->mi_async_tail[NFS_READDIR] = args;
1771 }
1772
1773 mutex_enter(&rp->r_statelock);
1774 rp->r_count++;
1775 mutex_exit(&rp->r_statelock);
1776
1777 if (mi->mi_io_kstats) {
1778 mutex_enter(&mi->mi_lock);
1779 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1780 mutex_exit(&mi->mi_lock);
1781 }
1782
1783 mi->mi_async_req_count++;
1784 ASSERT(mi->mi_async_req_count != 0);
1785 cv_signal(&mi->mi_async_reqs_cv);
1786 mutex_exit(&mi->mi_async_lock);
1787 return;
1788
1789 noasync:
1790 if (args != NULL) {
1791 VN_RELE(vp);
1792 crfree(cr);
1793 kmem_free(args, sizeof (*args));
1794 }
1795
1796 rdc->entries = NULL;
1797 mutex_enter(&rp->r_statelock);
1798 ASSERT(rdc->flags & RDDIR);
1799 rdc->flags &= ~RDDIR;
1800 rdc->flags |= RDDIRREQ;
1801 /*
1802 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
1803 * is set, wakeup the thread sleeping in cv_wait_sig().
1804 * The woken up thread will reset the flag to RDDIR and will
1805 * continue with the readdir operation.
1806 */
1807 if (rdc->flags & RDDIRWAIT) {
1808 rdc->flags &= ~RDDIRWAIT;
1809 cv_broadcast(&rdc->cv);
1810 }
1811 mutex_exit(&rp->r_statelock);
1812 rddir_cache_rele(rdc);
1813 }
1814
1815 void
1816 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1817 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, cred_t *))
1818 {
1819 rnode_t *rp;
1820 mntinfo_t *mi;
1821 struct nfs_async_reqs *args;
1822 page_t *pp;
1823
1824 rp = VTOR(vp);
1825 mi = VTOMI(vp);
1826
1827 /*
1828 * If we can't allocate a request structure, do the commit
1829 * operation synchronously in this thread's context.
1830 */
1831 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1832 goto noasync;
1833
1834 args->a_next = NULL;
1835 #ifdef DEBUG
1836 args->a_queuer = curthread;
1837 #endif
1838 VN_HOLD(vp);
1839 args->a_vp = vp;
1840 ASSERT(cr != NULL);
1841 crhold(cr);
1842 args->a_cred = cr;
1843 args->a_io = NFS_COMMIT;
1844 args->a_nfs_commit = commit;
1845 args->a_nfs_plist = plist;
1846 args->a_nfs_offset = offset;
1847 args->a_nfs_count = count;
1848
1849 mutex_enter(&mi->mi_async_lock);
1850
1851 /*
1852 * If asyncio has been disabled, then make a synchronous request.
1853 * This check is done a second time in case async I/O was disabled
1854 * while this thread was blocked waiting for memory pressure to
1855 * reduce or for the queue to drain.
1856 */
1857 if (mi->mi_max_threads == 0) {
1858 mutex_exit(&mi->mi_async_lock);
1859 goto noasync;
1860 }
1861
1862 /*
1863 * Link request structure into the async list and
1864 * wakeup async thread to do the i/o.
1865 */
1866 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1867 mi->mi_async_reqs[NFS_COMMIT] = args;
1868 mi->mi_async_tail[NFS_COMMIT] = args;
1869 } else {
1870 mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1871 mi->mi_async_tail[NFS_COMMIT] = args;
1872 }
1873
1874 mutex_enter(&rp->r_statelock);
1875 rp->r_count++;
1876 mutex_exit(&rp->r_statelock);
1877
1878 if (mi->mi_io_kstats) {
1879 mutex_enter(&mi->mi_lock);
1880 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1881 mutex_exit(&mi->mi_lock);
1882 }
1883
1884 mi->mi_async_req_count++;
1885 ASSERT(mi->mi_async_req_count != 0);
1886 cv_signal(&mi->mi_async_reqs_cv);
1887 mutex_exit(&mi->mi_async_lock);
1888 return;
1889
1890 noasync:
1891 if (args != NULL) {
1892 VN_RELE(vp);
1893 crfree(cr);
1894 kmem_free(args, sizeof (*args));
1895 }
1896
1897 if (curproc == proc_pageout || curproc == proc_fsflush ||
1898 nfs_zone() != mi->mi_zone) {
1899 while (plist != NULL) {
1900 pp = plist;
1901 page_sub(&plist, pp);
1902 pp->p_fsdata = C_COMMIT;
1903 page_unlock(pp);
1904 }
1905 return;
1906 }
1907 (*commit)(vp, plist, offset, count, cr);
1908 }
1909
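/*
 * Queue an asynchronous request to perform the final inactive processing
 * on the vnode.  No mi_max_threads check is made here because the vnode
 * must be released even when async i/o is disabled; if the async manager
 * thread is already gone, clean up the unlinked-file state locally and
 * free the rnode directly.
 */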
1910 void
1911 nfs_async_inactive(vnode_t *vp, cred_t *cr,
1912 void (*inactive)(vnode_t *, cred_t *, caller_context_t *))
1913 {
1914 mntinfo_t *mi;
1915 struct nfs_async_reqs *args;
1916
1917 mi = VTOMI(vp);
1918
1919 args = kmem_alloc(sizeof (*args), KM_SLEEP);
1920 args->a_next = NULL;
1921 #ifdef DEBUG
1922 args->a_queuer = curthread;
1923 #endif
1924 args->a_vp = vp;
1925 ASSERT(cr != NULL);
1926 crhold(cr);
1927 args->a_cred = cr;
1928 args->a_io = NFS_INACTIVE;
1929 args->a_nfs_inactive = inactive;
1930
1931 /*
1932 * Note that we don't check mi->mi_max_threads here, since we
1933 * *need* to get rid of this vnode regardless of whether someone
1934 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1935 *
1936 * The manager thread knows about this and is willing to create
1937 * at least one thread to accommodate us.
1938 */
1939 mutex_enter(&mi->mi_async_lock);
1940 if (mi->mi_manager_thread == NULL) {
1941 rnode_t *rp = VTOR(vp);
1942
1943 mutex_exit(&mi->mi_async_lock);
1944 crfree(cr); /* drop our reference */
1945 kmem_free(args, sizeof (*args));
1946 /*
1947 * We can't do an over-the-wire call since we're in the wrong
1948 * zone, so we need to clean up state as best we can and then
1949 * throw away the vnode.
1950 */
1951 mutex_enter(&rp->r_statelock);
1952 if (rp->r_unldvp != NULL) {
1953 vnode_t *unldvp;
1954 char *unlname;
1955 cred_t *unlcred;
1956
1957 unldvp = rp->r_unldvp;
1958 rp->r_unldvp = NULL;
1959 unlname = rp->r_unlname;
1960 rp->r_unlname = NULL;
1961 unlcred = rp->r_unlcred;
1962 rp->r_unlcred = NULL;
1963 mutex_exit(&rp->r_statelock);
1964
1965 VN_RELE(unldvp);
1966 kmem_free(unlname, MAXNAMELEN);
1967 crfree(unlcred);
1968 } else {
1969 mutex_exit(&rp->r_statelock);
1970 }
1971 /*
1972 * No need to explicitly throw away any cached pages. The
1973 * eventual rinactive() will attempt a synchronous
1974 * VOP_PUTPAGE() which will immediately fail since the request
1975 * is coming from the wrong zone, and then will proceed to call
1976 * nfs_invalidate_pages() which will clean things up for us.
1977 */
1978 rp_addfree(VTOR(vp), cr);
1979 return;
1980 }
1981
1982 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1983 mi->mi_async_reqs[NFS_INACTIVE] = args;
1984 } else {
1985 mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1986 }
1987 mi->mi_async_tail[NFS_INACTIVE] = args;
1988 /*
1989 * Don't increment r_count, since we're trying to get rid of the vnode.
1990 */
1991
1992 mi->mi_async_req_count++;
1993 ASSERT(mi->mi_async_req_count != 0);
1994 cv_signal(&mi->mi_async_reqs_cv);
1995 mutex_exit(&mi->mi_async_lock);
1996 }
1997
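/*
 * Worker thread entry points: one services the general async i/o queue,
 * the other the pageops queue.  Both run the common service loop below.
 */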
1998 static void
1999 nfs_async_start(struct vfs *vfsp)
2000 {
2001 nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE);
2002 }
2003
2004 static void
2005 nfs_async_pgops_start(struct vfs *vfsp)
2006 {
2007 nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE);
2008 }
2009
2010 /*
2011 * The async queues for each mounted file system are arranged as a
2012 * set of queues, one for each async i/o type. Requests are taken
2013 * from the queues in a round-robin fashion. A number of consecutive
2014 * requests are taken from each queue before moving on to the next
2015 * queue. This functionality may allow the NFS Version 2 server to do
2016  * write clustering, even if the client is mixing writes and reads,
2017 * because it will take multiple write requests from the queue
2018 * before processing any of the other async i/o types.
2019 *
2020  * XXX The nfs_async_common_start thread is unsafe in the light of the present
2021  * 	model defined by cpr to suspend the system. Specifically,
2022  * 	over-the-wire calls are cpr-unsafe. The thread should be reevaluated
2023  * 	in case of future updates to the cpr model.
2024 */
2025 static void
2026 nfs_async_common_start(struct vfs *vfsp, int async_queue)
2027 {
2028 struct nfs_async_reqs *args;
2029 mntinfo_t *mi = VFTOMI(vfsp);
2030 clock_t time_left = 1;
2031 callb_cpr_t cprinfo;
2032 int i;
2033 int async_types;
2034 kcondvar_t *async_work_cv;
2035
2036 if (async_queue == NFS_ASYNC_QUEUE) {
2037 async_types = NFS_ASYNC_TYPES;
2038 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE];
2039 } else {
2040 async_types = NFS_ASYNC_PGOPS_TYPES;
2041 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE];
2042 }
2043
2044 /*
2045 * Dynamic initialization of nfs_async_timeout to allow nfs to be
2046 * built in an implementation independent manner.
2047 */
2048 if (nfs_async_timeout == -1)
2049 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
2050
2051 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
2052
2053 mutex_enter(&mi->mi_async_lock);
2054 for (;;) {
2055 /*
2056 * Find the next queue containing an entry. We start
2057 * at the current queue pointer and then round robin
2058 * through all of them until we either find a non-empty
2059 * queue or have looked through all of them.
2060 */
2061 for (i = 0; i < async_types; i++) {
2062 args = *mi->mi_async_curr[async_queue];
2063 if (args != NULL)
2064 break;
2065 mi->mi_async_curr[async_queue]++;
2066 if (mi->mi_async_curr[async_queue] ==
2067 &mi->mi_async_reqs[async_types]) {
2068 mi->mi_async_curr[async_queue] =
2069 &mi->mi_async_reqs[0];
2070 }
2071 }
2072 /*
2073 		 * If we didn't find an entry, then block until woken up
2074 * again and then look through the queues again.
2075 */
2076 if (args == NULL) {
2077 /*
2078 * Exiting is considered to be safe for CPR as well
2079 */
2080 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2081
2082 /*
2083 * Wakeup thread waiting to unmount the file
2084 * system only if all async threads are inactive.
2085 *
2086 * If we've timed-out and there's nothing to do,
2087 * then get rid of this thread.
2088 */
2089 if (mi->mi_max_threads == 0 || time_left <= 0) {
2090 --mi->mi_threads[async_queue];
2091
2092 if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
2093 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0)
2094 cv_signal(&mi->mi_async_cv);
2095 CALLB_CPR_EXIT(&cprinfo);
2096 VFS_RELE(vfsp); /* release thread's hold */
2097 zthread_exit();
2098 /* NOTREACHED */
2099 }
2100 time_left = cv_reltimedwait(async_work_cv,
2101 &mi->mi_async_lock, nfs_async_timeout,
2102 TR_CLOCK_TICK);
2103
2104 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
2105
2106 continue;
2107 }
2108 time_left = 1;
2109
2110 /*
2111 * Remove the request from the async queue and then
2112 * update the current async request queue pointer. If
2113 * the current queue is empty or we have removed enough
2114 * consecutive entries from it, then reset the counter
2115 * for this queue and then move the current pointer to
2116 * the next queue.
2117 */
2118 *mi->mi_async_curr[async_queue] = args->a_next;
2119 if (*mi->mi_async_curr[async_queue] == NULL ||
2120 --mi->mi_async_clusters[args->a_io] == 0) {
2121 mi->mi_async_clusters[args->a_io] =
2122 mi->mi_async_init_clusters;
2123 mi->mi_async_curr[async_queue]++;
2124 if (mi->mi_async_curr[async_queue] ==
2125 &mi->mi_async_reqs[async_types]) {
2126 mi->mi_async_curr[async_queue] =
2127 &mi->mi_async_reqs[0];
2128 }
2129 }
2130
2131 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2132 mutex_enter(&mi->mi_lock);
2133 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2134 mutex_exit(&mi->mi_lock);
2135 }
2136
2137 mutex_exit(&mi->mi_async_lock);
2138
2139 /*
2140 		 * Obtain the arguments from the async request and dispatch it by i/o type.
2141 */
2142 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2143 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2144 args->a_nfs_addr, args->a_nfs_seg,
2145 args->a_cred);
2146 } else if (args->a_io == NFS_PUTAPAGE) {
2147 (void) (*args->a_nfs_putapage)(args->a_vp,
2148 args->a_nfs_pp, args->a_nfs_off,
2149 args->a_nfs_len, args->a_nfs_flags,
2150 args->a_cred);
2151 } else if (args->a_io == NFS_PAGEIO) {
2152 (void) (*args->a_nfs_pageio)(args->a_vp,
2153 args->a_nfs_pp, args->a_nfs_off,
2154 args->a_nfs_len, args->a_nfs_flags,
2155 args->a_cred);
2156 } else if (args->a_io == NFS_READDIR) {
2157 (void) ((*args->a_nfs_readdir)(args->a_vp,
2158 args->a_nfs_rdc, args->a_cred));
2159 } else if (args->a_io == NFS_COMMIT) {
2160 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2161 args->a_nfs_offset, args->a_nfs_count,
2162 args->a_cred);
2163 } else if (args->a_io == NFS_INACTIVE) {
2164 (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL);
2165 }
2166
2167 /*
2168 * Now, release the vnode and free the credentials
2169 * structure.
2170 */
2171 free_async_args(args);
2172 /*
2173 		 * Reacquire the mutex because it is needed at the top of the loop.
2174 */
2175 mutex_enter(&mi->mi_async_lock);
2176 }
2177 }
2178
2179 void
2180 nfs_async_stop(struct vfs *vfsp)
2181 {
2182 mntinfo_t *mi = VFTOMI(vfsp);
2183
2184 /*
2185 * Wait for all outstanding async operations to complete and for the
2186 * worker threads to exit.
2187 */
2188 mutex_enter(&mi->mi_async_lock);
2189 mi->mi_max_threads = 0;
2190 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2191 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2192 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0)
2193 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2194 mutex_exit(&mi->mi_async_lock);
2195 }
2196
2197 /*
2198 * nfs_async_stop_sig:
2199  * Wait for all outstanding putpage operations to complete. If a signal
2200  * is delivered we will abort and return non-zero. If we can put all the
2201 * pages we will return 0. This routine is called from nfs_unmount and
2202 * nfs3_unmount to make these operations interruptible.
2203 */
2204 int
2205 nfs_async_stop_sig(struct vfs *vfsp)
2206 {
2207 mntinfo_t *mi = VFTOMI(vfsp);
2208 ushort_t omax;
2209 int rval;
2210
2211 /*
2212 * Wait for all outstanding async operations to complete and for the
2213 * worker threads to exit.
2214 */
2215 mutex_enter(&mi->mi_async_lock);
2216 omax = mi->mi_max_threads;
2217 mi->mi_max_threads = 0;
2218 /*
2219 * Tell all the worker threads to exit.
2220 */
2221 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2222 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2223 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) {
2224 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2225 break;
2226 }
2227 rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2228 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */
2229 if (rval)
2230 mi->mi_max_threads = omax;
2231 mutex_exit(&mi->mi_async_lock);
2232
2233 return (rval);
2234 }
2235
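/*
 * Copy "tcount" bytes from the uio into the file's pages mapped at "base",
 * moving at most PAGESIZE bytes at a time so that r_size can be updated as
 * each chunk is copied.  A nonzero "pgcreated" means the caller has already
 * created and mapped the first page at "base".
 */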
2236 int
2237 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2238 {
2239 int pagecreate;
2240 int n;
2241 int saved_n;
2242 caddr_t saved_base;
2243 u_offset_t offset;
2244 int error;
2245 int sm_error;
2246 vnode_t *vp = RTOV(rp);
2247
2248 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2249 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2250 if (!vpm_enable) {
2251 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2252 }
2253
2254 /*
2255 * Move bytes in at most PAGESIZE chunks. We must avoid
2256 * spanning pages in uiomove() because page faults may cause
2257 * the cache to be invalidated out from under us. The r_size is not
2258 * updated until after the uiomove. If we push the last page of a
2259 * file before r_size is correct, we will lose the data written past
2260 * the current (and invalid) r_size.
2261 */
2262 do {
2263 offset = uio->uio_loffset;
2264 pagecreate = 0;
2265
2266 /*
2267 * n is the number of bytes required to satisfy the request
2268 * or the number of bytes to fill out the page.
2269 */
2270 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2271
2272 /*
2273 * Check to see if we can skip reading in the page
2274 * and just allocate the memory. We can do this
2275 * if we are going to rewrite the entire mapping
2276 * or if we are going to write to or beyond the current
2277 * end of file from the beginning of the mapping.
2278 *
2279 * The read of r_size is now protected by r_statelock.
2280 */
2281 mutex_enter(&rp->r_statelock);
2282 /*
2283 * When pgcreated is nonzero the caller has already done
2284 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2285 * segkpm this means we already have at least one page
2286 * created and mapped at base.
2287 */
2288 pagecreate = pgcreated ||
2289 ((offset & PAGEOFFSET) == 0 &&
2290 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2291
2292 mutex_exit(&rp->r_statelock);
2293 if (!vpm_enable && pagecreate) {
2294 /*
2295 * The last argument tells segmap_pagecreate() to
2296 * always lock the page, as opposed to sometimes
2297 * returning with the page locked. This way we avoid a
2298 * fault on the ensuing uiomove(), but also
2299 * more importantly (to fix bug 1094402) we can
2300 * call segmap_fault() to unlock the page in all
2301 * cases. An alternative would be to modify
2302 * segmap_pagecreate() to tell us when it is
2303 * locking a page, but that's a fairly major
2304 * interface change.
2305 */
2306 if (pgcreated == 0)
2307 (void) segmap_pagecreate(segkmap, base,
2308 (uint_t)n, 1);
2309 saved_base = base;
2310 saved_n = n;
2311 }
2312
2313 /*
2314 		 * The number of bytes of data in the last page cannot
2315 		 * be accurately determined while the page is being
2316 		 * uiomove'd to and the size of the file is being updated.
2317 * Thus, inform threads which need to know accurately
2318 * how much data is in the last page of the file. They
2319 * will not do the i/o immediately, but will arrange for
2320 * the i/o to happen later when this modify operation
2321 * will have finished.
2322 */
2323 ASSERT(!(rp->r_flags & RMODINPROGRESS));
2324 mutex_enter(&rp->r_statelock);
2325 rp->r_flags |= RMODINPROGRESS;
2326 rp->r_modaddr = (offset & MAXBMASK);
2327 mutex_exit(&rp->r_statelock);
2328
2329 if (vpm_enable) {
2330 /*
2331 * Copy data. If new pages are created, part of
2332 			 * the page that is not written will be initialized
2333 * with zeros.
2334 */
2335 error = vpm_data_copy(vp, offset, n, uio,
2336 !pagecreate, NULL, 0, S_WRITE);
2337 } else {
2338 error = uiomove(base, n, UIO_WRITE, uio);
2339 }
2340
2341 /*
2342 * r_size is the maximum number of
2343 * bytes known to be in the file.
2344 * Make sure it is at least as high as the
2345 * first unwritten byte pointed to by uio_loffset.
2346 */
2347 mutex_enter(&rp->r_statelock);
2348 if (rp->r_size < uio->uio_loffset)
2349 rp->r_size = uio->uio_loffset;
2350 rp->r_flags &= ~RMODINPROGRESS;
2351 rp->r_flags |= RDIRTY;
2352 mutex_exit(&rp->r_statelock);
2353
2354 /* n = # of bytes written */
2355 n = (int)(uio->uio_loffset - offset);
2356
2357 if (!vpm_enable) {
2358 base += n;
2359 }
2360 tcount -= n;
2361 /*
2362 * If we created pages w/o initializing them completely,
2363 * we need to zero the part that wasn't set up.
2364 		 * This happens in most EOF write cases and if
2365 * we had some sort of error during the uiomove.
2366 */
2367 if (!vpm_enable && pagecreate) {
2368 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2369 (void) kzero(base, PAGESIZE - n);
2370
2371 if (pgcreated) {
2372 /*
2373 * Caller is responsible for this page,
2374 * it was not created in this loop.
2375 */
2376 pgcreated = 0;
2377 } else {
2378 /*
2379 * For bug 1094402: segmap_pagecreate locks
2380 * page. Unlock it. This also unlocks the
2381 * pages allocated by page_create_va() in
2382 * segmap_pagecreate().
2383 */
2384 sm_error = segmap_fault(kas.a_hat, segkmap,
2385 saved_base, saved_n,
2386 F_SOFTUNLOCK, S_WRITE);
2387 if (error == 0)
2388 error = sm_error;
2389 }
2390 }
2391 } while (tcount > 0 && error == 0);
2392
2393 return (error);
2394 }
2395
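/*
 * Flush dirty pages for the given vnode.  If "len" is zero, every page at
 * or beyond "off" is processed; otherwise only the range [off, off + len)
 * is examined.  Returns the first error encountered, if any.
 */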
2396 int
2397 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2398 {
2399 rnode_t *rp;
2400 page_t *pp;
2401 u_offset_t eoff;
2402 u_offset_t io_off;
2403 size_t io_len;
2404 int error;
2405 int rdirty;
2406 int err;
2407
2408 rp = VTOR(vp);
2409 ASSERT(rp->r_count > 0);
2410
2411 if (!vn_has_cached_data(vp))
2412 return (0);
2413
2414 ASSERT(vp->v_type != VCHR);
2415
2416 /*
2417 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2418 * writes. B_FORCE is set to force the VM system to actually
2419 * invalidate the pages, even if the i/o failed. The pages
2420 * need to get invalidated because they can't be written out
2421 * because there isn't any space left on either the server's
2422 * file system or in the user's disk quota. The B_FREE bit
2423 * is cleared to avoid confusion as to whether this is a
2424 * request to place the page on the freelist or to destroy
2425 * it.
2426 */
2427 if ((rp->r_flags & ROUTOFSPACE) ||
2428 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2429 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2430
2431 if (len == 0) {
2432 /*
2433 * If doing a full file synchronous operation, then clear
2434 * the RDIRTY bit. If a page gets dirtied while the flush
2435 * is happening, then RDIRTY will get set again. The
2436 * RDIRTY bit must get cleared before the flush so that
2437 * we don't lose this information.
2438 *
2439 * If there are no full file async write operations
2440 * pending and RDIRTY bit is set, clear it.
2441 */
2442 if (off == (u_offset_t)0 &&
2443 !(flags & B_ASYNC) &&
2444 (rp->r_flags & RDIRTY)) {
2445 mutex_enter(&rp->r_statelock);
2446 rdirty = (rp->r_flags & RDIRTY);
2447 rp->r_flags &= ~RDIRTY;
2448 mutex_exit(&rp->r_statelock);
2449 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2450 mutex_enter(&rp->r_statelock);
2451 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) {
2452 rdirty = (rp->r_flags & RDIRTY);
2453 rp->r_flags &= ~RDIRTY;
2454 }
2455 mutex_exit(&rp->r_statelock);
2456 } else
2457 rdirty = 0;
2458
2459 /*
2460 * Search the entire vp list for pages >= off, and flush
2461 * the dirty pages.
2462 */
2463 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2464 flags, cr);
2465
2466 /*
2467 * If an error occurred and the file was marked as dirty
2468 * before and we aren't forcibly invalidating pages, then
2469 * reset the RDIRTY flag.
2470 */
2471 if (error && rdirty &&
2472 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2473 mutex_enter(&rp->r_statelock);
2474 rp->r_flags |= RDIRTY;
2475 mutex_exit(&rp->r_statelock);
2476 }
2477 } else {
2478 /*
2479 * Do a range from [off...off + len) looking for pages
2480 * to deal with.
2481 */
2482 error = 0;
2483 #ifdef lint
2484 io_len = 0;
2485 #endif
2486 eoff = off + len;
2487 mutex_enter(&rp->r_statelock);
2488 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2489 io_off += io_len) {
2490 mutex_exit(&rp->r_statelock);
2491 /*
2492 * If we are not invalidating, synchronously
2493 			 * freeing, or writing pages, use the routine
2494 * page_lookup_nowait() to prevent reclaiming
2495 * them from the free list.
2496 */
2497 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2498 pp = page_lookup(vp, io_off,
2499 (flags & (B_INVAL | B_FREE)) ?
2500 SE_EXCL : SE_SHARED);
2501 } else {
2502 pp = page_lookup_nowait(vp, io_off,
2503 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2504 }
2505
2506 if (pp == NULL || !pvn_getdirty(pp, flags))
2507 io_len = PAGESIZE;
2508 else {
2509 err = (*rp->r_putapage)(vp, pp, &io_off,
2510 &io_len, flags, cr);
2511 if (!error)
2512 error = err;
2513 /*
2514 * "io_off" and "io_len" are returned as
2515 * the range of pages we actually wrote.
2516 * This allows us to skip ahead more quickly
2517 * since several pages may've been dealt
2518 * with by this iteration of the loop.
2519 */
2520 }
2521 mutex_enter(&rp->r_statelock);
2522 }
2523 mutex_exit(&rp->r_statelock);
2524 }
2525
2526 return (error);
2527 }
2528
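/*
 * Invalidate cached pages at or beyond "off", typically as part of a
 * truncation.  The RTRUNCATE flag serializes concurrent invalidations of
 * the same rnode.
 */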
2529 void
2530 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2531 {
2532 rnode_t *rp;
2533
2534 rp = VTOR(vp);
2535 mutex_enter(&rp->r_statelock);
2536 while (rp->r_flags & RTRUNCATE)
2537 cv_wait(&rp->r_cv, &rp->r_statelock);
2538 rp->r_flags |= RTRUNCATE;
2539 if (off == (u_offset_t)0) {
2540 rp->r_flags &= ~RDIRTY;
2541 if (!(rp->r_flags & RSTALE))
2542 rp->r_error = 0;
2543 }
2544 rp->r_truncaddr = off;
2545 mutex_exit(&rp->r_statelock);
2546 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2547 B_INVAL | B_TRUNC, cr);
2548 mutex_enter(&rp->r_statelock);
2549 rp->r_flags &= ~RTRUNCATE;
2550 cv_broadcast(&rp->r_cv);
2551 mutex_exit(&rp->r_statelock);
2552 }
2553
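/*
 * The write error messages below start with a '^', which zcmn_err()
 * treats as "console only".  By default (nfs_write_error_to_cons_only is
 * zero) MSG() skips past the '^' so the messages are logged normally;
 * setting the variable to 1 keeps the '^' and restricts the messages to
 * the console.
 */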
2554 static int nfs_write_error_to_cons_only = 0;
2555 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1)
2556
2557 /*
2558 * Print a file handle
2559 */
2560 void
2561 nfs_printfhandle(nfs_fhandle *fhp)
2562 {
2563 int *ip;
2564 char *buf;
2565 size_t bufsize;
2566 char *cp;
2567
2568 /*
2569 * 13 == "(file handle:"
2570 	 * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2571 * 1 == ' '
2572 * 8 == maximum strlen of "%x"
2573 * 3 == ")\n\0"
2574 */
2575 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2576 buf = kmem_alloc(bufsize, KM_NOSLEEP);
2577 if (buf == NULL)
2578 return;
2579
2580 cp = buf;
2581 (void) strcpy(cp, "(file handle:");
2582 while (*cp != '\0')
2583 cp++;
2584 for (ip = (int *)fhp->fh_buf;
2585 ip < (int *)&fhp->fh_buf[fhp->fh_len];
2586 ip++) {
2587 (void) sprintf(cp, " %x", *ip);
2588 while (*cp != '\0')
2589 cp++;
2590 }
2591 (void) strcpy(cp, ")\n");
2592
2593 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2594
2595 kmem_free(buf, bufsize);
2596 }
2597
2598 /*
2599 * Notify the system administrator that an NFS write error has
2600 * occurred.
2601 */
2602
2603 /* seconds between ENOSPC/EDQUOT messages */
2604 clock_t nfs_write_error_interval = 5;
2605
2606 void
2607 nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2608 {
2609 mntinfo_t *mi;
2610 clock_t now;
2611
2612 mi = VTOMI(vp);
2613 /*
2614 * In case of forced unmount or zone shutdown, do not print any
2615 * messages since it can flood the console with error messages.
2616 */
2617 if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2618 return;
2619
2620 /*
2621 * No use in flooding the console with ENOSPC
2622 * messages from the same file system.
2623 */
2624 now = ddi_get_lbolt();
2625 if ((error != ENOSPC && error != EDQUOT) ||
2626 now - mi->mi_printftime > 0) {
2627 zoneid_t zoneid = mi->mi_zone->zone_id;
2628
2629 #ifdef DEBUG
2630 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2631 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2632 #else
2633 nfs_perror(error, "NFS write error on host %s: %m.\n",
2634 VTOR(vp)->r_server->sv_hostname, NULL);
2635 #endif
2636 if (error == ENOSPC || error == EDQUOT) {
2637 zcmn_err(zoneid, CE_CONT,
2638 MSG("^File: userid=%d, groupid=%d\n"),
2639 crgetuid(cr), crgetgid(cr));
2640 if (crgetuid(CRED()) != crgetuid(cr) ||
2641 crgetgid(CRED()) != crgetgid(cr)) {
2642 zcmn_err(zoneid, CE_CONT,
2643 MSG("^User: userid=%d, groupid=%d\n"),
2644 crgetuid(CRED()), crgetgid(CRED()));
2645 }
2646 mi->mi_printftime = now +
2647 nfs_write_error_interval * hz;
2648 }
2649 nfs_printfhandle(&VTOR(vp)->r_fh);
2650 #ifdef DEBUG
2651 if (error == EACCES) {
2652 zcmn_err(zoneid, CE_CONT,
2653 MSG("^nfs_bio: cred is%s kcred\n"),
2654 cr == kcred ? "" : " not");
2655 }
2656 #endif
2657 }
2658 }
2659
2660 /* ARGSUSED */
2661 static void *
2662 nfs_mi_init(zoneid_t zoneid)
2663 {
2664 struct mi_globals *mig;
2665
2666 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2667 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2668 list_create(&mig->mig_list, sizeof (mntinfo_t),
2669 offsetof(mntinfo_t, mi_zone_node));
2670 mig->mig_destructor_called = B_FALSE;
2671 return (mig);
2672 }
2673
2674 /*
2675 * Callback routine to tell all NFS mounts in the zone to stop creating new
2676 * threads. Existing threads should exit.
2677 */
2678 /* ARGSUSED */
2679 static void
2680 nfs_mi_shutdown(zoneid_t zoneid, void *data)
2681 {
2682 struct mi_globals *mig = data;
2683 mntinfo_t *mi;
2684
2685 ASSERT(mig != NULL);
2686 again:
2687 mutex_enter(&mig->mig_lock);
2688 for (mi = list_head(&mig->mig_list); mi != NULL;
2689 mi = list_next(&mig->mig_list, mi)) {
2690
2691 /*
2692 * If we've done the shutdown work for this FS, skip.
2693 * Once we go off the end of the list, we're done.
2694 */
2695 if (mi->mi_flags & MI_DEAD)
2696 continue;
2697
2698 /*
2699 * We will do work, so not done. Get a hold on the FS.
2700 */
2701 VFS_HOLD(mi->mi_vfsp);
2702
2703 /*
2704 * purge the DNLC for this filesystem
2705 */
2706 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2707
2708 mutex_enter(&mi->mi_async_lock);
2709 /*
2710 * Tell existing async worker threads to exit.
2711 */
2712 mi->mi_max_threads = 0;
2713 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2714 /*
2715 * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2716 * getting ready to exit when it's done with its current work.
2717 * Also set MI_DEAD to note we've acted on this FS.
2718 */
2719 mutex_enter(&mi->mi_lock);
2720 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2721 mutex_exit(&mi->mi_lock);
2722 /*
2723 * Wake up the async manager thread.
2724 */
2725 cv_broadcast(&mi->mi_async_reqs_cv);
2726 mutex_exit(&mi->mi_async_lock);
2727
2728 /*
2729 * Drop lock and release FS, which may change list, then repeat.
2730 * We're done when every mi has been done or the list is empty.
2731 */
2732 mutex_exit(&mig->mig_lock);
2733 VFS_RELE(mi->mi_vfsp);
2734 goto again;
2735 }
2736 mutex_exit(&mig->mig_lock);
2737 }
2738
2739 static void
2740 nfs_mi_free_globals(struct mi_globals *mig)
2741 {
2742 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2743 mutex_destroy(&mig->mig_lock);
2744 kmem_free(mig, sizeof (*mig));
2745
2746 }
2747
2748 /* ARGSUSED */
2749 static void
2750 nfs_mi_destroy(zoneid_t zoneid, void *data)
2751 {
2752 struct mi_globals *mig = data;
2753
2754 ASSERT(mig != NULL);
2755 mutex_enter(&mig->mig_lock);
2756 if (list_head(&mig->mig_list) != NULL) {
2757 /* Still waiting for VFS_FREEVFS() */
2758 mig->mig_destructor_called = B_TRUE;
2759 mutex_exit(&mig->mig_lock);
2760 return;
2761 }
2762 nfs_mi_free_globals(mig);
2763 }
2764
2765 /*
2766 * Add an NFS mount to the per-zone list of NFS mounts.
2767 */
2768 void
2769 nfs_mi_zonelist_add(mntinfo_t *mi)
2770 {
2771 struct mi_globals *mig;
2772
2773 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2774 mutex_enter(&mig->mig_lock);
2775 list_insert_head(&mig->mig_list, mi);
2776 mutex_exit(&mig->mig_lock);
2777 }
2778
2779 /*
2780 * Remove an NFS mount from the per-zone list of NFS mounts.
2781 */
2782 static void
2783 nfs_mi_zonelist_remove(mntinfo_t *mi)
2784 {
2785 struct mi_globals *mig;
2786
2787 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2788 mutex_enter(&mig->mig_lock);
2789 list_remove(&mig->mig_list, mi);
2790 /*
2791 * We can be called asynchronously by VFS_FREEVFS() after the zone
2792 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2793 * mi globals.
2794 */
2795 if (list_head(&mig->mig_list) == NULL &&
2796 mig->mig_destructor_called == B_TRUE) {
2797 nfs_mi_free_globals(mig);
2798 return;
2799 }
2800 mutex_exit(&mig->mig_lock);
2801 }
2802
2803 /*
2804 * NFS Client initialization routine. This routine should only be called
2805 * once. It performs the following tasks:
2806  * - Initialize all global locks
2807 * - Call sub-initialization routines (localize access to variables)
2808 */
2809 int
2810 nfs_clntinit(void)
2811 {
2812 #ifdef DEBUG
2813 static boolean_t nfs_clntup = B_FALSE;
2814 #endif
2815 int error;
2816
2817 #ifdef DEBUG
2818 ASSERT(nfs_clntup == B_FALSE);
2819 #endif
2820
2821 error = nfs_subrinit();
2822 if (error)
2823 return (error);
2824
2825 error = nfs_vfsinit();
2826 if (error) {
2827 /*
2828 * Cleanup nfs_subrinit() work
2829 */
2830 nfs_subrfini();
2831 return (error);
2832 }
2833 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2834 nfs_mi_destroy);
2835
2836 nfs4_clnt_init();
2837
2838 nfscmd_init();
2839
2840 #ifdef DEBUG
2841 nfs_clntup = B_TRUE;
2842 #endif
2843
2844 return (0);
2845 }
2846
2847 /*
2848 * This routine is only called if the NFS Client has been initialized but
2849  * the module failed to be installed. This routine will clean up the previously
2850 * allocated/initialized work.
2851 */
2852 void
2853 nfs_clntfini(void)
2854 {
2855 (void) zone_key_delete(mi_list_key);
2856 nfs_subrfini();
2857 nfs_vfsfini();
2858 nfs4_clnt_fini();
2859 nfscmd_fini();
2860 }
2861
2862 /*
2863 * nfs_lockrelease:
2864 *
2865 * Release any locks on the given vnode that are held by the current
2866 * process.
2867 */
2868 void
2869 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2870 {
2871 flock64_t ld;
2872 struct shrlock shr;
2873 char *buf;
2874 int remote_lock_possible;
2875 int ret;
2876
2877 ASSERT((uintptr_t)vp > KERNELBASE);
2878
2879 /*
2880 * Generate an explicit unlock operation for the entire file. As a
2881 * partial optimization, only generate the unlock if there is a
2882 * lock registered for the file. We could check whether this
2883 * particular process has any locks on the file, but that would
2884 * require the local locking code to provide yet another query
2885 * routine. Note that no explicit synchronization is needed here.
2886 * At worst, flk_has_remote_locks() will return a false positive,
2887 * in which case the unlock call wastes time but doesn't harm
2888 * correctness.
2889 *
2890 * In addition, an unlock request is generated if the process
2891 * is listed as possibly having a lock on the file because the
2892 * server and client lock managers may have gotten out of sync.
2893 * N.B. It is important to make sure nfs_remove_locking_id() is
2894 * called here even if flk_has_remote_locks(vp) reports true.
2895 * If it is not called and there is an entry on the process id
2896 * list, that entry will never get removed.
2897 */
2898 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2899 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2900 if (remote_lock_possible || flk_has_remote_locks(vp)) {
2901 ld.l_type = F_UNLCK; /* set to unlock entire file */
2902 ld.l_whence = 0; /* unlock from start of file */
2903 ld.l_start = 0;
2904 ld.l_len = 0; /* do entire file */
2905 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr,
2906 NULL);
2907
2908 if (ret != 0) {
2909 /*
2910 * If VOP_FRLOCK fails, make sure we unregister
2911 * local locks before we continue.
2912 */
2913 ld.l_pid = ttoproc(curthread)->p_pid;
2914 lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2915 #ifdef DEBUG
2916 nfs_perror(ret,
2917 "NFS lock release error on vp %p: %m.\n",
2918 (void *)vp, NULL);
2919 #endif
2920 }
2921
2922 /*
2923 * The call to VOP_FRLOCK may put the pid back on the
2924 * list. We need to remove it.
2925 */
2926 (void) nfs_remove_locking_id(vp, RLMPL_PID,
2927 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2928 }
2929
2930 /*
2931 * As long as the vp has a share matching our pid,
2932 * pluck it off and unshare it. There are circumstances in
2933 * which the call to nfs_remove_locking_id() may put the
2934 * owner back on the list, in which case we simply do a
2935 * redundant and harmless unshare.
2936 */
2937 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2938 while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2939 (char *)NULL, buf, &shr.s_own_len)) {
2940 shr.s_owner = buf;
2941 shr.s_access = 0;
2942 shr.s_deny = 0;
2943 shr.s_sysid = 0;
2944 shr.s_pid = curproc->p_pid;
2945
2946 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL);
2947 #ifdef DEBUG
2948 if (ret != 0) {
2949 nfs_perror(ret,
2950 "NFS share release error on vp %p: %m.\n",
2951 (void *)vp, NULL);
2952 }
2953 #endif
2954 }
2955 kmem_free(buf, MAX_SHR_OWNER_LEN);
2956 }
2957
2958 /*
2959 * nfs_lockcompletion:
2960 *
2961 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2962 * as non cachable (set VNOCACHE bit).
2963 */
2964
2965 void
2966 nfs_lockcompletion(vnode_t *vp, int cmd)
2967 {
2968 #ifdef DEBUG
2969 rnode_t *rp = VTOR(vp);
2970
2971 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2972 #endif
2973
2974 if (cmd == F_SETLK || cmd == F_SETLKW) {
2975 if (!lm_safemap(vp)) {
2976 mutex_enter(&vp->v_lock);
2977 vp->v_flag |= VNOCACHE;
2978 mutex_exit(&vp->v_lock);
2979 } else {
2980 mutex_enter(&vp->v_lock);
2981 vp->v_flag &= ~VNOCACHE;
2982 mutex_exit(&vp->v_lock);
2983 }
2984 }
2985 /*
2986 * The cached attributes of the file are stale after acquiring
2987 * the lock on the file. They were updated when the file was
2988 * opened, but not updated when the lock was acquired. Therefore the
2989 * cached attributes are invalidated after the lock is obtained.
2990 */
2991 PURGE_ATTRCACHE(vp);
2992 }
2993
2994 /*
2995 * The lock manager holds state making it possible for the client
2996 * and server to be out of sync. For example, if the response from
2997 * the server granting a lock request is lost, the server will think
2998 * the lock is granted and the client will think the lock is lost.
2999 * The client can tell when it is not positive if it is in sync with
3000 * the server.
3001 *
3002 * To deal with this, a list of processes for which the client is
3003 * not sure if the server holds a lock is attached to the rnode.
3004 * When such a process closes the rnode, an unlock request is sent
3005 * to the server to unlock the entire file.
3006 *
3007  * The list is kept as a singly linked, NULL-terminated list.
3008 * Because it is only added to under extreme error conditions, the
3009 * list shouldn't get very big. DEBUG kernels print a message if
3010 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily
3011  * chosen to be 8, but can be tuned at runtime.
3012 */
3013 #ifdef DEBUG
3014 /* int nfs_lmpl_high_water = 8; */
3015 int nfs_lmpl_high_water = 128;
3016 int nfs_cnt_add_locking_id = 0;
3017 int nfs_len_add_locking_id = 0;
3018 #endif /* DEBUG */
3019
3020 /*
3021 * Record that the nfs lock manager server may be holding a lock on
3022 * a vnode for a process.
3023 *
3024 * Because the nfs lock manager server holds state, it is possible
3025 * for the server to get out of sync with the client. This routine is called
3026 * from the client when it is no longer sure if the server is in sync
3027 * with the client. nfs_lockrelease() will then notice this and send
3028  * an unlock request when the file is closed.
3029 */
3030 void
3031 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
3032 {
3033 rnode_t *rp;
3034 lmpl_t *new;
3035 lmpl_t *cur;
3036 lmpl_t **lmplp;
3037 #ifdef DEBUG
3038 int list_len = 1;
3039 #endif /* DEBUG */
3040
3041 #ifdef DEBUG
3042 ++nfs_cnt_add_locking_id;
3043 #endif /* DEBUG */
3044 /*
3045 * allocate new lmpl_t now so we don't sleep
3046 * later after grabbing mutexes
3047 */
3048 ASSERT(len < MAX_SHR_OWNER_LEN);
3049 new = kmem_alloc(sizeof (*new), KM_SLEEP);
3050 new->lmpl_type = type;
3051 new->lmpl_pid = pid;
3052 new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
3053 bcopy(id, new->lmpl_owner, len);
3054 new->lmpl_own_len = len;
3055 new->lmpl_next = (lmpl_t *)NULL;
3056 #ifdef DEBUG
3057 if (type == RLMPL_PID) {
3058 ASSERT(len == sizeof (pid_t));
3059 ASSERT(pid == *(pid_t *)new->lmpl_owner);
3060 } else {
3061 ASSERT(type == RLMPL_OWNER);
3062 }
3063 #endif
3064
3065 rp = VTOR(vp);
3066 mutex_enter(&rp->r_statelock);
3067
3068 /*
3069 * Add this id to the list for this rnode only if the
3070 * rnode is active and the id is not already there.
3071 */
3072 ASSERT(rp->r_flags & RHASHED);
3073 lmplp = &(rp->r_lmpl);
3074 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3075 if (cur->lmpl_pid == pid &&
3076 cur->lmpl_type == type &&
3077 cur->lmpl_own_len == len &&
3078 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
3079 kmem_free(new->lmpl_owner, len);
3080 kmem_free(new, sizeof (*new));
3081 break;
3082 }
3083 lmplp = &cur->lmpl_next;
3084 #ifdef DEBUG
3085 ++list_len;
3086 #endif /* DEBUG */
3087 }
3088 if (cur == (lmpl_t *)NULL) {
3089 *lmplp = new;
3090 #ifdef DEBUG
3091 if (list_len > nfs_len_add_locking_id) {
3092 nfs_len_add_locking_id = list_len;
3093 }
3094 if (list_len > nfs_lmpl_high_water) {
3095 cmn_err(CE_WARN, "nfs_add_locking_id: long list "
3096 "vp=%p is %d", (void *)vp, list_len);
3097 }
3098 #endif /* DEBUG */
3099 }
3100
3101 #ifdef DEBUG
3102 if (share_debug) {
3103 int nitems = 0;
3104 int npids = 0;
3105 int nowners = 0;
3106
3107 /*
3108 		 * Count the number of things on r_lmpl after the add.
3109 */
3110 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3111 cur = cur->lmpl_next) {
3112 nitems++;
3113 if (cur->lmpl_type == RLMPL_PID) {
3114 npids++;
3115 } else if (cur->lmpl_type == RLMPL_OWNER) {
3116 nowners++;
3117 } else {
3118 cmn_err(CE_PANIC, "nfs_add_locking_id: "
3119 "unrecognized lmpl_type %d",
3120 cur->lmpl_type);
3121 }
3122 }
3123
3124 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
3125 "OWNs = %d items left on r_lmpl\n",
3126 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
3127 }
3128 #endif
3129
3130 mutex_exit(&rp->r_statelock);
3131 }
3132
3133 /*
3134 * Remove an id from the lock manager id list.
3135 *
3136 * If the id is not in the list return 0. If it was found and
3137 * removed, return 1.
3138 */
3139 static int
3140 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
3141 {
3142 lmpl_t *cur;
3143 lmpl_t **lmplp;
3144 rnode_t *rp;
3145 int rv = 0;
3146
3147 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
3148
3149 rp = VTOR(vp);
3150
3151 mutex_enter(&rp->r_statelock);
3152 ASSERT(rp->r_flags & RHASHED);
3153 lmplp = &(rp->r_lmpl);
3154
3155 /*
3156 * Search through the list and remove the entry for this id
3157 * if it is there. The special case id == NULL allows removal
3158 * of the first share on the r_lmpl list belonging to the
3159 * current process (if any), without regard to further details
3160 * of its identity.
3161 */
3162 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3163 if (cur->lmpl_type == type &&
3164 cur->lmpl_pid == curproc->p_pid &&
3165 (id == (char *)NULL ||
3166 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
3167 *lmplp = cur->lmpl_next;
3168 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3169 if (rid != NULL) {
3170 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3171 *rlen = cur->lmpl_own_len;
3172 }
3173 kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3174 kmem_free(cur, sizeof (*cur));
3175 rv = 1;
3176 break;
3177 }
3178 lmplp = &cur->lmpl_next;
3179 }
3180
3181 #ifdef DEBUG
3182 if (share_debug) {
3183 int nitems = 0;
3184 int npids = 0;
3185 int nowners = 0;
3186
3187 /*
3188 * Count the number of things left on r_lmpl after the remove.
3189 */
3190 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3191 cur = cur->lmpl_next) {
3192 nitems++;
3193 if (cur->lmpl_type == RLMPL_PID) {
3194 npids++;
3195 } else if (cur->lmpl_type == RLMPL_OWNER) {
3196 nowners++;
3197 } else {
3198 cmn_err(CE_PANIC,
3199 "nrli: unrecognized lmpl_type %d",
3200 cur->lmpl_type);
3201 }
3202 }
3203
3204 cmn_err(CE_CONT,
3205 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
3206 (type == RLMPL_PID) ? "P" : "O",
3207 npids,
3208 nowners,
3209 nitems);
3210 }
3211 #endif
3212
3213 mutex_exit(&rp->r_statelock);
3214 return (rv);
3215 }
3216
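/*
 * Free a mntinfo structure once the async manager and worker threads have
 * exited.  The mount is unlinked from the per-zone list before its locks,
 * condition variables, and zone reference are released.
 */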
3217 void
3218 nfs_free_mi(mntinfo_t *mi)
3219 {
3220 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3221 ASSERT(mi->mi_manager_thread == NULL);
3222 ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
3223 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0);
3224
3225 /*
3226 * Remove the node from the global list before we start tearing it down.
3227 */
3228 nfs_mi_zonelist_remove(mi);
3229 if (mi->mi_klmconfig) {
3230 lm_free_config(mi->mi_klmconfig);
3231 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3232 }
3233 mutex_destroy(&mi->mi_lock);
3234 mutex_destroy(&mi->mi_remap_lock);
3235 mutex_destroy(&mi->mi_async_lock);
3236 mutex_destroy(&mi->mi_rnodes_lock);
3237 cv_destroy(&mi->mi_failover_cv);
3238 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]);
3239 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]);
3240 cv_destroy(&mi->mi_async_reqs_cv);
3241 cv_destroy(&mi->mi_async_cv);
3242 list_destroy(&mi->mi_rnodes);
3243 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS);
3244 kmem_free(mi, sizeof (*mi));
3245 }
3246
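/*
 * ks_update routine for the read-only "mntinfo" kstat: copy the mount's
 * current transport, timing, and failover statistics into the kstat data
 * area.
 */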
3247 static int
3248 mnt_kstat_update(kstat_t *ksp, int rw)
3249 {
3250 mntinfo_t *mi;
3251 struct mntinfo_kstat *mik;
3252 vfs_t *vfsp;
3253 int i;
3254
3255 /* this is a read-only kstat. Bail out on a write */
3256 if (rw == KSTAT_WRITE)
3257 return (EACCES);
3258
3259 /*
3260 * We don't want to wait here as kstat_chain_lock could be held by
3261 * dounmount(). dounmount() takes vfs_reflock before the chain lock
3262 * and thus could lead to a deadlock.
3263 */
3264 vfsp = (struct vfs *)ksp->ks_private;
3265
3266
3267 mi = VFTOMI(vfsp);
3268
3269 mik = (struct mntinfo_kstat *)ksp->ks_data;
3270
3271 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3272 mik->mik_vers = (uint32_t)mi->mi_vers;
3273 mik->mik_flags = mi->mi_flags;
3274 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3275 mik->mik_curread = (uint32_t)mi->mi_curread;
3276 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3277 mik->mik_retrans = mi->mi_retrans;
3278 mik->mik_timeo = mi->mi_timeo;
3279 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3280 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3281 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3282 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
3283 for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3284 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3285 mik->mik_timers[i].deviate =
3286 (uint32_t)mi->mi_timers[i].rt_deviate;
3287 mik->mik_timers[i].rtxcur =
3288 (uint32_t)mi->mi_timers[i].rt_rtxcur;
3289 }
3290 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3291 mik->mik_failover = (uint32_t)mi->mi_failover;
3292 mik->mik_remap = (uint32_t)mi->mi_remap;
3293 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3294
3295 return (0);
3296 }
3297
3298 void
3299 nfs_mnt_kstat_init(struct vfs *vfsp)
3300 {
3301 mntinfo_t *mi = VFTOMI(vfsp);
3302
3303 /*
3304 * Create the version specific kstats.
3305 *
3306 * PSARC 2001/697 Contract Private Interface
3307 * All nfs kstats are under SunMC contract
3308 * Please refer to the PSARC listed above and contact
3309 * SunMC before making any changes!
3310 *
3311 * Changes must be reviewed by Solaris File Sharing
3312 * Changes must be communicated to contract-2001-697@sun.com
3313 *
3314 */
3315
3316 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3317 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3318 if (mi->mi_io_kstats) {
3319 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3320 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3321 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3322 kstat_install(mi->mi_io_kstats);
3323 }
3324
3325 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3326 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3327 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3328 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3329 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3330 mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3331 mi->mi_ro_kstats->ks_private = (void *)vfsp;
3332 kstat_install(mi->mi_ro_kstats);
3333 }
3334 }
3335
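/*
 * Allocate a delmap call record identifying the current thread as the
 * caller; the record is later looked up and freed by
 * nfs_find_and_delete_delmapcall().
 */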
3336 nfs_delmapcall_t *
3337 nfs_init_delmapcall()
3338 {
3339 nfs_delmapcall_t *delmap_call;
3340
3341 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3342 delmap_call->call_id = curthread;
3343 delmap_call->error = 0;
3344
3345 return (delmap_call);
3346 }
3347
3348 void
3349 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
3350 {
3351 kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3352 }
3353
3354 /*
3355 * Searches for the current delmap caller (based on curthread) in the list of
3356 * callers. If it is found, we remove it and free the delmap caller.
3357 * Returns:
3358 * 0 if the caller wasn't found
3359 * 1 if the caller was found, removed and freed. *errp is set to what
3360 * the result of the delmap was.
3361 */
3362 int
3363 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3364 {
3365 nfs_delmapcall_t *delmap_call;
3366
3367 /*
3368 * If the list doesn't exist yet, we create it and return
3369 * that the caller wasn't found. No list = no callers.
3370 */
3371 mutex_enter(&rp->r_statelock);
3372 if (!(rp->r_flags & RDELMAPLIST)) {
3373 /* The list does not exist */
3374 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3375 offsetof(nfs_delmapcall_t, call_node));
3376 rp->r_flags |= RDELMAPLIST;
3377 mutex_exit(&rp->r_statelock);
3378 return (0);
3379 } else {
3380 /* The list exists so search it */
3381 for (delmap_call = list_head(&rp->r_indelmap);
3382 delmap_call != NULL;
3383 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3384 if (delmap_call->call_id == curthread) {
3385 /* current caller is in the list */
3386 *errp = delmap_call->error;
3387 list_remove(&rp->r_indelmap, delmap_call);
3388 mutex_exit(&rp->r_statelock);
3389 nfs_free_delmapcall(delmap_call);
3390 return (1);
3391 }
3392 }
3393 }
3394 mutex_exit(&rp->r_statelock);
3395 return (0);
3396 }
3397