1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 *
24 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
25 * All rights reserved.
26 */
27
28 #include <sys/param.h>
29 #include <sys/types.h>
30 #include <sys/systm.h>
31 #include <sys/thread.h>
32 #include <sys/t_lock.h>
33 #include <sys/time.h>
34 #include <sys/vnode.h>
35 #include <sys/vfs.h>
36 #include <sys/errno.h>
37 #include <sys/buf.h>
38 #include <sys/stat.h>
39 #include <sys/cred.h>
40 #include <sys/kmem.h>
41 #include <sys/debug.h>
42 #include <sys/dnlc.h>
43 #include <sys/vmsystm.h>
44 #include <sys/flock.h>
45 #include <sys/share.h>
46 #include <sys/cmn_err.h>
47 #include <sys/tiuser.h>
48 #include <sys/sysmacros.h>
49 #include <sys/callb.h>
50 #include <sys/acl.h>
51 #include <sys/kstat.h>
52 #include <sys/signal.h>
53 #include <sys/list.h>
54 #include <sys/zone.h>
55
56 #include <rpc/types.h>
57 #include <rpc/xdr.h>
58 #include <rpc/auth.h>
59 #include <rpc/clnt.h>
60
61 #include <nfs/nfs.h>
62 #include <nfs/nfs_clnt.h>
63
64 #include <nfs/rnode.h>
65 #include <nfs/nfs_acl.h>
66 #include <nfs/lm.h>
67
68 #include <vm/hat.h>
69 #include <vm/as.h>
70 #include <vm/page.h>
71 #include <vm/pvn.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_vn.h>
75
76 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
77 cred_t *);
78 static int nfs_getattr_cache(vnode_t *, struct vattr *);
79 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
80
81 struct mi_globals {
82 kmutex_t mig_lock; /* lock protecting mig_list */
83 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */
84 boolean_t mig_destructor_called;
85 };
86
87 static zone_key_t mi_list_key;
88
89 /* Debugging flag for PC file shares. */
90 extern int share_debug;
91
92 /*
93 * Attributes caching:
94 *
95 * Attributes are cached in the rnode in struct vattr form.
96 * There is a time associated with the cached attributes (r_attrtime)
97 * which tells whether the attributes are valid. The time is initialized
98 * to the difference between current time and the modify time of the vnode
99 * when new attributes are cached. This allows the attributes for
100 * files that have changed recently to be timed out sooner than for files
101 * that have not changed for a long time. There are minimum and maximum
102 * timeout values that can be set per mount point.
103 */
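/*
 * For example (illustrative numbers only): with acregmin=3s and
 * acregmax=60s, a regular file whose last detected change was 10
 * seconds ago gets roughly a 10 second attribute timeout, while one
 * that last changed several minutes ago is clamped to the 60 second
 * maximum.  See nfs_attrcache_va() for the actual computation.
 */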
104
105 int
106 nfs_waitfor_purge_complete(vnode_t *vp)
107 {
108 rnode_t *rp;
109 k_sigset_t smask;
110
111 rp = VTOR(vp);
112 if (rp->r_serial != NULL && rp->r_serial != curthread) {
113 mutex_enter(&rp->r_statelock);
114 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
115 while (rp->r_serial != NULL) {
116 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
117 sigunintr(&smask);
118 mutex_exit(&rp->r_statelock);
119 return (EINTR);
120 }
121 }
122 sigunintr(&smask);
123 mutex_exit(&rp->r_statelock);
124 }
125 return (0);
126 }
127
128 /*
129 * Validate caches by checking cached attributes. If the cached
130 * attributes have timed out, then get new attributes from the server.
131 * As a side effect, this will do cache invalidation if the attributes
132 * have changed.
133 *
134 * If the attributes have not timed out and if there is a cache
135 * invalidation being done by some other thread, then wait until that
136 * thread has completed the cache invalidation.
137 */
138 int
139 nfs_validate_caches(vnode_t *vp, cred_t *cr)
140 {
141 int error;
142 struct vattr va;
143
144 if (ATTRCACHE_VALID(vp)) {
145 error = nfs_waitfor_purge_complete(vp);
146 if (error)
147 return (error);
148 return (0);
149 }
150
151 va.va_mask = AT_ALL;
152 return (nfs_getattr_otw(vp, &va, cr));
153 }
154
155 /*
156 * Validate caches by checking cached attributes. If the cached
157 * attributes have timed out, then get new attributes from the server.
158 * As a side effect, this will do cache invalidation if the attributes
159 * have changed.
160 *
161 * If the attributes have not timed out and if there is a cache
162 * invalidation being done by some other thread, then wait until that
163 * thread has completed the cache invalidation.
164 */
165 int
166 nfs3_validate_caches(vnode_t *vp, cred_t *cr)
167 {
168 int error;
169 struct vattr va;
170
171 if (ATTRCACHE_VALID(vp)) {
172 error = nfs_waitfor_purge_complete(vp);
173 if (error)
174 return (error);
175 return (0);
176 }
177
178 va.va_mask = AT_ALL;
179 return (nfs3_getattr_otw(vp, &va, cr));
180 }
181
182 /*
183 * Purge all of the various NFS `data' caches.
184 */
185 void
186 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
187 {
188 rnode_t *rp;
189 char *contents;
190 int size;
191 int error;
192
193 /*
194 * Purge the DNLC for any entries which refer to this file.
195 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
196 */
197 rp = VTOR(vp);
198 mutex_enter(&rp->r_statelock);
199 if (vp->v_count > 1 &&
200 (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
201 !(rp->r_flags & RINDNLCPURGE)) {
202 /*
203 * Set the RINDNLCPURGE flag to prevent recursive entry
204 * into dnlc_purge_vp()
205 */
206 if (vp->v_type == VDIR)
207 rp->r_flags |= RINDNLCPURGE;
208 mutex_exit(&rp->r_statelock);
209 dnlc_purge_vp(vp);
210 mutex_enter(&rp->r_statelock);
211 if (rp->r_flags & RINDNLCPURGE)
212 rp->r_flags &= ~RINDNLCPURGE;
213 }
214
215 /*
216 * Purge the readlink response cache.
217 */
218 contents = rp->r_symlink.contents;
219 size = rp->r_symlink.size;
220 rp->r_symlink.contents = NULL;
221 mutex_exit(&rp->r_statelock);
222
223 if (contents != NULL) {
224
225 kmem_free((void *)contents, size);
226 }
227
228 /*
229 * Flush the page cache.
230 */
231 if (vn_has_cached_data(vp)) {
232 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
233 if (error && (error == ENOSPC || error == EDQUOT)) {
234 mutex_enter(&rp->r_statelock);
235 if (!rp->r_error)
236 rp->r_error = error;
237 mutex_exit(&rp->r_statelock);
238 }
239 }
240
241 /*
242 * Flush the readdir response cache.
243 */
244 if (HAVE_RDDIR_CACHE(rp))
245 nfs_purge_rddir_cache(vp);
246 }
247
248 /*
249 * Purge the readdir cache of all entries
250 */
251 void
252 nfs_purge_rddir_cache(vnode_t *vp)
253 {
254 rnode_t *rp;
255 rddir_cache *rdc;
256 rddir_cache *nrdc;
257
258 rp = VTOR(vp);
259 top:
260 mutex_enter(&rp->r_statelock);
261 rp->r_direof = NULL;
262 rp->r_flags &= ~RLOOKUP;
263 rp->r_flags |= RREADDIRPLUS;
264 rdc = avl_first(&rp->r_dir);
265 while (rdc != NULL) {
266 nrdc = AVL_NEXT(&rp->r_dir, rdc);
267 avl_remove(&rp->r_dir, rdc);
268 rddir_cache_rele(rdc);
269 rdc = nrdc;
270 }
271 mutex_exit(&rp->r_statelock);
272 }
273
274 /*
275 * Do a cache check based on the post-operation attributes.
276 * Then make them the new cached attributes. If no attributes
277 * were returned, then mark the attributes as timed out.
278 */
279 void
280 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
281 {
282 vattr_t attr;
283
284 if (!poap->attributes) {
285 PURGE_ATTRCACHE(vp);
286 return;
287 }
288 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
289 }
290
291 /*
292 * Same as above, but using a vattr
293 */
294 void
295 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
296 cred_t *cr)
297 {
298 if (!poap->attributes) {
299 PURGE_ATTRCACHE(vp);
300 return;
301 }
302 nfs_attr_cache(vp, poap->fres.vap, t, cr);
303 }
304
305 /*
306 * Do a cache check based on the weak cache consistency attributes.
307 * These consist of a small set of pre-operation attributes and the
308 * full set of post-operation attributes.
309 *
310 * If we are given the pre-operation attributes, then use them to
311 * check the validity of the various caches. Then, if we got the
312 * post-operation attributes, make them the new cached attributes.
313 * If we didn't get the post-operation attributes, then mark the
314 * attribute cache as timed out so that the next reference will
315 * cause a GETATTR to the server to refresh with the current
316 * attributes.
317 *
318 * Otherwise, if we didn't get the pre-operation attributes, but
319 * we did get the post-operation attributes, then use these
320 * attributes to check the validity of the various caches. This
321 * will probably cause a flush of the caches because if the
322 * operation succeeded, the attributes of the object were changed
323 * in some way from the old post-operation attributes. This
324 * should be okay because it is the safe thing to do. After
325 * checking the data caches, then we make these the new cached
326 * attributes.
327 *
328 * Otherwise, we didn't get either the pre- or post-operation
329 * attributes. Simply mark the attribute cache as timed out so
330 * the next reference will cause a GETATTR to the server to
331 * refresh with the current attributes.
332 *
333 * If an error occurred trying to convert the over the wire
334 * attributes to a vattr, then simply mark the attribute cache as
335 * timed out.
336 */
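/*
 * Summarized, the cases handled below are:
 *
 *	pre-op attrs	post-op attrs	action
 *	------------	-------------	------
 *	returned	returned	check caches against pre-op attrs,
 *					then cache the post-op attrs
 *	not returned	returned	check caches against post-op attrs,
 *					then cache the post-op attrs
 *	either		not returned	mark the attribute cache timed out
 *
 * A failure to convert the post-op attributes to a vattr also just
 * marks the attribute cache as timed out.
 */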
337 void
338 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
339 {
340 vattr_t bva;
341 vattr_t ava;
342
343 if (wccp->after.attributes) {
344 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
345 PURGE_ATTRCACHE(vp);
346 return;
347 }
348 if (wccp->before.attributes) {
349 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
350 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
351 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
352 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
353 bva.va_size = wccp->before.attr.size;
354 nfs3_attr_cache(vp, &bva, &ava, t, cr);
355 } else
356 nfs_attr_cache(vp, &ava, t, cr);
357 } else {
358 PURGE_ATTRCACHE(vp);
359 }
360 }
361
362 /*
363 * Set attributes cache for given vnode using nfsattr.
364 *
365 * This routine does not do cache validation with the attributes.
366 *
367 * If an error occurred trying to convert the over the wire
368 * attributes to a vattr, then simply mark the attribute cache as
369 * timed out.
370 */
371 void
372 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
373 {
374 rnode_t *rp;
375 struct vattr va;
376
377 if (!nattr_to_vattr(vp, na, &va)) {
378 rp = VTOR(vp);
379 mutex_enter(&rp->r_statelock);
380 if (rp->r_mtime <= t)
381 nfs_attrcache_va(vp, &va);
382 mutex_exit(&rp->r_statelock);
383 } else {
384 PURGE_ATTRCACHE(vp);
385 }
386 }
387
388 /*
389 * Set attributes cache for given vnode using fattr3.
390 *
391 * This routine does not do cache validation with the attributes.
392 *
393 * If an error occurred trying to convert the over the wire
394 * attributes to a vattr, then simply mark the attribute cache as
395 * timed out.
396 */
397 void
398 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
399 {
400 rnode_t *rp;
401 struct vattr va;
402
403 if (!fattr3_to_vattr(vp, na, &va)) {
404 rp = VTOR(vp);
405 mutex_enter(&rp->r_statelock);
406 if (rp->r_mtime <= t)
407 nfs_attrcache_va(vp, &va);
408 mutex_exit(&rp->r_statelock);
409 } else {
410 PURGE_ATTRCACHE(vp);
411 }
412 }
413
414 /*
415 * Do a cache check based on attributes returned over the wire. The
416 * new attributes are cached.
417 *
418 * If an error occurred trying to convert the over the wire attributes
419 * to a vattr, then just return that error.
420 *
421 * As a side effect, the vattr argument is filled in with the converted
422 * attributes.
423 */
424 int
425 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
426 cred_t *cr)
427 {
428 int error;
429
430 error = nattr_to_vattr(vp, na, vap);
431 if (error)
432 return (error);
433 nfs_attr_cache(vp, vap, t, cr);
434 return (0);
435 }
436
437 /*
438 * Do a cache check based on attributes returned over the wire. The
439 * new attributes are cached.
440 *
441 * If an error occurred trying to convert the over the wire attributes
442 * to a vattr, then just return that error.
443 *
444 * As a side effect, the vattr argument is filled in with the converted
445 * attributes.
446 */
447 int
448 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
449 {
450 int error;
451
452 error = fattr3_to_vattr(vp, na, vap);
453 if (error)
454 return (error);
455 nfs_attr_cache(vp, vap, t, cr);
456 return (0);
457 }
458
459 /*
460 * Use the passed in virtual attributes to check to see whether the
461 * data and metadata caches are valid, cache the new attributes, and
462 * then do the cache invalidation if required.
463 *
464 * The cache validation and caching of the new attributes is done
465 * atomically via the use of the mutex, r_statelock. If required,
466 * the cache invalidation is done atomically w.r.t. the cache
467 * validation and caching of the attributes via the pseudo lock,
468 * r_serial.
469 *
470 * This routine is used to do cache validation and attributes caching
471 * for operations with a single set of post operation attributes.
472 */
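/*
 * Locking sketch: r_statelock protects the attribute comparison and
 * the update done by nfs_attrcache_va(); r_serial (a thread pointer
 * used as a pseudo lock, with waiters sleeping on r_cv) serializes
 * the cache purge that may follow, so that only one thread at a time
 * invalidates the caches for a given rnode.
 */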
473 void
474 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
475 {
476 rnode_t *rp;
477 int mtime_changed = 0;
478 int ctime_changed = 0;
479 vsecattr_t *vsp;
480 int was_serial;
481 len_t preattr_rsize;
482 boolean_t writeattr_set = B_FALSE;
483 boolean_t cachepurge_set = B_FALSE;
484
485 rp = VTOR(vp);
486
487 mutex_enter(&rp->r_statelock);
488
489 if (rp->r_serial != curthread) {
490 klwp_t *lwp = ttolwp(curthread);
491
492 was_serial = 0;
493 if (lwp != NULL)
494 lwp->lwp_nostop++;
495 while (rp->r_serial != NULL) {
496 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
497 mutex_exit(&rp->r_statelock);
498 if (lwp != NULL)
499 lwp->lwp_nostop--;
500 return;
501 }
502 }
503 if (lwp != NULL)
504 lwp->lwp_nostop--;
505 } else
506 was_serial = 1;
507
508 if (rp->r_mtime > t) {
509 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
510 PURGE_ATTRCACHE_LOCKED(rp);
511 mutex_exit(&rp->r_statelock);
512 return;
513 }
514
515 /*
516 * A write thread, after writing data to a file on the remote
517 * server, always sets RWRITEATTR to indicate that the file on the
518 * remote server was modified with a WRITE operation and that the
519 * attribute cache has been marked as timed out. If RWRITEATTR
520 * is set, then do not check for mtime and ctime changes.
521 */
522 if (!(rp->r_flags & RWRITEATTR)) {
523 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
524 mtime_changed = 1;
525
526 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
527 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
528 ctime_changed = 1;
529 } else {
530 writeattr_set = B_TRUE;
531 }
532
533 preattr_rsize = rp->r_size;
534
535 nfs_attrcache_va(vp, vap);
536
537 /*
538 * If we have updated the file size in nfs_attrcache_va, then as soon
539 * as we drop statelock we will be in the middle of purging all
540 * our caches and updating them. It is possible for another
541 * thread to pick up this new file size and read in zeroed data.
542 * Stall other threads until the cache purge is complete.
543 */
544 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
545 /*
546 * If RWRITEATTR was set and we have updated the file
547 * size, the server's returned file size is not necessarily
548 * the result of this client's WRITE. We need to purge
549 * all caches.
550 */
551 if (writeattr_set)
552 mtime_changed = 1;
553
554 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
555 rp->r_flags |= RINCACHEPURGE;
556 cachepurge_set = B_TRUE;
557 }
558 }
559
560 if (!mtime_changed && !ctime_changed) {
561 mutex_exit(&rp->r_statelock);
562 return;
563 }
564
565 rp->r_serial = curthread;
566
567 mutex_exit(&rp->r_statelock);
568
569 if (mtime_changed)
570 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
571
572 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
573 mutex_enter(&rp->r_statelock);
574 rp->r_flags &= ~RINCACHEPURGE;
575 cv_broadcast(&rp->r_cv);
576 mutex_exit(&rp->r_statelock);
577 cachepurge_set = B_FALSE;
578 }
579
580 if (ctime_changed) {
581 (void) nfs_access_purge_rp(rp);
582 if (rp->r_secattr != NULL) {
583 mutex_enter(&rp->r_statelock);
584 vsp = rp->r_secattr;
585 rp->r_secattr = NULL;
586 mutex_exit(&rp->r_statelock);
587 if (vsp != NULL)
588 nfs_acl_free(vsp);
589 }
590 }
591
592 if (!was_serial) {
593 mutex_enter(&rp->r_statelock);
594 rp->r_serial = NULL;
595 cv_broadcast(&rp->r_cv);
596 mutex_exit(&rp->r_statelock);
597 }
598 }
599
600 /*
601 * Use the passed in "before" virtual attributes to check to see
602 * whether the data and metadata caches are valid, cache the "after"
603 * new attributes, and then do the cache invalidation if required.
604 *
605 * The cache validation and caching of the new attributes is done
606 * atomically via the use of the mutex, r_statelock. If required,
607 * the cache invalidation is done atomically w.r.t. the cache
608 * validation and caching of the attributes via the pseudo lock,
609 * r_serial.
610 *
611 * This routine is used to do cache validation and attributes caching
612 * for operations with both pre operation attributes and post operation
613 * attributes.
614 */
615 static void
616 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
617 cred_t *cr)
618 {
619 rnode_t *rp;
620 int mtime_changed = 0;
621 int ctime_changed = 0;
622 vsecattr_t *vsp;
623 int was_serial;
624 len_t preattr_rsize;
625 boolean_t writeattr_set = B_FALSE;
626 boolean_t cachepurge_set = B_FALSE;
627
628 rp = VTOR(vp);
629
630 mutex_enter(&rp->r_statelock);
631
632 if (rp->r_serial != curthread) {
633 klwp_t *lwp = ttolwp(curthread);
634
635 was_serial = 0;
636 if (lwp != NULL)
637 lwp->lwp_nostop++;
638 while (rp->r_serial != NULL) {
639 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
640 mutex_exit(&rp->r_statelock);
641 if (lwp != NULL)
642 lwp->lwp_nostop--;
643 return;
644 }
645 }
646 if (lwp != NULL)
647 lwp->lwp_nostop--;
648 } else
649 was_serial = 1;
650
651 if (rp->r_mtime > t) {
652 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size))
653 PURGE_ATTRCACHE_LOCKED(rp);
654 mutex_exit(&rp->r_statelock);
655 return;
656 }
657
658 /*
659 * A write thread, after writing data to a file on the remote
660 * server, always sets RWRITEATTR to indicate that the file on the
661 * remote server was modified with a WRITE operation and that the
662 * attribute cache has been marked as timed out. If RWRITEATTR
663 * is set, then do not check for mtime and ctime changes.
664 */
665 if (!(rp->r_flags & RWRITEATTR)) {
666 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
667 mtime_changed = 1;
668
669 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
670 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
671 ctime_changed = 1;
672 } else {
673 writeattr_set = B_TRUE;
674 }
675
676 preattr_rsize = rp->r_size;
677
678 nfs_attrcache_va(vp, avap);
679
680 /*
681 * If we have updated the file size in nfs_attrcache_va, then as soon
682 * as we drop statelock we will be in the middle of purging all
683 * our caches and updating them. It is possible for another
684 * thread to pick up this new file size and read in zeroed data.
685 * Stall other threads until the cache purge is complete.
686 */
687 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
688 /*
689 * If RWRITEATTR was set and we have updated the file
690 * size, the server's returned file size is not necessarily
691 * the result of this client's WRITE. We need to purge
692 * all caches.
693 */
694 if (writeattr_set)
695 mtime_changed = 1;
696
697 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
698 rp->r_flags |= RINCACHEPURGE;
699 cachepurge_set = B_TRUE;
700 }
701 }
702
703 if (!mtime_changed && !ctime_changed) {
704 mutex_exit(&rp->r_statelock);
705 return;
706 }
707
708 rp->r_serial = curthread;
709
710 mutex_exit(&rp->r_statelock);
711
712 if (mtime_changed)
713 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
714
715 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
716 mutex_enter(&rp->r_statelock);
717 rp->r_flags &= ~RINCACHEPURGE;
718 cv_broadcast(&rp->r_cv);
719 mutex_exit(&rp->r_statelock);
720 cachepurge_set = B_FALSE;
721 }
722
723 if (ctime_changed) {
724 (void) nfs_access_purge_rp(rp);
725 if (rp->r_secattr != NULL) {
726 mutex_enter(&rp->r_statelock);
727 vsp = rp->r_secattr;
728 rp->r_secattr = NULL;
729 mutex_exit(&rp->r_statelock);
730 if (vsp != NULL)
731 nfs_acl_free(vsp);
732 }
733 }
734
735 if (!was_serial) {
736 mutex_enter(&rp->r_statelock);
737 rp->r_serial = NULL;
738 cv_broadcast(&rp->r_cv);
739 mutex_exit(&rp->r_statelock);
740 }
741 }
742
743 /*
744 * Set attributes cache for given vnode using virtual attributes.
745 *
746 * Set the timeout value on the attribute cache and fill it
747 * with the passed in attributes.
748 *
749 * The caller must be holding r_statelock.
750 */
751 void
752 nfs_attrcache_va(vnode_t *vp, struct vattr *va)
753 {
754 rnode_t *rp;
755 mntinfo_t *mi;
756 hrtime_t delta;
757 hrtime_t now;
758
759 rp = VTOR(vp);
760
761 ASSERT(MUTEX_HELD(&rp->r_statelock));
762
763 now = gethrtime();
764
765 mi = VTOMI(vp);
766
767 /*
768 * Delta is the number of nanoseconds that we will
769 * cache the attributes of the file. It is based on
770 * the number of nanoseconds since the last time that
771 * we detected a change. The assumption is that files
772 * that changed recently are likely to change again.
773 * There are separate minimum and maximum values for regular files
774 * and for directories, which are enforced.
775 *
776 * Using the time since last change was detected
777 * eliminates direct comparison or calculation
778 * using mixed client and server times. NFS does
779 * not make any assumptions regarding the client
780 * and server clocks being synchronized.
781 */
782 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
783 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
784 va->va_size != rp->r_attr.va_size)
785 rp->r_mtime = now;
786
787 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
788 delta = 0;
789 else {
790 delta = now - rp->r_mtime;
791 if (vp->v_type == VDIR) {
792 if (delta < mi->mi_acdirmin)
793 delta = mi->mi_acdirmin;
794 else if (delta > mi->mi_acdirmax)
795 delta = mi->mi_acdirmax;
796 } else {
797 if (delta < mi->mi_acregmin)
798 delta = mi->mi_acregmin;
799 else if (delta > mi->mi_acregmax)
800 delta = mi->mi_acregmax;
801 }
802 }
803 rp->r_attrtime = now + delta;
804 rp->r_attr = *va;
805 /*
806 * Update the size of the file if there is no cached data or if
807 * the cached data is clean and there is no data being written
808 * out.
809 */
810 if (rp->r_size != va->va_size &&
811 (!vn_has_cached_data(vp) ||
812 (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
813 rp->r_size = va->va_size;
814 nfs_setswaplike(vp, va);
815 rp->r_flags &= ~RWRITEATTR;
816 }
817
818 /*
819 * Fill in attribute from the cache.
820 * If valid, then return 0 to indicate that no error occurred,
821 * otherwise return 1 to indicate that an error occurred.
822 */
823 static int
824 nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
825 {
826 rnode_t *rp;
827 uint_t mask = vap->va_mask;
828
829 rp = VTOR(vp);
830 mutex_enter(&rp->r_statelock);
831 if (ATTRCACHE_VALID(vp)) {
832 /*
833 * Cached attributes are valid
834 */
835 *vap = rp->r_attr;
836 /*
837 * Set the caller's va_mask to the set of attributes
838 * that were requested ANDed with the attributes that
839 * are available. If attributes were requested that
840 * are not available, those bits must be turned off
841 * in the callers va_mask.
842 */
843 vap->va_mask &= mask;
844 mutex_exit(&rp->r_statelock);
845 return (0);
846 }
847 mutex_exit(&rp->r_statelock);
848 return (1);
849 }
850
851 /*
852 * Get attributes over-the-wire and update attributes cache
853 * if no error occurred in the over-the-wire operation.
854 * Return 0 if successful, otherwise error.
855 */
856 int
857 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
858 {
859 int error;
860 struct nfsattrstat ns;
861 int douprintf;
862 mntinfo_t *mi;
863 failinfo_t fi;
864 hrtime_t t;
865
866 mi = VTOMI(vp);
867 fi.vp = vp;
868 fi.fhp = NULL; /* no need to update, filehandle not copied */
869 fi.copyproc = nfscopyfh;
870 fi.lookupproc = nfslookup;
871 fi.xattrdirproc = acl_getxattrdir2;
872
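	/*
	 * Note: acl_getattr2_otw() may clear MI_ACL in the mntinfo if it
	 * finds that the server does not support the NFS_ACL protocol;
	 * the second MI_ACL check below lets that case fall through to
	 * the regular RFS_GETATTR call.
	 */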
873 if (mi->mi_flags & MI_ACL) {
874 error = acl_getattr2_otw(vp, vap, cr);
875 if (mi->mi_flags & MI_ACL)
876 return (error);
877 }
878
879 douprintf = 1;
880
881 t = gethrtime();
882
883 error = rfs2call(mi, RFS_GETATTR,
884 xdr_fhandle, (caddr_t)VTOFH(vp),
885 xdr_attrstat, (caddr_t)&ns, cr,
886 &douprintf, &ns.ns_status, 0, &fi);
887
888 if (!error) {
889 error = geterrno(ns.ns_status);
890 if (!error)
891 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
892 else {
893 PURGE_STALE_FH(error, vp, cr);
894 }
895 }
896
897 return (error);
898 }
899
900 /*
901 * Return either cached or remote attributes. If we get remote attributes,
902 * use them to check and invalidate caches, then cache the new attributes.
903 */
904 int
905 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
906 {
907 int error;
908 rnode_t *rp;
909
910 /*
911 * If we've got cached attributes, we're done, otherwise go
912 * to the server to get attributes, which will update the cache
913 * in the process.
914 */
915 error = nfs_getattr_cache(vp, vap);
916 if (error)
917 error = nfs_getattr_otw(vp, vap, cr);
918
919 /* Return the client's view of file size */
920 rp = VTOR(vp);
921 mutex_enter(&rp->r_statelock);
922 vap->va_size = rp->r_size;
923 mutex_exit(&rp->r_statelock);
924
925 return (error);
926 }
927
928 /*
929 * Get attributes over-the-wire and update attributes cache
930 * if no error occurred in the over-the-wire operation.
931 * Return 0 if successful, otherwise error.
932 */
933 int
934 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
935 {
936 int error;
937 GETATTR3args args;
938 GETATTR3vres res;
939 int douprintf;
940 failinfo_t fi;
941 hrtime_t t;
942
943 args.object = *VTOFH3(vp);
944 fi.vp = vp;
945 fi.fhp = (caddr_t)&args.object;
946 fi.copyproc = nfs3copyfh;
947 fi.lookupproc = nfs3lookup;
948 fi.xattrdirproc = acl_getxattrdir3;
949 res.fres.vp = vp;
950 res.fres.vap = vap;
951
952 douprintf = 1;
953
954 t = gethrtime();
955
956 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
957 xdr_nfs_fh3, (caddr_t)&args,
958 xdr_GETATTR3vres, (caddr_t)&res, cr,
959 &douprintf, &res.status, 0, &fi);
960
961 if (error)
962 return (error);
963
964 error = geterrno3(res.status);
965 if (error) {
966 PURGE_STALE_FH(error, vp, cr);
967 return (error);
968 }
969
970 /*
971 * Catch status codes that indicate fattr3 to vattr translation failure
972 */
973 if (res.fres.status)
974 return (res.fres.status);
975
976 nfs_attr_cache(vp, vap, t, cr);
977 return (0);
978 }
979
980 /*
981 * Return either cached or remote attributes. If we get remote attributes,
982 * use them to check and invalidate caches, then cache the new attributes.
983 */
984 int
985 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
986 {
987 int error;
988 rnode_t *rp;
989
990 /*
991 * If we've got cached attributes, we're done, otherwise go
992 * to the server to get attributes, which will update the cache
993 * in the process.
994 */
995 error = nfs_getattr_cache(vp, vap);
996 if (error)
997 error = nfs3_getattr_otw(vp, vap, cr);
998
999 /* Return the client's view of file size */
1000 rp = VTOR(vp);
1001 mutex_enter(&rp->r_statelock);
1002 vap->va_size = rp->r_size;
1003 mutex_exit(&rp->r_statelock);
1004
1005 return (error);
1006 }
1007
1008 vtype_t nf_to_vt[] = {
1009 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
1010 };
1011 /*
1012 * Convert NFS Version 2 over the network attributes to the local
1013 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
1014 * network representation and the local representation is done here.
1015 * Returns 0 for success, error if failed due to overflow.
1016 */
1017 int
1018 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
1019 {
1020 /* overflow in time attributes? */
1021 #ifndef _LP64
1022 if (!NFS2_FATTR_TIME_OK(na))
1023 return (EOVERFLOW);
1024 #endif
1025
1026 vap->va_mask = AT_ALL;
1027
1028 if (na->na_type < NFNON || na->na_type > NFSOC)
1029 vap->va_type = VBAD;
1030 else
1031 vap->va_type = nf_to_vt[na->na_type];
1032 vap->va_mode = na->na_mode;
1033 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
1034 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
1035 vap->va_fsid = vp->v_vfsp->vfs_dev;
1036 vap->va_nodeid = na->na_nodeid;
1037 vap->va_nlink = na->na_nlink;
1038 vap->va_size = na->na_size; /* keep for cache validation */
1039 /*
1040 * nfs protocol defines times as unsigned so don't extend sign,
1041 * unless sysadmin set nfs_allow_preepoch_time.
1042 */
1043 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
1044 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
1045 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
1046 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
1047 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
1048 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
1049 /*
1050 * Shannon's law - uncompress the received dev_t
1051 * if the top half of is zero indicating a response
1052 * if the top half of it is zero, indicating a response
1053 * `new style' OS sending the maj device of zero,
1054 * in which case the algorithm still works because the
1055 * fact that it is a new style server
1056 * is hidden by the minor device not being greater
1057 * than 255 (a requirement in this case).
1058 */
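	/*
	 * For example (assuming the traditional 8-bit major / 8-bit minor
	 * packing of an old-style dev_t), a received rdev of 0x0102 has a
	 * zero top half and is expanded by nfsv2_expdev() to major 1,
	 * minor 2; an rdev with bits set in the top 16 bits is treated as
	 * new-style and passed through expldev().
	 */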
1059 if ((na->na_rdev & 0xffff0000) == 0)
1060 vap->va_rdev = nfsv2_expdev(na->na_rdev);
1061 else
1062 vap->va_rdev = expldev(na->na_rdev);
1063
1064 vap->va_nblocks = na->na_blocks;
1065 switch (na->na_type) {
1066 case NFBLK:
1067 vap->va_blksize = DEV_BSIZE;
1068 break;
1069
1070 case NFCHR:
1071 vap->va_blksize = MAXBSIZE;
1072 break;
1073
1074 case NFSOC:
1075 default:
1076 vap->va_blksize = na->na_blocksize;
1077 break;
1078 }
1079 /*
1080 * This bit of ugliness is a hack to preserve the
1081 * over-the-wire protocols for named-pipe vnodes.
1082 * It remaps the special over-the-wire type to the
1083 * VFIFO type. (see note in nfs.h)
1084 */
1085 if (NA_ISFIFO(na)) {
1086 vap->va_type = VFIFO;
1087 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
1088 vap->va_rdev = 0;
1089 vap->va_blksize = na->na_blocksize;
1090 }
1091 vap->va_seq = 0;
1092 return (0);
1093 }
1094
1095 /*
1096 * Convert NFS Version 3 over the network attributes to the local
1097 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
1098 * network representation and the local representation is done here.
1099 */
1100 vtype_t nf3_to_vt[] = {
1101 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
1102 };
1103
1104 int
1105 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
1106 {
1107
1108 #ifndef _LP64
1109 /* overflow in time attributes? */
1110 if (!NFS3_FATTR_TIME_OK(na))
1111 return (EOVERFLOW);
1112 #endif
1113 if (!NFS3_SIZE_OK(na->size))
1114 /* file too big */
1115 return (EFBIG);
1116
1117 vap->va_mask = AT_ALL;
1118
1119 if (na->type < NF3REG || na->type > NF3FIFO)
1120 vap->va_type = VBAD;
1121 else
1122 vap->va_type = nf3_to_vt[na->type];
1123 vap->va_mode = na->mode;
1124 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
1125 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
1126 vap->va_fsid = vp->v_vfsp->vfs_dev;
1127 vap->va_nodeid = na->fileid;
1128 vap->va_nlink = na->nlink;
1129 vap->va_size = na->size;
1130
1131 /*
1132 * nfs protocol defines times as unsigned so don't extend sign,
1133 * unless sysadmin set nfs_allow_preepoch_time.
1134 */
1135 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
1136 vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
1137 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
1138 vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
1139 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
1140 vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
1141
1142 switch (na->type) {
1143 case NF3BLK:
1144 vap->va_rdev = makedevice(na->rdev.specdata1,
1145 na->rdev.specdata2);
1146 vap->va_blksize = DEV_BSIZE;
1147 vap->va_nblocks = 0;
1148 break;
1149 case NF3CHR:
1150 vap->va_rdev = makedevice(na->rdev.specdata1,
1151 na->rdev.specdata2);
1152 vap->va_blksize = MAXBSIZE;
1153 vap->va_nblocks = 0;
1154 break;
1155 case NF3REG:
1156 case NF3DIR:
1157 case NF3LNK:
1158 vap->va_rdev = 0;
1159 vap->va_blksize = MAXBSIZE;
1160 vap->va_nblocks = (u_longlong_t)
1161 ((na->used + (size3)DEV_BSIZE - (size3)1) /
1162 (size3)DEV_BSIZE);
1163 break;
1164 case NF3SOCK:
1165 case NF3FIFO:
1166 default:
1167 vap->va_rdev = 0;
1168 vap->va_blksize = MAXBSIZE;
1169 vap->va_nblocks = 0;
1170 break;
1171 }
1172 vap->va_seq = 0;
1173 return (0);
1174 }
1175
1176 /*
1177 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1178 * for the demand-based allocation of async threads per-mount. The
1179 * nfs_async_timeout is the amount of time a thread will live after it
1180 * becomes idle, unless new I/O requests are received before the thread
1181 * dies. See nfs_async_putpage and nfs_async_start.
1182 */
1183
1184 int nfs_async_timeout = -1; /* uninitialized */
1185
1186 static void nfs_async_start(struct vfs *);
1187 static void nfs_async_pgops_start(struct vfs *);
1188 static void nfs_async_common_start(struct vfs *, int);
1189
1190 static void
1191 free_async_args(struct nfs_async_reqs *args)
1192 {
1193 rnode_t *rp;
1194
1195 if (args->a_io != NFS_INACTIVE) {
1196 rp = VTOR(args->a_vp);
1197 mutex_enter(&rp->r_statelock);
1198 rp->r_count--;
1199 if (args->a_io == NFS_PUTAPAGE ||
1200 args->a_io == NFS_PAGEIO)
1201 rp->r_awcount--;
1202 cv_broadcast(&rp->r_cv);
1203 mutex_exit(&rp->r_statelock);
1204 VN_RELE(args->a_vp);
1205 }
1206 crfree(args->a_cred);
1207 kmem_free(args, sizeof (*args));
1208 }
1209
1210 /*
1211 * Cross-zone thread creation and NFS access are disallowed, yet fsflush() and
1212 * pageout(), running in the global zone, have legitimate reasons to do
1213 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1214 * use of a per-mount "asynchronous requests manager thread" which is
1215 * signaled by the various asynchronous work routines when there is
1216 * asynchronous work to be done. It is responsible for creating new
1217 * worker threads if necessary, and notifying existing worker threads
1218 * that there is work to be done.
1219 *
1220 * In other words, it will "take the specifications from the customers and
1221 * give them to the engineers."
1222 *
1223 * Worker threads die off of their own accord if they are no longer
1224 * needed.
1225 *
1226 * This thread is killed when the zone is going away or the filesystem
1227 * is being unmounted.
1228 */
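/*
 * Request flow sketch: the nfs_async_*() routines below queue an
 * nfs_async_reqs structure on the per-mount mi_async_reqs[] lists,
 * increment mi_async_req_count and signal mi_async_reqs_cv; this
 * manager thread then creates worker zthreads as needed and wakes
 * a worker via NFS_WAKE_ASYNC_WORKER() to service the request.
 */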
1229 void
1230 nfs_async_manager(vfs_t *vfsp)
1231 {
1232 callb_cpr_t cprinfo;
1233 mntinfo_t *mi;
1234 uint_t max_threads;
1235
1236 mi = VFTOMI(vfsp);
1237
1238 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1239 "nfs_async_manager");
1240
1241 mutex_enter(&mi->mi_async_lock);
1242 /*
1243 * We want to stash the max number of threads that this mount was
1244 * allowed so we can use it later when the variable is set to zero as
1245 * part of the zone/mount going away.
1246 *
1247 * We want to be able to create at least one thread to handle
1248 * asynchronous inactive calls.
1249 */
1250 max_threads = MAX(mi->mi_max_threads, 1);
1251 /*
1252 * We don't want to wait for mi_max_threads to go to zero, since that
1253 * happens as part of a failed unmount, but this thread should only
1254 * exit when the mount/zone is really going away.
1255 *
1256 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
1257 * attempted: the various _async_*() functions know to do things
1258 * inline if mi_max_threads == 0. Henceforth we just drain out the
1259 * outstanding requests.
1260 *
1261 * Note that we still create zthreads even if we notice the zone is
1262 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
1263 * shutdown sequence to take slightly longer in some cases, but
1264 * doesn't violate the protocol, as all threads will exit as soon as
1265 * they're done processing the remaining requests.
1266 */
1267 for (;;) {
1268 while (mi->mi_async_req_count > 0) {
1269 /*
1270 * Paranoia: If the mount started out having
1271 * (mi->mi_max_threads == 0), and the value was
1272 * later changed (via a debugger or somesuch),
1273 * we could be confused since we will think we
1274 * can't create any threads, and the calling
1275 * code (which looks at the current value of
1276 * mi->mi_max_threads, now non-zero) thinks we
1277 * can.
1278 *
1279 * So, because we're paranoid, we create threads
1280 * up to the maximum of the original and the
1281 * current value. This means that future
1282 * (debugger-induced) lowerings of
1283 * mi->mi_max_threads are ignored for our
1284 * purposes, but who told them they could change
1285 * random values on a live kernel anyhow?
1286 */
1287 if (mi->mi_threads[NFS_ASYNC_QUEUE] <
1288 MAX(mi->mi_max_threads, max_threads)) {
1289 mi->mi_threads[NFS_ASYNC_QUEUE]++;
1290 mutex_exit(&mi->mi_async_lock);
1291 VFS_HOLD(vfsp); /* hold for new thread */
1292 (void) zthread_create(NULL, 0, nfs_async_start,
1293 vfsp, 0, minclsyspri);
1294 mutex_enter(&mi->mi_async_lock);
1295 } else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] <
1296 NUM_ASYNC_PGOPS_THREADS) {
1297 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++;
1298 mutex_exit(&mi->mi_async_lock);
1299 VFS_HOLD(vfsp); /* hold for new thread */
1300 (void) zthread_create(NULL, 0,
1301 nfs_async_pgops_start, vfsp, 0,
1302 minclsyspri);
1303 mutex_enter(&mi->mi_async_lock);
1304 }
1305 NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1306 ASSERT(mi->mi_async_req_count != 0);
1307 mi->mi_async_req_count--;
1308 }
1309
1310 mutex_enter(&mi->mi_lock);
1311 if (mi->mi_flags & MI_ASYNC_MGR_STOP) {
1312 mutex_exit(&mi->mi_lock);
1313 break;
1314 }
1315 mutex_exit(&mi->mi_lock);
1316
1317 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1318 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1319 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1320 }
1321 /*
1322 * Let everyone know we're done.
1323 */
1324 mi->mi_manager_thread = NULL;
1325 cv_broadcast(&mi->mi_async_cv);
1326
1327 /*
1328 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1329 * since CALLB_CPR_EXIT is actually responsible for releasing
1330 * 'mi_async_lock'.
1331 */
1332 CALLB_CPR_EXIT(&cprinfo);
1333 VFS_RELE(vfsp); /* release thread's hold */
1334 zthread_exit();
1335 }
1336
1337 /*
1338 * Signal (and wait for) the async manager thread to clean up and go away.
1339 */
1340 void
1341 nfs_async_manager_stop(vfs_t *vfsp)
1342 {
1343 mntinfo_t *mi = VFTOMI(vfsp);
1344
1345 mutex_enter(&mi->mi_async_lock);
1346 mutex_enter(&mi->mi_lock);
1347 mi->mi_flags |= MI_ASYNC_MGR_STOP;
1348 mutex_exit(&mi->mi_lock);
1349 cv_broadcast(&mi->mi_async_reqs_cv);
1350 while (mi->mi_manager_thread != NULL)
1351 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1352 mutex_exit(&mi->mi_async_lock);
1353 }
1354
1355 int
1356 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1357 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1358 u_offset_t, caddr_t, struct seg *, cred_t *))
1359 {
1360 rnode_t *rp;
1361 mntinfo_t *mi;
1362 struct nfs_async_reqs *args;
1363
1364 rp = VTOR(vp);
1365 ASSERT(rp->r_freef == NULL);
1366
1367 mi = VTOMI(vp);
1368
1369 /*
1370 * If addr falls in a different segment, don't bother doing readahead.
1371 */
1372 if (addr >= seg->s_base + seg->s_size)
1373 return (-1);
1374
1375 /*
1376 * If we can't allocate a request structure, punt on the readahead.
1377 */
1378 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1379 return (-1);
1380
1381 /*
1382 * If a lock operation is pending, don't initiate any new
1383 * readaheads. Otherwise, bump r_count to indicate the new
1384 * asynchronous I/O.
1385 */
1386 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1387 kmem_free(args, sizeof (*args));
1388 return (-1);
1389 }
1390 mutex_enter(&rp->r_statelock);
1391 rp->r_count++;
1392 mutex_exit(&rp->r_statelock);
1393 nfs_rw_exit(&rp->r_lkserlock);
1394
1395 args->a_next = NULL;
1396 #ifdef DEBUG
1397 args->a_queuer = curthread;
1398 #endif
1399 VN_HOLD(vp);
1400 args->a_vp = vp;
1401 ASSERT(cr != NULL);
1402 crhold(cr);
1403 args->a_cred = cr;
1404 args->a_io = NFS_READ_AHEAD;
1405 args->a_nfs_readahead = readahead;
1406 args->a_nfs_blkoff = blkoff;
1407 args->a_nfs_seg = seg;
1408 args->a_nfs_addr = addr;
1409
1410 mutex_enter(&mi->mi_async_lock);
1411
1412 /*
1413 * If asyncio has been disabled, don't bother with the readahead.
1414 */
1415 if (mi->mi_max_threads == 0) {
1416 mutex_exit(&mi->mi_async_lock);
1417 goto noasync;
1418 }
1419
1420 /*
1421 * Link request structure into the async list and
1422 * wakeup async thread to do the i/o.
1423 */
1424 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
1425 mi->mi_async_reqs[NFS_READ_AHEAD] = args;
1426 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1427 } else {
1428 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
1429 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1430 }
1431
1432 if (mi->mi_io_kstats) {
1433 mutex_enter(&mi->mi_lock);
1434 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1435 mutex_exit(&mi->mi_lock);
1436 }
1437
1438 mi->mi_async_req_count++;
1439 ASSERT(mi->mi_async_req_count != 0);
1440 cv_signal(&mi->mi_async_reqs_cv);
1441 mutex_exit(&mi->mi_async_lock);
1442 return (0);
1443
1444 noasync:
1445 mutex_enter(&rp->r_statelock);
1446 rp->r_count--;
1447 cv_broadcast(&rp->r_cv);
1448 mutex_exit(&rp->r_statelock);
1449 VN_RELE(vp);
1450 crfree(cr);
1451 kmem_free(args, sizeof (*args));
1452 return (-1);
1453 }
1454
1455 int
1456 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1457 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1458 u_offset_t, size_t, int, cred_t *))
1459 {
1460 rnode_t *rp;
1461 mntinfo_t *mi;
1462 struct nfs_async_reqs *args;
1463
1464 ASSERT(flags & B_ASYNC);
1465 ASSERT(vp->v_vfsp != NULL);
1466
1467 rp = VTOR(vp);
1468 ASSERT(rp->r_count > 0);
1469
1470 mi = VTOMI(vp);
1471
1472 /*
1473 * If we can't allocate a request structure, do the putpage
1474 * operation synchronously in this thread's context.
1475 */
1476 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1477 goto noasync;
1478
1479 args->a_next = NULL;
1480 #ifdef DEBUG
1481 args->a_queuer = curthread;
1482 #endif
1483 VN_HOLD(vp);
1484 args->a_vp = vp;
1485 ASSERT(cr != NULL);
1486 crhold(cr);
1487 args->a_cred = cr;
1488 args->a_io = NFS_PUTAPAGE;
1489 args->a_nfs_putapage = putapage;
1490 args->a_nfs_pp = pp;
1491 args->a_nfs_off = off;
1492 args->a_nfs_len = (uint_t)len;
1493 args->a_nfs_flags = flags;
1494
1495 mutex_enter(&mi->mi_async_lock);
1496
1497 /*
1498 * If asyncio has been disabled, then make a synchronous request.
1499 * This check is done a second time in case async I/O was disabled
1500 * while this thread was blocked waiting for memory pressure to
1501 * reduce or for the queue to drain.
1502 */
1503 if (mi->mi_max_threads == 0) {
1504 mutex_exit(&mi->mi_async_lock);
1505 goto noasync;
1506 }
1507
1508 /*
1509 * Link request structure into the async list and
1510 * wakeup async thread to do the i/o.
1511 */
1512 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
1513 mi->mi_async_reqs[NFS_PUTAPAGE] = args;
1514 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1515 } else {
1516 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
1517 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1518 }
1519
1520 mutex_enter(&rp->r_statelock);
1521 rp->r_count++;
1522 rp->r_awcount++;
1523 mutex_exit(&rp->r_statelock);
1524
1525 if (mi->mi_io_kstats) {
1526 mutex_enter(&mi->mi_lock);
1527 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1528 mutex_exit(&mi->mi_lock);
1529 }
1530
1531 mi->mi_async_req_count++;
1532 ASSERT(mi->mi_async_req_count != 0);
1533 cv_signal(&mi->mi_async_reqs_cv);
1534 mutex_exit(&mi->mi_async_lock);
1535 return (0);
1536
1537 noasync:
1538 if (args != NULL) {
1539 VN_RELE(vp);
1540 crfree(cr);
1541 kmem_free(args, sizeof (*args));
1542 }
1543
1544 if (curproc == proc_pageout || curproc == proc_fsflush) {
1545 /*
1546 * If we get here in the context of the pageout/fsflush,
1547 * we refuse to do a sync write, because this may hang
1548 * pageout (and the machine). In this case, we just
1549 * re-mark the page as dirty and punt on the page.
1550 *
1551 * Make sure B_FORCE isn't set. We can re-mark the
1552 * pages as dirty and unlock the pages in one swoop by
1553 * passing in B_ERROR to pvn_write_done(). However,
1554 * we should make sure B_FORCE isn't set - we don't
1555 * want the page tossed before it gets written out.
1556 */
1557 if (flags & B_FORCE)
1558 flags &= ~(B_INVAL | B_FORCE);
1559 pvn_write_done(pp, flags | B_ERROR);
1560 return (0);
1561 }
1562 if (nfs_zone() != mi->mi_zone) {
1563 /*
1564 * So this was a cross-zone sync putpage. We pass in B_ERROR
1565 * to pvn_write_done() to re-mark the pages as dirty and unlock
1566 * them.
1567 *
1568 * We don't want to clear B_FORCE here as the caller presumably
1569 * knows what they're doing if they set it.
1570 */
1571 pvn_write_done(pp, flags | B_ERROR);
1572 return (EPERM);
1573 }
1574 return ((*putapage)(vp, pp, off, len, flags, cr));
1575 }
1576
1577 int
1578 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1579 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1580 size_t, int, cred_t *))
1581 {
1582 rnode_t *rp;
1583 mntinfo_t *mi;
1584 struct nfs_async_reqs *args;
1585
1586 ASSERT(flags & B_ASYNC);
1587 ASSERT(vp->v_vfsp != NULL);
1588
1589 rp = VTOR(vp);
1590 ASSERT(rp->r_count > 0);
1591
1592 mi = VTOMI(vp);
1593
1594 /*
1595 * If we can't allocate a request structure, do the pageio
1596 * request synchronously in this thread's context.
1597 */
1598 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1599 goto noasync;
1600
1601 args->a_next = NULL;
1602 #ifdef DEBUG
1603 args->a_queuer = curthread;
1604 #endif
1605 VN_HOLD(vp);
1606 args->a_vp = vp;
1607 ASSERT(cr != NULL);
1608 crhold(cr);
1609 args->a_cred = cr;
1610 args->a_io = NFS_PAGEIO;
1611 args->a_nfs_pageio = pageio;
1612 args->a_nfs_pp = pp;
1613 args->a_nfs_off = io_off;
1614 args->a_nfs_len = (uint_t)io_len;
1615 args->a_nfs_flags = flags;
1616
1617 mutex_enter(&mi->mi_async_lock);
1618
1619 /*
1620 * If asyncio has been disabled, then make a synchronous request.
1621 * This check is done a second time in case async I/O was disabled
1622 * while this thread was blocked waiting for memory pressure to
1623 * reduce or for the queue to drain.
1624 */
1625 if (mi->mi_max_threads == 0) {
1626 mutex_exit(&mi->mi_async_lock);
1627 goto noasync;
1628 }
1629
1630 /*
1631 * Link request structure into the async list and
1632 * wakeup async thread to do the i/o.
1633 */
1634 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1635 mi->mi_async_reqs[NFS_PAGEIO] = args;
1636 mi->mi_async_tail[NFS_PAGEIO] = args;
1637 } else {
1638 mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1639 mi->mi_async_tail[NFS_PAGEIO] = args;
1640 }
1641
1642 mutex_enter(&rp->r_statelock);
1643 rp->r_count++;
1644 rp->r_awcount++;
1645 mutex_exit(&rp->r_statelock);
1646
1647 if (mi->mi_io_kstats) {
1648 mutex_enter(&mi->mi_lock);
1649 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1650 mutex_exit(&mi->mi_lock);
1651 }
1652
1653 mi->mi_async_req_count++;
1654 ASSERT(mi->mi_async_req_count != 0);
1655 cv_signal(&mi->mi_async_reqs_cv);
1656 mutex_exit(&mi->mi_async_lock);
1657 return (0);
1658
1659 noasync:
1660 if (args != NULL) {
1661 VN_RELE(vp);
1662 crfree(cr);
1663 kmem_free(args, sizeof (*args));
1664 }
1665
1666 /*
1667 * If we can't do it ASYNC, for reads we do nothing (but clean up
1668 * the page list), for writes we do it synchronously, except for
1669 * proc_pageout/proc_fsflush as described below.
1670 */
1671 if (flags & B_READ) {
1672 pvn_read_done(pp, flags | B_ERROR);
1673 return (0);
1674 }
1675
1676 if (curproc == proc_pageout || curproc == proc_fsflush) {
1677 /*
1678 * If we get here in the context of the pageout/fsflush,
1679 * we refuse to do a sync write, because this may hang
1680 * pageout/fsflush (and the machine). In this case, we just
1681 * re-mark the page as dirty and punt on the page.
1682 *
1683 * Make sure B_FORCE isn't set. We can re-mark the
1684 * pages as dirty and unlock the pages in one swoop by
1685 * passing in B_ERROR to pvn_write_done(). However,
1686 * we should make sure B_FORCE isn't set - we don't
1687 * want the page tossed before it gets written out.
1688 */
1689 if (flags & B_FORCE)
1690 flags &= ~(B_INVAL | B_FORCE);
1691 pvn_write_done(pp, flags | B_ERROR);
1692 return (0);
1693 }
1694
1695 if (nfs_zone() != mi->mi_zone) {
1696 /*
1697 * So this was a cross-zone sync pageio. We pass in B_ERROR
1698 * to pvn_write_done() to re-mark the pages as dirty and unlock
1699 * them.
1700 *
1701 * We don't want to clear B_FORCE here as the caller presumably
1702 * knows what they're doing if they set it.
1703 */
1704 pvn_write_done(pp, flags | B_ERROR);
1705 return (EPERM);
1706 }
1707 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1708 }
1709
1710 void
1711 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1712 int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1713 {
1714 rnode_t *rp;
1715 mntinfo_t *mi;
1716 struct nfs_async_reqs *args;
1717
1718 rp = VTOR(vp);
1719 ASSERT(rp->r_freef == NULL);
1720
1721 mi = VTOMI(vp);
1722
1723 /*
1724 * If we can't allocate a request structure, do the readdir
1725 * operation synchronously in this thread's context.
1726 */
1727 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1728 goto noasync;
1729
1730 args->a_next = NULL;
1731 #ifdef DEBUG
1732 args->a_queuer = curthread;
1733 #endif
1734 VN_HOLD(vp);
1735 args->a_vp = vp;
1736 ASSERT(cr != NULL);
1737 crhold(cr);
1738 args->a_cred = cr;
1739 args->a_io = NFS_READDIR;
1740 args->a_nfs_readdir = readdir;
1741 args->a_nfs_rdc = rdc;
1742
1743 mutex_enter(&mi->mi_async_lock);
1744
1745 /*
1746 * If asyncio has been disabled, then make a synchronous request.
1747 */
1748 if (mi->mi_max_threads == 0) {
1749 mutex_exit(&mi->mi_async_lock);
1750 goto noasync;
1751 }
1752
1753 /*
1754 * Link request structure into the async list and
1755 * wakeup async thread to do the i/o.
1756 */
1757 if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1758 mi->mi_async_reqs[NFS_READDIR] = args;
1759 mi->mi_async_tail[NFS_READDIR] = args;
1760 } else {
1761 mi->mi_async_tail[NFS_READDIR]->a_next = args;
1762 mi->mi_async_tail[NFS_READDIR] = args;
1763 }
1764
1765 mutex_enter(&rp->r_statelock);
1766 rp->r_count++;
1767 mutex_exit(&rp->r_statelock);
1768
1769 if (mi->mi_io_kstats) {
1770 mutex_enter(&mi->mi_lock);
1771 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1772 mutex_exit(&mi->mi_lock);
1773 }
1774
1775 mi->mi_async_req_count++;
1776 ASSERT(mi->mi_async_req_count != 0);
1777 cv_signal(&mi->mi_async_reqs_cv);
1778 mutex_exit(&mi->mi_async_lock);
1779 return;
1780
1781 noasync:
1782 if (args != NULL) {
1783 VN_RELE(vp);
1784 crfree(cr);
1785 kmem_free(args, sizeof (*args));
1786 }
1787
1788 rdc->entries = NULL;
1789 mutex_enter(&rp->r_statelock);
1790 ASSERT(rdc->flags & RDDIR);
1791 rdc->flags &= ~RDDIR;
1792 rdc->flags |= RDDIRREQ;
1793 /*
1794 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
1795 * is set, wake up the thread sleeping in cv_wait_sig().
1796 * The woken-up thread will reset the flag to RDDIR and will
1797 * continue with the readdir operation.
1798 */
1799 if (rdc->flags & RDDIRWAIT) {
1800 rdc->flags &= ~RDDIRWAIT;
1801 cv_broadcast(&rdc->cv);
1802 }
1803 mutex_exit(&rp->r_statelock);
1804 rddir_cache_rele(rdc);
1805 }
1806
1807 void
1808 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1809 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, cred_t *))
1810 {
1811 rnode_t *rp;
1812 mntinfo_t *mi;
1813 struct nfs_async_reqs *args;
1814 page_t *pp;
1815
1816 rp = VTOR(vp);
1817 mi = VTOMI(vp);
1818
1819 /*
1820 * If we can't allocate a request structure, do the commit
1821 * operation synchronously in this thread's context.
1822 */
1823 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1824 goto noasync;
1825
1826 args->a_next = NULL;
1827 #ifdef DEBUG
1828 args->a_queuer = curthread;
1829 #endif
1830 VN_HOLD(vp);
1831 args->a_vp = vp;
1832 ASSERT(cr != NULL);
1833 crhold(cr);
1834 args->a_cred = cr;
1835 args->a_io = NFS_COMMIT;
1836 args->a_nfs_commit = commit;
1837 args->a_nfs_plist = plist;
1838 args->a_nfs_offset = offset;
1839 args->a_nfs_count = count;
1840
1841 mutex_enter(&mi->mi_async_lock);
1842
1843 /*
1844 * If asyncio has been disabled, then make a synchronous request.
1845 * This check is done a second time in case async I/O was disabled
1846 * while this thread was blocked waiting for memory pressure to
1847 * reduce or for the queue to drain.
1848 */
1849 if (mi->mi_max_threads == 0) {
1850 mutex_exit(&mi->mi_async_lock);
1851 goto noasync;
1852 }
1853
1854 /*
1855 * Link request structure into the async list and
1856 * wakeup async thread to do the i/o.
1857 */
1858 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1859 mi->mi_async_reqs[NFS_COMMIT] = args;
1860 mi->mi_async_tail[NFS_COMMIT] = args;
1861 } else {
1862 mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1863 mi->mi_async_tail[NFS_COMMIT] = args;
1864 }
1865
1866 mutex_enter(&rp->r_statelock);
1867 rp->r_count++;
1868 mutex_exit(&rp->r_statelock);
1869
1870 if (mi->mi_io_kstats) {
1871 mutex_enter(&mi->mi_lock);
1872 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1873 mutex_exit(&mi->mi_lock);
1874 }
1875
1876 mi->mi_async_req_count++;
1877 ASSERT(mi->mi_async_req_count != 0);
1878 cv_signal(&mi->mi_async_reqs_cv);
1879 mutex_exit(&mi->mi_async_lock);
1880 return;
1881
1882 noasync:
1883 if (args != NULL) {
1884 VN_RELE(vp);
1885 crfree(cr);
1886 kmem_free(args, sizeof (*args));
1887 }
1888
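	/*
	 * If this thread is pageout or fsflush, or if it is running in a
	 * different zone than the mount, a synchronous over-the-wire commit
	 * is not an option here; instead, mark each page as needing a
	 * commit and unlock it so the commit can be done later.  Otherwise
	 * fall through and do the commit in this thread's context.
	 */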
1889 if (curproc == proc_pageout || curproc == proc_fsflush ||
1890 nfs_zone() != mi->mi_zone) {
1891 while (plist != NULL) {
1892 pp = plist;
1893 page_sub(&plist, pp);
1894 pp->p_fsdata = C_COMMIT;
1895 page_unlock(pp);
1896 }
1897 return;
1898 }
1899 (*commit)(vp, plist, offset, count, cr);
1900 }
1901
1902 void
1903 nfs_async_inactive(vnode_t *vp, cred_t *cr,
1904 void (*inactive)(vnode_t *, cred_t *, caller_context_t *))
1905 {
1906 mntinfo_t *mi;
1907 struct nfs_async_reqs *args;
1908
1909 mi = VTOMI(vp);
1910
1911 args = kmem_alloc(sizeof (*args), KM_SLEEP);
1912 args->a_next = NULL;
1913 #ifdef DEBUG
1914 args->a_queuer = curthread;
1915 #endif
1916 args->a_vp = vp;
1917 ASSERT(cr != NULL);
1918 crhold(cr);
1919 args->a_cred = cr;
1920 args->a_io = NFS_INACTIVE;
1921 args->a_nfs_inactive = inactive;
1922
1923 /*
1924 * Note that we don't check mi->mi_max_threads here, since we
1925 * *need* to get rid of this vnode regardless of whether someone
1926 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1927 *
1928 * The manager thread knows about this and is willing to create
1929 * at least one thread to accommodate us.
1930 */
1931 mutex_enter(&mi->mi_async_lock);
1932 if (mi->mi_manager_thread == NULL) {
1933 rnode_t *rp = VTOR(vp);
1934
1935 mutex_exit(&mi->mi_async_lock);
1936 crfree(cr); /* drop our reference */
1937 kmem_free(args, sizeof (*args));
1938 /*
1939 * We can't do an over-the-wire call since we're in the wrong
1940 * zone, so we need to clean up state as best we can and then
1941 * throw away the vnode.
1942 */
1943 mutex_enter(&rp->r_statelock);
1944 if (rp->r_unldvp != NULL) {
1945 vnode_t *unldvp;
1946 char *unlname;
1947 cred_t *unlcred;
1948
1949 unldvp = rp->r_unldvp;
1950 rp->r_unldvp = NULL;
1951 unlname = rp->r_unlname;
1952 rp->r_unlname = NULL;
1953 unlcred = rp->r_unlcred;
1954 rp->r_unlcred = NULL;
1955 mutex_exit(&rp->r_statelock);
1956
1957 VN_RELE(unldvp);
1958 kmem_free(unlname, MAXNAMELEN);
1959 crfree(unlcred);
1960 } else {
1961 mutex_exit(&rp->r_statelock);
1962 }
1963 /*
1964 * No need to explicitly throw away any cached pages. The
1965 * eventual rinactive() will attempt a synchronous
1966 * VOP_PUTPAGE() which will immediately fail since the request
1967 * is coming from the wrong zone, and then will proceed to call
1968 * nfs_invalidate_pages() which will clean things up for us.
1969 */
1970 rp_addfree(VTOR(vp), cr);
1971 return;
1972 }
1973
1974 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1975 mi->mi_async_reqs[NFS_INACTIVE] = args;
1976 } else {
1977 mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1978 }
1979 mi->mi_async_tail[NFS_INACTIVE] = args;
1980 /*
1981 * Don't increment r_count, since we're trying to get rid of the vnode.
1982 */
1983
1984 mi->mi_async_req_count++;
1985 ASSERT(mi->mi_async_req_count != 0);
1986 cv_signal(&mi->mi_async_reqs_cv);
1987 mutex_exit(&mi->mi_async_lock);
1988 }
1989
1990 static void
1991 nfs_async_start(struct vfs *vfsp)
1992 {
1993 nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE);
1994 }
1995
1996 static void
1997 nfs_async_pgops_start(struct vfs *vfsp)
1998 {
1999 nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE);
2000 }
2001
2002 /*
2003 * The async queues for each mounted file system are arranged as a
2004 * set of queues, one for each async i/o type. Requests are taken
2005 * from the queues in a round-robin fashion. A number of consecutive
2006 * requests are taken from each queue before moving on to the next
2007 * queue. This functionality may allow the NFS Version 2 server to do
2008 * write clustering, even if the client is mixing writes and reads
2009 * because it will take multiple write requests from the queue
2010 * before processing any of the other async i/o types.
2011 *
2012  * XXX The nfs_async_common_start thread is unsafe in light of the present
2013  *	model defined by cpr to suspend the system. Specifically, over-the-wire
2014  *	calls are cpr-unsafe. The thread should be reevaluated in
2015 * case of future updates to the cpr model.
2016 */
2017 static void
2018 nfs_async_common_start(struct vfs *vfsp, int async_queue)
2019 {
2020 struct nfs_async_reqs *args;
2021 mntinfo_t *mi = VFTOMI(vfsp);
2022 clock_t time_left = 1;
2023 callb_cpr_t cprinfo;
2024 int i;
2025 int async_types;
2026 kcondvar_t *async_work_cv;
2027
2028 if (async_queue == NFS_ASYNC_QUEUE) {
2029 async_types = NFS_ASYNC_TYPES;
2030 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE];
2031 } else {
2032 async_types = NFS_ASYNC_PGOPS_TYPES;
2033 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE];
2034 }
2035
2036 /*
2037 * Dynamic initialization of nfs_async_timeout to allow nfs to be
2038 * built in an implementation independent manner.
2039 */
2040 if (nfs_async_timeout == -1)
2041 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
2042
2043 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
2044
2045 mutex_enter(&mi->mi_async_lock);
2046 for (;;) {
2047 /*
2048 * Find the next queue containing an entry. We start
2049 * at the current queue pointer and then round robin
2050 * through all of them until we either find a non-empty
2051 * queue or have looked through all of them.
2052 */
2053 for (i = 0; i < async_types; i++) {
2054 args = *mi->mi_async_curr[async_queue];
2055 if (args != NULL)
2056 break;
2057 mi->mi_async_curr[async_queue]++;
2058 if (mi->mi_async_curr[async_queue] ==
2059 &mi->mi_async_reqs[async_types]) {
2060 mi->mi_async_curr[async_queue] =
2061 &mi->mi_async_reqs[0];
2062 }
2063 }
2064 /*
2065 		 * If we didn't find an entry, then block until woken up
2066 * again and then look through the queues again.
2067 */
2068 if (args == NULL) {
2069 /*
2070 * Exiting is considered to be safe for CPR as well
2071 */
2072 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2073
2074 /*
2075 * Wakeup thread waiting to unmount the file
2076 * system only if all async threads are inactive.
2077 *
2078 			 * If we've timed out and there's nothing to do,
2079 * then get rid of this thread.
2080 */
2081 if (mi->mi_max_threads == 0 || time_left <= 0) {
2082 --mi->mi_threads[async_queue];
2083
2084 if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
2085 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0)
2086 cv_signal(&mi->mi_async_cv);
2087 CALLB_CPR_EXIT(&cprinfo);
2088 VFS_RELE(vfsp); /* release thread's hold */
2089 zthread_exit();
2090 /* NOTREACHED */
2091 }
2092 time_left = cv_reltimedwait(async_work_cv,
2093 &mi->mi_async_lock, nfs_async_timeout,
2094 TR_CLOCK_TICK);
2095
2096 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
2097
2098 continue;
2099 }
2100 time_left = 1;
2101
2102 /*
2103 * Remove the request from the async queue and then
2104 * update the current async request queue pointer. If
2105 * the current queue is empty or we have removed enough
2106 * consecutive entries from it, then reset the counter
2107 * for this queue and then move the current pointer to
2108 * the next queue.
2109 */
2110 *mi->mi_async_curr[async_queue] = args->a_next;
2111 if (*mi->mi_async_curr[async_queue] == NULL ||
2112 --mi->mi_async_clusters[args->a_io] == 0) {
2113 mi->mi_async_clusters[args->a_io] =
2114 mi->mi_async_init_clusters;
2115 mi->mi_async_curr[async_queue]++;
2116 if (mi->mi_async_curr[async_queue] ==
2117 &mi->mi_async_reqs[async_types]) {
2118 mi->mi_async_curr[async_queue] =
2119 &mi->mi_async_reqs[0];
2120 }
2121 }
2122
2123 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2124 mutex_enter(&mi->mi_lock);
2125 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2126 mutex_exit(&mi->mi_lock);
2127 }
2128
2129 mutex_exit(&mi->mi_async_lock);
2130
2131 /*
2132 * Obtain arguments from the async request structure.
2133 */
2134 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2135 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2136 args->a_nfs_addr, args->a_nfs_seg,
2137 args->a_cred);
2138 } else if (args->a_io == NFS_PUTAPAGE) {
2139 (void) (*args->a_nfs_putapage)(args->a_vp,
2140 args->a_nfs_pp, args->a_nfs_off,
2141 args->a_nfs_len, args->a_nfs_flags,
2142 args->a_cred);
2143 } else if (args->a_io == NFS_PAGEIO) {
2144 (void) (*args->a_nfs_pageio)(args->a_vp,
2145 args->a_nfs_pp, args->a_nfs_off,
2146 args->a_nfs_len, args->a_nfs_flags,
2147 args->a_cred);
2148 } else if (args->a_io == NFS_READDIR) {
2149 (void) ((*args->a_nfs_readdir)(args->a_vp,
2150 args->a_nfs_rdc, args->a_cred));
2151 } else if (args->a_io == NFS_COMMIT) {
2152 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2153 args->a_nfs_offset, args->a_nfs_count,
2154 args->a_cred);
2155 } else if (args->a_io == NFS_INACTIVE) {
2156 (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL);
2157 }
2158
2159 /*
2160 * Now, release the vnode and free the credentials
2161 * structure.
2162 */
2163 free_async_args(args);
2164 /*
2165 		 * Reacquire the mutex; it will be needed at the top of the loop.
2166 */
2167 mutex_enter(&mi->mi_async_lock);
2168 }
2169 }
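
/*
 * A minimal, standalone sketch of the round-robin scheduling described
 * above: each request type has its own queue, and up to "cluster"
 * consecutive requests are taken from a queue before moving on to the
 * next one.  The toy names (toy_req, toy_next_request) are invented for
 * illustration, and a single cluster counter is used where the driver
 * keeps one per request type.
 */
#if 0	/* illustration only -- not compiled */
struct toy_req {
	struct toy_req	*next;
};

static struct toy_req *
toy_next_request(struct toy_req **queues, int nqueues, int *curr,
    int *remaining, int cluster)
{
	struct toy_req *req;
	int i;

	for (i = 0; i < nqueues; i++) {
		if (queues[*curr] != NULL)
			break;
		*curr = (*curr + 1) % nqueues;	/* skip an empty queue */
	}
	if ((req = queues[*curr]) == NULL)
		return (NULL);			/* every queue is empty */
	queues[*curr] = req->next;		/* dequeue from the head */
	if (queues[*curr] == NULL || --(*remaining) == 0) {
		*remaining = cluster;		/* reset the cluster count */
		*curr = (*curr + 1) % nqueues;	/* move on to the next queue */
	}
	return (req);
}
#endif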
2170
2171 void
2172 nfs_async_stop(struct vfs *vfsp)
2173 {
2174 mntinfo_t *mi = VFTOMI(vfsp);
2175
2176 /*
2177 * Wait for all outstanding async operations to complete and for the
2178 * worker threads to exit.
2179 */
2180 mutex_enter(&mi->mi_async_lock);
2181 mi->mi_max_threads = 0;
2182 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2183 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2184 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0)
2185 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2186 mutex_exit(&mi->mi_async_lock);
2187 }
2188
2189 /*
2190 * nfs_async_stop_sig:
2191  * Wait for all outstanding putpage operations to complete. If a signal
2192  * is delivered we will abort and return non-zero. If we can put all the
2193 * pages we will return 0. This routine is called from nfs_unmount and
2194 * nfs3_unmount to make these operations interruptible.
2195 */
2196 int
2197 nfs_async_stop_sig(struct vfs *vfsp)
2198 {
2199 mntinfo_t *mi = VFTOMI(vfsp);
2200 ushort_t omax;
2201 int rval;
2202
2203 /*
2204 * Wait for all outstanding async operations to complete and for the
2205 * worker threads to exit.
2206 */
2207 mutex_enter(&mi->mi_async_lock);
2208 omax = mi->mi_max_threads;
2209 mi->mi_max_threads = 0;
2210 /*
2211 * Tell all the worker threads to exit.
2212 */
2213 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2214 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2215 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) {
2216 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2217 break;
2218 }
2219 rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2220 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */
2221 if (rval)
2222 mi->mi_max_threads = omax;
2223 mutex_exit(&mi->mi_async_lock);
2224
2225 return (rval);
2226 }
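
/*
 * A minimal sketch of how an unmount path might use nfs_async_stop_sig();
 * the checks and error handling below are illustrative assumptions, not
 * the actual nfs_unmount()/nfs3_unmount() code.
 */
#if 0	/* illustration only -- not compiled */
static int
toy_umount(struct vfs *vfsp)
{
	/*
	 * Quiesce async work first; if a signal arrives while waiting,
	 * nfs_async_stop_sig() restores mi_max_threads and returns
	 * non-zero, and the unmount is abandoned with EINTR.
	 */
	if (nfs_async_stop_sig(vfsp))
		return (EINTR);

	/* ... flush dirty pages, release the root vnode, and so on ... */
	return (0);
}
#endif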
2227
2228 int
2229 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2230 {
2231 int pagecreate;
2232 int n;
2233 int saved_n;
2234 caddr_t saved_base;
2235 u_offset_t offset;
2236 int error;
2237 int sm_error;
2238 vnode_t *vp = RTOV(rp);
2239
2240 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2241 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2242 if (!vpm_enable) {
2243 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2244 }
2245
2246 /*
2247 * Move bytes in at most PAGESIZE chunks. We must avoid
2248 * spanning pages in uiomove() because page faults may cause
2249 * the cache to be invalidated out from under us. The r_size is not
2250 * updated until after the uiomove. If we push the last page of a
2251 * file before r_size is correct, we will lose the data written past
2252 * the current (and invalid) r_size.
2253 */
2254 do {
2255 offset = uio->uio_loffset;
2256 pagecreate = 0;
2257
2258 /*
2259 * n is the number of bytes required to satisfy the request
2260 * or the number of bytes to fill out the page.
2261 */
2262 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2263
2264 /*
2265 * Check to see if we can skip reading in the page
2266 * and just allocate the memory. We can do this
2267 * if we are going to rewrite the entire mapping
2268 * or if we are going to write to or beyond the current
2269 * end of file from the beginning of the mapping.
2270 *
2271 * The read of r_size is now protected by r_statelock.
2272 */
2273 mutex_enter(&rp->r_statelock);
2274 /*
2275 * When pgcreated is nonzero the caller has already done
2276 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2277 * segkpm this means we already have at least one page
2278 * created and mapped at base.
2279 */
2280 pagecreate = pgcreated ||
2281 ((offset & PAGEOFFSET) == 0 &&
2282 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2283
2284 mutex_exit(&rp->r_statelock);
2285 if (!vpm_enable && pagecreate) {
2286 /*
2287 * The last argument tells segmap_pagecreate() to
2288 * always lock the page, as opposed to sometimes
2289 * returning with the page locked. This way we avoid a
2290 * fault on the ensuing uiomove(), but also
2291 * more importantly (to fix bug 1094402) we can
2292 * call segmap_fault() to unlock the page in all
2293 * cases. An alternative would be to modify
2294 * segmap_pagecreate() to tell us when it is
2295 * locking a page, but that's a fairly major
2296 * interface change.
2297 */
2298 if (pgcreated == 0)
2299 (void) segmap_pagecreate(segkmap, base,
2300 (uint_t)n, 1);
2301 saved_base = base;
2302 saved_n = n;
2303 }
2304
2305 /*
2306 		 * The number of bytes of data in the last page cannot
2307 		 * be accurately determined while the page is being
2308 		 * uiomove'd to and the size of the file is being updated.
2309 * Thus, inform threads which need to know accurately
2310 * how much data is in the last page of the file. They
2311 * will not do the i/o immediately, but will arrange for
2312 * the i/o to happen later when this modify operation
2313 		 * has finished.
2314 */
2315 ASSERT(!(rp->r_flags & RMODINPROGRESS));
2316 mutex_enter(&rp->r_statelock);
2317 rp->r_flags |= RMODINPROGRESS;
2318 rp->r_modaddr = (offset & MAXBMASK);
2319 mutex_exit(&rp->r_statelock);
2320
2321 if (vpm_enable) {
2322 /*
2323 * Copy data. If new pages are created, part of
2324 			 * the page that is not written will be initialized
2325 * with zeros.
2326 */
2327 error = vpm_data_copy(vp, offset, n, uio,
2328 !pagecreate, NULL, 0, S_WRITE);
2329 } else {
2330 error = uiomove(base, n, UIO_WRITE, uio);
2331 }
2332
2333 /*
2334 * r_size is the maximum number of
2335 * bytes known to be in the file.
2336 * Make sure it is at least as high as the
2337 * first unwritten byte pointed to by uio_loffset.
2338 */
2339 mutex_enter(&rp->r_statelock);
2340 if (rp->r_size < uio->uio_loffset)
2341 rp->r_size = uio->uio_loffset;
2342 rp->r_flags &= ~RMODINPROGRESS;
2343 rp->r_flags |= RDIRTY;
2344 mutex_exit(&rp->r_statelock);
2345
2346 /* n = # of bytes written */
2347 n = (int)(uio->uio_loffset - offset);
2348
2349 if (!vpm_enable) {
2350 base += n;
2351 }
2352 tcount -= n;
2353 /*
2354 * If we created pages w/o initializing them completely,
2355 * we need to zero the part that wasn't set up.
2356 		 * This happens in most EOF write cases and if
2357 * we had some sort of error during the uiomove.
2358 */
2359 if (!vpm_enable && pagecreate) {
2360 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2361 (void) kzero(base, PAGESIZE - n);
2362
2363 if (pgcreated) {
2364 /*
2365 * Caller is responsible for this page,
2366 * it was not created in this loop.
2367 */
2368 pgcreated = 0;
2369 } else {
2370 /*
2371 * For bug 1094402: segmap_pagecreate locks
2372 * page. Unlock it. This also unlocks the
2373 * pages allocated by page_create_va() in
2374 * segmap_pagecreate().
2375 */
2376 sm_error = segmap_fault(kas.a_hat, segkmap,
2377 saved_base, saved_n,
2378 F_SOFTUNLOCK, S_WRITE);
2379 if (error == 0)
2380 error = sm_error;
2381 }
2382 }
2383 } while (tcount > 0 && error == 0);
2384
2385 return (error);
2386 }
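
/*
 * A standalone sketch of the chunking arithmetic used in writerp() above:
 * each pass moves at most enough bytes to reach the next page boundary, so
 * a single copy never spans pages.  The toy loop below is illustrative
 * only.
 */
#if 0	/* illustration only -- not compiled */
static void
toy_chunk_walk(u_offset_t offset, int tcount)
{
	int n;

	while (tcount > 0) {
		/* bytes up to the next page boundary, capped by tcount */
		n = (int)MIN(PAGESIZE - (offset & PAGEOFFSET), tcount);
		/* ... uiomove()-style copy of n bytes at offset ... */
		offset += n;
		tcount -= n;
	}
}
#endif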
2387
2388 int
2389 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2390 {
2391 rnode_t *rp;
2392 page_t *pp;
2393 u_offset_t eoff;
2394 u_offset_t io_off;
2395 size_t io_len;
2396 int error;
2397 int rdirty;
2398 int err;
2399
2400 rp = VTOR(vp);
2401 ASSERT(rp->r_count > 0);
2402
2403 if (!vn_has_cached_data(vp))
2404 return (0);
2405
2406 ASSERT(vp->v_type != VCHR);
2407
2408 /*
2409 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2410 * writes. B_FORCE is set to force the VM system to actually
2411 * invalidate the pages, even if the i/o failed. The pages
2412 * need to get invalidated because they can't be written out
2413 * because there isn't any space left on either the server's
2414 * file system or in the user's disk quota. The B_FREE bit
2415 * is cleared to avoid confusion as to whether this is a
2416 * request to place the page on the freelist or to destroy
2417 * it.
2418 */
2419 if ((rp->r_flags & ROUTOFSPACE) ||
2420 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2421 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2422
2423 if (len == 0) {
2424 /*
2425 * If doing a full file synchronous operation, then clear
2426 * the RDIRTY bit. If a page gets dirtied while the flush
2427 * is happening, then RDIRTY will get set again. The
2428 * RDIRTY bit must get cleared before the flush so that
2429 * we don't lose this information.
2430 *
2431 * If there are no full file async write operations
2432 * pending and RDIRTY bit is set, clear it.
2433 */
2434 if (off == (u_offset_t)0 &&
2435 !(flags & B_ASYNC) &&
2436 (rp->r_flags & RDIRTY)) {
2437 mutex_enter(&rp->r_statelock);
2438 rdirty = (rp->r_flags & RDIRTY);
2439 rp->r_flags &= ~RDIRTY;
2440 mutex_exit(&rp->r_statelock);
2441 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2442 mutex_enter(&rp->r_statelock);
2443 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) {
2444 rdirty = (rp->r_flags & RDIRTY);
2445 rp->r_flags &= ~RDIRTY;
2446 }
2447 mutex_exit(&rp->r_statelock);
2448 } else
2449 rdirty = 0;
2450
2451 /*
2452 * Search the entire vp list for pages >= off, and flush
2453 * the dirty pages.
2454 */
2455 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2456 flags, cr);
2457
2458 /*
2459 * If an error occurred and the file was marked as dirty
2460 * before and we aren't forcibly invalidating pages, then
2461 * reset the RDIRTY flag.
2462 */
2463 if (error && rdirty &&
2464 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2465 mutex_enter(&rp->r_statelock);
2466 rp->r_flags |= RDIRTY;
2467 mutex_exit(&rp->r_statelock);
2468 }
2469 } else {
2470 /*
2471 * Do a range from [off...off + len) looking for pages
2472 * to deal with.
2473 */
2474 error = 0;
2475 #ifdef lint
2476 io_len = 0;
2477 #endif
2478 eoff = off + len;
2479 mutex_enter(&rp->r_statelock);
2480 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2481 io_off += io_len) {
2482 mutex_exit(&rp->r_statelock);
2483 /*
2484 * If we are not invalidating, synchronously
2485 			 * If we are not invalidating, synchronously
2486 			 * freeing, or writing pages, use the routine
2487 * them from the free list.
2488 */
2489 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2490 pp = page_lookup(vp, io_off,
2491 (flags & (B_INVAL | B_FREE)) ?
2492 SE_EXCL : SE_SHARED);
2493 } else {
2494 pp = page_lookup_nowait(vp, io_off,
2495 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2496 }
2497
2498 if (pp == NULL || !pvn_getdirty(pp, flags))
2499 io_len = PAGESIZE;
2500 else {
2501 err = (*rp->r_putapage)(vp, pp, &io_off,
2502 &io_len, flags, cr);
2503 if (!error)
2504 error = err;
2505 /*
2506 * "io_off" and "io_len" are returned as
2507 * the range of pages we actually wrote.
2508 * This allows us to skip ahead more quickly
2509 * since several pages may've been dealt
2510 * with by this iteration of the loop.
2511 */
2512 }
2513 mutex_enter(&rp->r_statelock);
2514 }
2515 mutex_exit(&rp->r_statelock);
2516 }
2517
2518 return (error);
2519 }
2520
2521 void
2522 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2523 {
2524 rnode_t *rp;
2525
2526 rp = VTOR(vp);
2527 mutex_enter(&rp->r_statelock);
2528 while (rp->r_flags & RTRUNCATE)
2529 cv_wait(&rp->r_cv, &rp->r_statelock);
2530 rp->r_flags |= RTRUNCATE;
2531 if (off == (u_offset_t)0) {
2532 rp->r_flags &= ~RDIRTY;
2533 if (!(rp->r_flags & RSTALE))
2534 rp->r_error = 0;
2535 }
2536 rp->r_truncaddr = off;
2537 mutex_exit(&rp->r_statelock);
2538 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2539 B_INVAL | B_TRUNC, cr);
2540 mutex_enter(&rp->r_statelock);
2541 rp->r_flags &= ~RTRUNCATE;
2542 cv_broadcast(&rp->r_cv);
2543 mutex_exit(&rp->r_statelock);
2544 }
2545
2546 static int nfs_write_error_to_cons_only = 0;
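
/*
 * The format strings handed to MSG() below begin with "^".  When
 * nfs_write_error_to_cons_only is nonzero the "^" is kept, which tells
 * zcmn_err() to send the message to the console only; otherwise (x) + 1
 * skips the "^" so the message is also sent to the system log.
 */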
2547 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1)
2548
2549 /*
2550 * Print a file handle
2551 */
2552 void
2553 nfs_printfhandle(nfs_fhandle *fhp)
2554 {
2555 int *ip;
2556 char *buf;
2557 size_t bufsize;
2558 char *cp;
2559
2560 /*
2561 * 13 == "(file handle:"
2562 	 * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2563 * 1 == ' '
2564 * 8 == maximum strlen of "%x"
2565 * 3 == ")\n\0"
2566 */
2567 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2568 buf = kmem_alloc(bufsize, KM_NOSLEEP);
2569 if (buf == NULL)
2570 return;
2571
2572 cp = buf;
2573 (void) strcpy(cp, "(file handle:");
2574 while (*cp != '\0')
2575 cp++;
2576 for (ip = (int *)fhp->fh_buf;
2577 ip < (int *)&fhp->fh_buf[fhp->fh_len];
2578 ip++) {
2579 (void) sprintf(cp, " %x", *ip);
2580 while (*cp != '\0')
2581 cp++;
2582 }
2583 (void) strcpy(cp, ")\n");
2584
2585 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2586
2587 kmem_free(buf, bufsize);
2588 }
2589
2590 /*
2591 * Notify the system administrator that an NFS write error has
2592 * occurred.
2593 */
2594
2595 /* seconds between ENOSPC/EDQUOT messages */
2596 clock_t nfs_write_error_interval = 5;
2597
2598 void
2599 nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2600 {
2601 mntinfo_t *mi;
2602 clock_t now;
2603
2604 mi = VTOMI(vp);
2605 /*
2606 * In case of forced unmount or zone shutdown, do not print any
2607 * messages since it can flood the console with error messages.
2608 */
2609 if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2610 return;
2611
2612 /*
2613 * No use in flooding the console with ENOSPC
2614 * messages from the same file system.
2615 */
2616 now = ddi_get_lbolt();
2617 if ((error != ENOSPC && error != EDQUOT) ||
2618 now - mi->mi_printftime > 0) {
2619 zoneid_t zoneid = mi->mi_zone->zone_id;
2620
2621 #ifdef DEBUG
2622 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2623 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2624 #else
2625 nfs_perror(error, "NFS write error on host %s: %m.\n",
2626 VTOR(vp)->r_server->sv_hostname, NULL);
2627 #endif
2628 if (error == ENOSPC || error == EDQUOT) {
2629 zcmn_err(zoneid, CE_CONT,
2630 MSG("^File: userid=%d, groupid=%d\n"),
2631 crgetuid(cr), crgetgid(cr));
2632 if (crgetuid(CRED()) != crgetuid(cr) ||
2633 crgetgid(CRED()) != crgetgid(cr)) {
2634 zcmn_err(zoneid, CE_CONT,
2635 MSG("^User: userid=%d, groupid=%d\n"),
2636 crgetuid(CRED()), crgetgid(CRED()));
2637 }
2638 mi->mi_printftime = now +
2639 nfs_write_error_interval * hz;
2640 }
2641 nfs_printfhandle(&VTOR(vp)->r_fh);
2642 #ifdef DEBUG
2643 if (error == EACCES) {
2644 zcmn_err(zoneid, CE_CONT,
2645 MSG("^nfs_bio: cred is%s kcred\n"),
2646 cr == kcred ? "" : " not");
2647 }
2648 #endif
2649 }
2650 }
2651
2652 /* ARGSUSED */
2653 static void *
2654 nfs_mi_init(zoneid_t zoneid)
2655 {
2656 struct mi_globals *mig;
2657
2658 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2659 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2660 list_create(&mig->mig_list, sizeof (mntinfo_t),
2661 offsetof(mntinfo_t, mi_zone_node));
2662 mig->mig_destructor_called = B_FALSE;
2663 return (mig);
2664 }
2665
2666 /*
2667 * Callback routine to tell all NFS mounts in the zone to stop creating new
2668 * threads. Existing threads should exit.
2669 */
2670 /* ARGSUSED */
2671 static void
2672 nfs_mi_shutdown(zoneid_t zoneid, void *data)
2673 {
2674 struct mi_globals *mig = data;
2675 mntinfo_t *mi;
2676
2677 ASSERT(mig != NULL);
2678 again:
2679 mutex_enter(&mig->mig_lock);
2680 for (mi = list_head(&mig->mig_list); mi != NULL;
2681 mi = list_next(&mig->mig_list, mi)) {
2682
2683 /*
2684 * If we've done the shutdown work for this FS, skip.
2685 * Once we go off the end of the list, we're done.
2686 */
2687 if (mi->mi_flags & MI_DEAD)
2688 continue;
2689
2690 /*
2691 * We will do work, so not done. Get a hold on the FS.
2692 */
2693 VFS_HOLD(mi->mi_vfsp);
2694
2695 /*
2696 * purge the DNLC for this filesystem
2697 */
2698 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2699
2700 mutex_enter(&mi->mi_async_lock);
2701 /*
2702 * Tell existing async worker threads to exit.
2703 */
2704 mi->mi_max_threads = 0;
2705 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2706 /*
2707 * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2708 * getting ready to exit when it's done with its current work.
2709 * Also set MI_DEAD to note we've acted on this FS.
2710 */
2711 mutex_enter(&mi->mi_lock);
2712 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2713 mutex_exit(&mi->mi_lock);
2714 /*
2715 * Wake up the async manager thread.
2716 */
2717 cv_broadcast(&mi->mi_async_reqs_cv);
2718 mutex_exit(&mi->mi_async_lock);
2719
2720 /*
2721 * Drop lock and release FS, which may change list, then repeat.
2722 * We're done when every mi has been done or the list is empty.
2723 */
2724 mutex_exit(&mig->mig_lock);
2725 VFS_RELE(mi->mi_vfsp);
2726 goto again;
2727 }
2728 mutex_exit(&mig->mig_lock);
2729 }
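
/*
 * A minimal, standalone sketch of the restart-scan pattern used in
 * nfs_mi_shutdown() above: because the list lock must be dropped to do
 * per-entry work (and the list may change while it is dropped), each
 * entry is marked as handled and the scan restarts from the head until
 * a full pass finds nothing left to do.  The toy types below are
 * illustrative assumptions only.
 */
#if 0	/* illustration only -- not compiled */
struct toy_node {
	struct toy_node	*next;
	int		done;
};

static void
toy_walk_all(kmutex_t *lockp, struct toy_node **headp)
{
	struct toy_node *np;
again:
	mutex_enter(lockp);
	for (np = *headp; np != NULL; np = np->next) {
		if (np->done)
			continue;
		np->done = 1;
		mutex_exit(lockp);	/* drop the lock to do blocking work */
		/* ... per-entry work that may sleep or change the list ... */
		goto again;
	}
	mutex_exit(lockp);
}
#endif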
2730
2731 static void
2732 nfs_mi_free_globals(struct mi_globals *mig)
2733 {
2734 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2735 mutex_destroy(&mig->mig_lock);
2736 kmem_free(mig, sizeof (*mig));
2737
2738 }
2739
2740 /* ARGSUSED */
2741 static void
2742 nfs_mi_destroy(zoneid_t zoneid, void *data)
2743 {
2744 struct mi_globals *mig = data;
2745
2746 ASSERT(mig != NULL);
2747 mutex_enter(&mig->mig_lock);
2748 if (list_head(&mig->mig_list) != NULL) {
2749 /* Still waiting for VFS_FREEVFS() */
2750 mig->mig_destructor_called = B_TRUE;
2751 mutex_exit(&mig->mig_lock);
2752 return;
2753 }
2754 nfs_mi_free_globals(mig);
2755 }
2756
2757 /*
2758 * Add an NFS mount to the per-zone list of NFS mounts.
2759 */
2760 void
2761 nfs_mi_zonelist_add(mntinfo_t *mi)
2762 {
2763 struct mi_globals *mig;
2764
2765 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2766 mutex_enter(&mig->mig_lock);
2767 list_insert_head(&mig->mig_list, mi);
2768 mutex_exit(&mig->mig_lock);
2769 }
2770
2771 /*
2772 * Remove an NFS mount from the per-zone list of NFS mounts.
2773 */
2774 static void
2775 nfs_mi_zonelist_remove(mntinfo_t *mi)
2776 {
2777 struct mi_globals *mig;
2778
2779 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2780 mutex_enter(&mig->mig_lock);
2781 list_remove(&mig->mig_list, mi);
2782 /*
2783 * We can be called asynchronously by VFS_FREEVFS() after the zone
2784 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2785 * mi globals.
2786 */
2787 if (list_head(&mig->mig_list) == NULL &&
2788 mig->mig_destructor_called == B_TRUE) {
2789 nfs_mi_free_globals(mig);
2790 return;
2791 }
2792 mutex_exit(&mig->mig_lock);
2793 }
2794
2795 /*
2796 * NFS Client initialization routine. This routine should only be called
2797 * once. It performs the following tasks:
2798  * - Initialize all global locks
2799 * - Call sub-initialization routines (localize access to variables)
2800 */
2801 int
2802 nfs_clntinit(void)
2803 {
2804 #ifdef DEBUG
2805 static boolean_t nfs_clntup = B_FALSE;
2806 #endif
2807 int error;
2808
2809 #ifdef DEBUG
2810 ASSERT(nfs_clntup == B_FALSE);
2811 #endif
2812
2813 error = nfs_subrinit();
2814 if (error)
2815 return (error);
2816
2817 error = nfs_vfsinit();
2818 if (error) {
2819 /*
2820 * Cleanup nfs_subrinit() work
2821 */
2822 nfs_subrfini();
2823 return (error);
2824 }
2825 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2826 nfs_mi_destroy);
2827
2828 nfs4_clnt_init();
2829
2830 #ifdef DEBUG
2831 nfs_clntup = B_TRUE;
2832 #endif
2833
2834 return (0);
2835 }
2836
2837 /*
2838 * This routine is only called if the NFS Client has been initialized but
2839  * the module failed to be installed. This routine will clean up the previously
2840 * allocated/initialized work.
2841 */
2842 void
2843 nfs_clntfini(void)
2844 {
2845 (void) zone_key_delete(mi_list_key);
2846 nfs_subrfini();
2847 nfs_vfsfini();
2848 nfs4_clnt_fini();
2849 }
2850
2851 /*
2852 * nfs_lockrelease:
2853 *
2854 * Release any locks on the given vnode that are held by the current
2855 * process.
2856 */
2857 void
2858 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2859 {
2860 flock64_t ld;
2861 struct shrlock shr;
2862 char *buf;
2863 int remote_lock_possible;
2864 int ret;
2865
2866 ASSERT((uintptr_t)vp > KERNELBASE);
2867
2868 /*
2869 * Generate an explicit unlock operation for the entire file. As a
2870 * partial optimization, only generate the unlock if there is a
2871 * lock registered for the file. We could check whether this
2872 * particular process has any locks on the file, but that would
2873 * require the local locking code to provide yet another query
2874 * routine. Note that no explicit synchronization is needed here.
2875 * At worst, flk_has_remote_locks() will return a false positive,
2876 * in which case the unlock call wastes time but doesn't harm
2877 * correctness.
2878 *
2879 * In addition, an unlock request is generated if the process
2880 * is listed as possibly having a lock on the file because the
2881 * server and client lock managers may have gotten out of sync.
2882 * N.B. It is important to make sure nfs_remove_locking_id() is
2883 * called here even if flk_has_remote_locks(vp) reports true.
2884 * If it is not called and there is an entry on the process id
2885 * list, that entry will never get removed.
2886 */
2887 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2888 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2889 if (remote_lock_possible || flk_has_remote_locks(vp)) {
2890 ld.l_type = F_UNLCK; /* set to unlock entire file */
2891 ld.l_whence = 0; /* unlock from start of file */
2892 ld.l_start = 0;
2893 ld.l_len = 0; /* do entire file */
2894 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr,
2895 NULL);
2896
2897 if (ret != 0) {
2898 /*
2899 * If VOP_FRLOCK fails, make sure we unregister
2900 * local locks before we continue.
2901 */
2902 ld.l_pid = ttoproc(curthread)->p_pid;
2903 lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2904 #ifdef DEBUG
2905 nfs_perror(ret,
2906 "NFS lock release error on vp %p: %m.\n",
2907 (void *)vp, NULL);
2908 #endif
2909 }
2910
2911 /*
2912 * The call to VOP_FRLOCK may put the pid back on the
2913 * list. We need to remove it.
2914 */
2915 (void) nfs_remove_locking_id(vp, RLMPL_PID,
2916 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2917 }
2918
2919 /*
2920 * As long as the vp has a share matching our pid,
2921 * pluck it off and unshare it. There are circumstances in
2922 * which the call to nfs_remove_locking_id() may put the
2923 * owner back on the list, in which case we simply do a
2924 * redundant and harmless unshare.
2925 */
2926 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2927 while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2928 (char *)NULL, buf, &shr.s_own_len)) {
2929 shr.s_owner = buf;
2930 shr.s_access = 0;
2931 shr.s_deny = 0;
2932 shr.s_sysid = 0;
2933 shr.s_pid = curproc->p_pid;
2934
2935 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL);
2936 #ifdef DEBUG
2937 if (ret != 0) {
2938 nfs_perror(ret,
2939 "NFS share release error on vp %p: %m.\n",
2940 (void *)vp, NULL);
2941 }
2942 #endif
2943 }
2944 kmem_free(buf, MAX_SHR_OWNER_LEN);
2945 }
2946
2947 /*
2948 * nfs_lockcompletion:
2949 *
2950 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2951  * as non-cacheable (set the VNOCACHE bit).
2952 */
2953
2954 void
2955 nfs_lockcompletion(vnode_t *vp, int cmd)
2956 {
2957 #ifdef DEBUG
2958 rnode_t *rp = VTOR(vp);
2959
2960 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2961 #endif
2962
2963 if (cmd == F_SETLK || cmd == F_SETLKW) {
2964 if (!lm_safemap(vp)) {
2965 mutex_enter(&vp->v_lock);
2966 vp->v_flag |= VNOCACHE;
2967 mutex_exit(&vp->v_lock);
2968 } else {
2969 mutex_enter(&vp->v_lock);
2970 vp->v_flag &= ~VNOCACHE;
2971 mutex_exit(&vp->v_lock);
2972 }
2973 }
2974 /*
2975 * The cached attributes of the file are stale after acquiring
2976 * the lock on the file. They were updated when the file was
2977 * opened, but not updated when the lock was acquired. Therefore the
2978 * cached attributes are invalidated after the lock is obtained.
2979 */
2980 PURGE_ATTRCACHE(vp);
2981 }
2982
2983 /*
2984 * The lock manager holds state making it possible for the client
2985 * and server to be out of sync. For example, if the response from
2986 * the server granting a lock request is lost, the server will think
2987 * the lock is granted and the client will think the lock is lost.
2988 * The client can tell when it is not positive if it is in sync with
2989  * The client can tell when it is not certain whether it is in sync with
2990 *
2991 * To deal with this, a list of processes for which the client is
2992 * not sure if the server holds a lock is attached to the rnode.
2993 * When such a process closes the rnode, an unlock request is sent
2994 * to the server to unlock the entire file.
2995 *
2996  * The list is kept as a singly linked, NULL-terminated list.
2997 * Because it is only added to under extreme error conditions, the
2998 * list shouldn't get very big. DEBUG kernels print a message if
2999 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily
3000  * chosen to be 8, but can be tuned at runtime.
3001 */
3002 #ifdef DEBUG
3003 /* int nfs_lmpl_high_water = 8; */
3004 int nfs_lmpl_high_water = 128;
3005 int nfs_cnt_add_locking_id = 0;
3006 int nfs_len_add_locking_id = 0;
3007 #endif /* DEBUG */
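
/*
 * A standalone sketch of the pointer-to-pointer idiom used below to
 * maintain the singly linked, NULL-terminated r_lmpl list: lmplp always
 * points at the link that would have to be updated, so an entry can be
 * appended or unlinked without special-casing the list head.  The toy
 * type and function are illustrative assumptions only.
 */
#if 0	/* illustration only -- not compiled */
struct toy_lmpl {
	struct toy_lmpl	*next;
	int		key;
};

static int
toy_remove(struct toy_lmpl **headp, int key)
{
	struct toy_lmpl **lmplp = headp;
	struct toy_lmpl *cur;

	for (cur = *headp; cur != NULL; cur = cur->next) {
		if (cur->key == key) {
			*lmplp = cur->next;	/* unlink without a prev ptr */
			kmem_free(cur, sizeof (*cur));
			return (1);
		}
		lmplp = &cur->next;
	}
	return (0);			/* not found */
}
#endif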
3008
3009 /*
3010 * Record that the nfs lock manager server may be holding a lock on
3011 * a vnode for a process.
3012 *
3013 * Because the nfs lock manager server holds state, it is possible
3014 * for the server to get out of sync with the client. This routine is called
3015 * from the client when it is no longer sure if the server is in sync
3016 * with the client. nfs_lockrelease() will then notice this and send
3017  * an unlock request when the file is closed.
3018 */
3019 void
3020 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
3021 {
3022 rnode_t *rp;
3023 lmpl_t *new;
3024 lmpl_t *cur;
3025 lmpl_t **lmplp;
3026 #ifdef DEBUG
3027 int list_len = 1;
3028 #endif /* DEBUG */
3029
3030 #ifdef DEBUG
3031 ++nfs_cnt_add_locking_id;
3032 #endif /* DEBUG */
3033 /*
3034 * allocate new lmpl_t now so we don't sleep
3035 * later after grabbing mutexes
3036 */
3037 ASSERT(len < MAX_SHR_OWNER_LEN);
3038 new = kmem_alloc(sizeof (*new), KM_SLEEP);
3039 new->lmpl_type = type;
3040 new->lmpl_pid = pid;
3041 new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
3042 bcopy(id, new->lmpl_owner, len);
3043 new->lmpl_own_len = len;
3044 new->lmpl_next = (lmpl_t *)NULL;
3045 #ifdef DEBUG
3046 if (type == RLMPL_PID) {
3047 ASSERT(len == sizeof (pid_t));
3048 ASSERT(pid == *(pid_t *)new->lmpl_owner);
3049 } else {
3050 ASSERT(type == RLMPL_OWNER);
3051 }
3052 #endif
3053
3054 rp = VTOR(vp);
3055 mutex_enter(&rp->r_statelock);
3056
3057 /*
3058 * Add this id to the list for this rnode only if the
3059 * rnode is active and the id is not already there.
3060 */
3061 ASSERT(rp->r_flags & RHASHED);
3062 lmplp = &(rp->r_lmpl);
3063 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3064 if (cur->lmpl_pid == pid &&
3065 cur->lmpl_type == type &&
3066 cur->lmpl_own_len == len &&
3067 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
3068 kmem_free(new->lmpl_owner, len);
3069 kmem_free(new, sizeof (*new));
3070 break;
3071 }
3072 lmplp = &cur->lmpl_next;
3073 #ifdef DEBUG
3074 ++list_len;
3075 #endif /* DEBUG */
3076 }
3077 if (cur == (lmpl_t *)NULL) {
3078 *lmplp = new;
3079 #ifdef DEBUG
3080 if (list_len > nfs_len_add_locking_id) {
3081 nfs_len_add_locking_id = list_len;
3082 }
3083 if (list_len > nfs_lmpl_high_water) {
3084 cmn_err(CE_WARN, "nfs_add_locking_id: long list "
3085 "vp=%p is %d", (void *)vp, list_len);
3086 }
3087 #endif /* DEBUG */
3088 }
3089
3090 #ifdef DEBUG
3091 if (share_debug) {
3092 int nitems = 0;
3093 int npids = 0;
3094 int nowners = 0;
3095
3096 /*
3097 		 * Count the number of things on r_lmpl after the add.
3098 */
3099 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3100 cur = cur->lmpl_next) {
3101 nitems++;
3102 if (cur->lmpl_type == RLMPL_PID) {
3103 npids++;
3104 } else if (cur->lmpl_type == RLMPL_OWNER) {
3105 nowners++;
3106 } else {
3107 cmn_err(CE_PANIC, "nfs_add_locking_id: "
3108 "unrecognized lmpl_type %d",
3109 cur->lmpl_type);
3110 }
3111 }
3112
3113 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
3114 "OWNs = %d items left on r_lmpl\n",
3115 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
3116 }
3117 #endif
3118
3119 mutex_exit(&rp->r_statelock);
3120 }
3121
3122 /*
3123 * Remove an id from the lock manager id list.
3124 *
3125 * If the id is not in the list return 0. If it was found and
3126 * removed, return 1.
3127 */
3128 static int
3129 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
3130 {
3131 lmpl_t *cur;
3132 lmpl_t **lmplp;
3133 rnode_t *rp;
3134 int rv = 0;
3135
3136 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
3137
3138 rp = VTOR(vp);
3139
3140 mutex_enter(&rp->r_statelock);
3141 ASSERT(rp->r_flags & RHASHED);
3142 lmplp = &(rp->r_lmpl);
3143
3144 /*
3145 * Search through the list and remove the entry for this id
3146 * if it is there. The special case id == NULL allows removal
3147 * of the first share on the r_lmpl list belonging to the
3148 * current process (if any), without regard to further details
3149 * of its identity.
3150 */
3151 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3152 if (cur->lmpl_type == type &&
3153 cur->lmpl_pid == curproc->p_pid &&
3154 (id == (char *)NULL ||
3155 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
3156 *lmplp = cur->lmpl_next;
3157 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3158 if (rid != NULL) {
3159 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3160 *rlen = cur->lmpl_own_len;
3161 }
3162 kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3163 kmem_free(cur, sizeof (*cur));
3164 rv = 1;
3165 break;
3166 }
3167 lmplp = &cur->lmpl_next;
3168 }
3169
3170 #ifdef DEBUG
3171 if (share_debug) {
3172 int nitems = 0;
3173 int npids = 0;
3174 int nowners = 0;
3175
3176 /*
3177 * Count the number of things left on r_lmpl after the remove.
3178 */
3179 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3180 cur = cur->lmpl_next) {
3181 nitems++;
3182 if (cur->lmpl_type == RLMPL_PID) {
3183 npids++;
3184 } else if (cur->lmpl_type == RLMPL_OWNER) {
3185 nowners++;
3186 } else {
3187 cmn_err(CE_PANIC,
3188 "nrli: unrecognized lmpl_type %d",
3189 cur->lmpl_type);
3190 }
3191 }
3192
3193 cmn_err(CE_CONT,
3194 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
3195 (type == RLMPL_PID) ? "P" : "O",
3196 npids,
3197 nowners,
3198 nitems);
3199 }
3200 #endif
3201
3202 mutex_exit(&rp->r_statelock);
3203 return (rv);
3204 }
3205
3206 void
3207 nfs_free_mi(mntinfo_t *mi)
3208 {
3209 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3210 ASSERT(mi->mi_manager_thread == NULL);
3211 ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
3212 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0);
3213
3214 /*
3215 * Remove the node from the global list before we start tearing it down.
3216 */
3217 nfs_mi_zonelist_remove(mi);
3218 if (mi->mi_klmconfig) {
3219 lm_free_config(mi->mi_klmconfig);
3220 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3221 }
3222 mutex_destroy(&mi->mi_lock);
3223 mutex_destroy(&mi->mi_remap_lock);
3224 mutex_destroy(&mi->mi_async_lock);
3225 mutex_destroy(&mi->mi_rnodes_lock);
3226 cv_destroy(&mi->mi_failover_cv);
3227 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]);
3228 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]);
3229 cv_destroy(&mi->mi_async_reqs_cv);
3230 cv_destroy(&mi->mi_async_cv);
3231 list_destroy(&mi->mi_rnodes);
3232 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS);
3233 kmem_free(mi, sizeof (*mi));
3234 }
3235
3236 static int
3237 mnt_kstat_update(kstat_t *ksp, int rw)
3238 {
3239 mntinfo_t *mi;
3240 struct mntinfo_kstat *mik;
3241 vfs_t *vfsp;
3242 int i;
3243
3244 /* this is a read-only kstat. Bail out on a write */
3245 if (rw == KSTAT_WRITE)
3246 return (EACCES);
3247
3248 /*
3249 * We don't want to wait here as kstat_chain_lock could be held by
3250 * dounmount(). dounmount() takes vfs_reflock before the chain lock
3251 * and thus could lead to a deadlock.
3252 */
3253 vfsp = (struct vfs *)ksp->ks_private;
3254
3255
3256 mi = VFTOMI(vfsp);
3257
3258 mik = (struct mntinfo_kstat *)ksp->ks_data;
3259
3260 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3261 mik->mik_vers = (uint32_t)mi->mi_vers;
3262 mik->mik_flags = mi->mi_flags;
3263 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3264 mik->mik_curread = (uint32_t)mi->mi_curread;
3265 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3266 mik->mik_retrans = mi->mi_retrans;
3267 mik->mik_timeo = mi->mi_timeo;
3268 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3269 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3270 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3271 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
3272 for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3273 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3274 mik->mik_timers[i].deviate =
3275 (uint32_t)mi->mi_timers[i].rt_deviate;
3276 mik->mik_timers[i].rtxcur =
3277 (uint32_t)mi->mi_timers[i].rt_rtxcur;
3278 }
3279 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3280 mik->mik_failover = (uint32_t)mi->mi_failover;
3281 mik->mik_remap = (uint32_t)mi->mi_remap;
3282 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3283
3284 return (0);
3285 }
3286
3287 void
3288 nfs_mnt_kstat_init(struct vfs *vfsp)
3289 {
3290 mntinfo_t *mi = VFTOMI(vfsp);
3291
3292 /*
3293 * Create the version specific kstats.
3294 *
3295 * PSARC 2001/697 Contract Private Interface
3296 * All nfs kstats are under SunMC contract
3297 * Please refer to the PSARC listed above and contact
3298 * SunMC before making any changes!
3299 *
3300 * Changes must be reviewed by Solaris File Sharing
3301 * Changes must be communicated to contract-2001-697@sun.com
3302 *
3303 */
3304
3305 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3306 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3307 if (mi->mi_io_kstats) {
3308 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3309 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3310 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3311 kstat_install(mi->mi_io_kstats);
3312 }
3313
3314 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3315 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3316 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3317 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3318 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3319 mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3320 mi->mi_ro_kstats->ks_private = (void *)vfsp;
3321 kstat_install(mi->mi_ro_kstats);
3322 }
3323 }
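
/*
 * A userland sketch (not part of this module) showing how the "mntinfo"
 * raw kstat installed above can be read with libkstat, much as nfsstat(1M)
 * does.  It assumes the struct mntinfo_kstat layout from <nfs/nfs_clnt.h>
 * and is illustrative only.
 */
#if 0	/* illustration only -- not compiled into the kernel */
#include <stdio.h>
#include <string.h>
#include <kstat.h>
#include <nfs/nfs_clnt.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	struct mntinfo_kstat mik;

	if ((kc = kstat_open()) == NULL)
		return (1);
	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
		if (strcmp(ksp->ks_module, "nfs") != 0 ||
		    strcmp(ksp->ks_name, "mntinfo") != 0)
			continue;
		if (kstat_read(kc, ksp, &mik) == -1)
			continue;
		(void) printf("server %s proto %s vers %u\n",
		    mik.mik_curserver, mik.mik_proto, mik.mik_vers);
	}
	(void) kstat_close(kc);
	return (0);
}
#endif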
3324
3325 nfs_delmapcall_t *
3326 nfs_init_delmapcall()
3327 {
3328 nfs_delmapcall_t *delmap_call;
3329
3330 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3331 delmap_call->call_id = curthread;
3332 delmap_call->error = 0;
3333
3334 return (delmap_call);
3335 }
3336
3337 void
3338 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
3339 {
3340 kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3341 }
3342
3343 /*
3344 * Searches for the current delmap caller (based on curthread) in the list of
3345 * callers. If it is found, we remove it and free the delmap caller.
3346 * Returns:
3347 * 0 if the caller wasn't found
3348 * 1 if the caller was found, removed and freed. *errp is set to what
3349 * the result of the delmap was.
3350 */
3351 int
3352 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3353 {
3354 nfs_delmapcall_t *delmap_call;
3355
3356 /*
3357 * If the list doesn't exist yet, we create it and return
3358 * that the caller wasn't found. No list = no callers.
3359 */
3360 mutex_enter(&rp->r_statelock);
3361 if (!(rp->r_flags & RDELMAPLIST)) {
3362 /* The list does not exist */
3363 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3364 offsetof(nfs_delmapcall_t, call_node));
3365 rp->r_flags |= RDELMAPLIST;
3366 mutex_exit(&rp->r_statelock);
3367 return (0);
3368 } else {
3369 /* The list exists so search it */
3370 for (delmap_call = list_head(&rp->r_indelmap);
3371 delmap_call != NULL;
3372 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3373 if (delmap_call->call_id == curthread) {
3374 /* current caller is in the list */
3375 *errp = delmap_call->error;
3376 list_remove(&rp->r_indelmap, delmap_call);
3377 mutex_exit(&rp->r_statelock);
3378 nfs_free_delmapcall(delmap_call);
3379 return (1);
3380 }
3381 }
3382 }
3383 mutex_exit(&rp->r_statelock);
3384 return (0);
3385 }
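
/*
 * A minimal sketch of how a delmap entry point might use the helpers
 * above; the actual nfs_delmap()/nfs3_delmap() code differs in detail,
 * so treat this purely as an illustration of the calling pattern.
 */
#if 0	/* illustration only -- not compiled */
static int
toy_delmap(vnode_t *vp)
{
	rnode_t *rp = VTOR(vp);
	nfs_delmapcall_t *delmap_call;
	int error;

	/*
	 * If this thread already has a completed delmap call recorded on
	 * the rnode, pick up its result and return it.
	 */
	if (nfs_find_and_delete_delmapcall(rp, &error))
		return (error);

	/*
	 * Otherwise record a new caller, queue the real work (typically
	 * asynchronously), and let the worker fill in delmap_call->error.
	 */
	delmap_call = nfs_init_delmapcall();
	mutex_enter(&rp->r_statelock);
	list_insert_tail(&rp->r_indelmap, delmap_call);
	mutex_exit(&rp->r_statelock);

	/* ... dispatch the delmap work, passing delmap_call along ... */
	return (0);
}
#endif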
3386