1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 * All rights reserved.
29 */
30
31 /*
32 * Copyright 2018 Nexenta Systems, Inc.
33 */
34
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/systm.h>
38 #include <sys/thread.h>
39 #include <sys/t_lock.h>
40 #include <sys/time.h>
41 #include <sys/vnode.h>
42 #include <sys/vfs.h>
43 #include <sys/errno.h>
44 #include <sys/buf.h>
45 #include <sys/stat.h>
46 #include <sys/cred.h>
47 #include <sys/kmem.h>
48 #include <sys/debug.h>
49 #include <sys/dnlc.h>
50 #include <sys/vmsystm.h>
51 #include <sys/flock.h>
52 #include <sys/share.h>
53 #include <sys/cmn_err.h>
54 #include <sys/tiuser.h>
55 #include <sys/sysmacros.h>
56 #include <sys/callb.h>
57 #include <sys/acl.h>
58 #include <sys/kstat.h>
59 #include <sys/signal.h>
60 #include <sys/list.h>
61 #include <sys/zone.h>
62
63 #include <rpc/types.h>
64 #include <rpc/xdr.h>
65 #include <rpc/auth.h>
66 #include <rpc/clnt.h>
67
68 #include <nfs/nfs.h>
69 #include <nfs/nfs_clnt.h>
70 #include <nfs/nfs_cmd.h>
71
72 #include <nfs/rnode.h>
73 #include <nfs/nfs_acl.h>
74 #include <nfs/lm.h>
75
76 #include <vm/hat.h>
77 #include <vm/as.h>
78 #include <vm/page.h>
79 #include <vm/pvn.h>
80 #include <vm/seg.h>
81 #include <vm/seg_map.h>
82 #include <vm/seg_vn.h>
83
84 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
85 cred_t *);
86 static int nfs_getattr_cache(vnode_t *, struct vattr *);
87 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
88
89 struct mi_globals {
90 kmutex_t mig_lock; /* lock protecting mig_list */
91 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */
92 boolean_t mig_destructor_called;
93 };
94
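/*
 * Key used to look up the per-zone mi_globals structure above; the key is
 * registered with zone_key_create() elsewhere in the client code so that
 * each zone keeps its own list of NFS v2/v3 mounts.
 */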
95 static zone_key_t mi_list_key;
96
97 /* Debugging flag for PC file shares. */
98 extern int share_debug;
99
100 /*
101 * Attributes caching:
102 *
103 * Attributes are cached in the rnode in struct vattr form.
104 * There is a time associated with the cached attributes (r_attrtime)
105 * which tells whether the attributes are valid. The time is initialized
106 * to the difference between current time and the modify time of the vnode
107 * when new attributes are cached. This allows the attributes for
108 * files that have changed recently to be timed out sooner than for files
109 * that have not changed for a long time. There are minimum and maximum
110 * timeout values that can be set per mount point.
111 */
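/*
 * Illustrative example (the numbers are hypothetical, not defaults): with
 * acregmin set to 3 seconds and acregmax set to 60 seconds, a regular file
 * whose modify time changed 10 seconds ago gets a 10 second attribute
 * timeout, one that changed 1 second ago gets the 3 second minimum, and one
 * that has been unchanged for an hour gets the 60 second maximum.  See
 * nfs_attrcache_va() below for the actual calculation.
 */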
112
113 int
114 nfs_waitfor_purge_complete(vnode_t *vp)
115 {
116 rnode_t *rp;
117 k_sigset_t smask;
118
119 rp = VTOR(vp);
120 if (rp->r_serial != NULL && rp->r_serial != curthread) {
121 mutex_enter(&rp->r_statelock);
122 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
123 while (rp->r_serial != NULL) {
124 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
125 sigunintr(&smask);
126 mutex_exit(&rp->r_statelock);
127 return (EINTR);
128 }
129 }
130 sigunintr(&smask);
131 mutex_exit(&rp->r_statelock);
132 }
133 return (0);
134 }
135
136 /*
137 * Validate caches by checking cached attributes. If the cached
138 * attributes have timed out, then get new attributes from the server.
139  * As a side effect, this will do cache invalidation if the attributes
140 * have changed.
141 *
142 * If the attributes have not timed out and if there is a cache
143 * invalidation being done by some other thread, then wait until that
144 * thread has completed the cache invalidation.
145 */
146 int
147 nfs_validate_caches(vnode_t *vp, cred_t *cr)
148 {
149 int error;
150 struct vattr va;
151
152 if (ATTRCACHE_VALID(vp)) {
153 error = nfs_waitfor_purge_complete(vp);
154 if (error)
155 return (error);
156 return (0);
157 }
158
159 va.va_mask = AT_ALL;
160 return (nfs_getattr_otw(vp, &va, cr));
161 }
162
163 /*
164 * Validate caches by checking cached attributes. If the cached
165 * attributes have timed out, then get new attributes from the server.
166  * As a side effect, this will do cache invalidation if the attributes
167 * have changed.
168 *
169 * If the attributes have not timed out and if there is a cache
170 * invalidation being done by some other thread, then wait until that
171 * thread has completed the cache invalidation.
172 */
173 int
174 nfs3_validate_caches(vnode_t *vp, cred_t *cr)
175 {
176 int error;
177 struct vattr va;
178
179 if (ATTRCACHE_VALID(vp)) {
180 error = nfs_waitfor_purge_complete(vp);
181 if (error)
182 return (error);
183 return (0);
184 }
185
186 va.va_mask = AT_ALL;
187 return (nfs3_getattr_otw(vp, &va, cr));
188 }
189
190 /*
191 * Purge all of the various NFS `data' caches.
192 */
193 void
194 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
195 {
196 rnode_t *rp;
197 char *contents;
198 int size;
199 int error;
200
201 /*
202 * Purge the DNLC for any entries which refer to this file.
203 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
204 */
205 rp = VTOR(vp);
206 mutex_enter(&rp->r_statelock);
207 if (vp->v_count > 1 &&
208 (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
209 !(rp->r_flags & RINDNLCPURGE)) {
210 /*
211 * Set the RINDNLCPURGE flag to prevent recursive entry
212 * into dnlc_purge_vp()
213 */
214 if (vp->v_type == VDIR)
215 rp->r_flags |= RINDNLCPURGE;
216 mutex_exit(&rp->r_statelock);
217 dnlc_purge_vp(vp);
218 mutex_enter(&rp->r_statelock);
219 if (rp->r_flags & RINDNLCPURGE)
220 rp->r_flags &= ~RINDNLCPURGE;
221 }
222
223 /*
224 	 * Purge the readlink response cache.
225 */
226 contents = rp->r_symlink.contents;
227 size = rp->r_symlink.size;
228 rp->r_symlink.contents = NULL;
229 mutex_exit(&rp->r_statelock);
230
231 if (contents != NULL) {
232
233 kmem_free((void *)contents, size);
234 }
235
236 /*
237 * Flush the page cache.
238 */
239 if (vn_has_cached_data(vp)) {
240 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
241 if (error && (error == ENOSPC || error == EDQUOT)) {
242 mutex_enter(&rp->r_statelock);
243 if (!rp->r_error)
244 rp->r_error = error;
245 mutex_exit(&rp->r_statelock);
246 }
247 }
248
249 /*
250 * Flush the readdir response cache.
251 */
252 if (HAVE_RDDIR_CACHE(rp))
253 nfs_purge_rddir_cache(vp);
254 }
255
256 /*
257 * Purge the readdir cache of all entries
258 */
259 void
260 nfs_purge_rddir_cache(vnode_t *vp)
261 {
262 rnode_t *rp;
263 rddir_cache *rdc;
264 rddir_cache *nrdc;
265
266 rp = VTOR(vp);
267 mutex_enter(&rp->r_statelock);
268 rp->r_direof = NULL;
269 rp->r_flags &= ~RLOOKUP;
270 rp->r_flags |= RREADDIRPLUS;
271 rdc = avl_first(&rp->r_dir);
272 while (rdc != NULL) {
273 nrdc = AVL_NEXT(&rp->r_dir, rdc);
274 avl_remove(&rp->r_dir, rdc);
275 rddir_cache_rele(rdc);
276 rdc = nrdc;
277 }
278 mutex_exit(&rp->r_statelock);
279 }
280
281 /*
282 * Do a cache check based on the post-operation attributes.
283 * Then make them the new cached attributes. If no attributes
284 * were returned, then mark the attributes as timed out.
285 */
286 void
287 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
288 {
289 vattr_t attr;
290
291 if (!poap->attributes) {
292 PURGE_ATTRCACHE(vp);
293 return;
294 }
295 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
296 }
297
298 /*
299 * Same as above, but using a vattr
300 */
301 void
302 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
303 cred_t *cr)
304 {
305 if (!poap->attributes) {
306 PURGE_ATTRCACHE(vp);
307 return;
308 }
309 nfs_attr_cache(vp, poap->fres.vap, t, cr);
310 }
311
312 /*
313 * Do a cache check based on the weak cache consistency attributes.
314 * These consist of a small set of pre-operation attributes and the
315 * full set of post-operation attributes.
316 *
317 * If we are given the pre-operation attributes, then use them to
318 * check the validity of the various caches. Then, if we got the
319 * post-operation attributes, make them the new cached attributes.
320 * If we didn't get the post-operation attributes, then mark the
321 * attribute cache as timed out so that the next reference will
322 * cause a GETATTR to the server to refresh with the current
323 * attributes.
324 *
325 * Otherwise, if we didn't get the pre-operation attributes, but
326 * we did get the post-operation attributes, then use these
327 * attributes to check the validity of the various caches. This
328 * will probably cause a flush of the caches because if the
329 * operation succeeded, the attributes of the object were changed
330 * in some way from the old post-operation attributes. This
331 * should be okay because it is the safe thing to do. After
332 * checking the data caches, then we make these the new cached
333 * attributes.
334 *
335 * Otherwise, we didn't get either the pre- or post-operation
336 * attributes. Simply mark the attribute cache as timed out so
337 * the next reference will cause a GETATTR to the server to
338 * refresh with the current attributes.
339 *
340 * If an error occurred trying to convert the over the wire
341 * attributes to a vattr, then simply mark the attribute cache as
342 * timed out.
343 */
344 void
345 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
346 {
347 vattr_t bva;
348 vattr_t ava;
349
350 if (wccp->after.attributes) {
351 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
352 PURGE_ATTRCACHE(vp);
353 return;
354 }
355 if (wccp->before.attributes) {
356 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
357 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
358 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
359 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
360 bva.va_size = wccp->before.attr.size;
361 nfs3_attr_cache(vp, &bva, &ava, t, cr);
362 } else
363 nfs_attr_cache(vp, &ava, t, cr);
364 } else {
365 PURGE_ATTRCACHE(vp);
366 }
367 }
368
369 /*
370 * Set attributes cache for given vnode using nfsattr.
371 *
372 * This routine does not do cache validation with the attributes.
373 *
374 * If an error occurred trying to convert the over the wire
375 * attributes to a vattr, then simply mark the attribute cache as
376 * timed out.
377 */
378 void
379 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
380 {
381 rnode_t *rp;
382 struct vattr va;
383
384 if (!nattr_to_vattr(vp, na, &va)) {
385 rp = VTOR(vp);
386 mutex_enter(&rp->r_statelock);
387 if (rp->r_mtime <= t)
388 nfs_attrcache_va(vp, &va);
389 mutex_exit(&rp->r_statelock);
390 } else {
391 PURGE_ATTRCACHE(vp);
392 }
393 }
394
395 /*
396 * Set attributes cache for given vnode using fattr3.
397 *
398 * This routine does not do cache validation with the attributes.
399 *
400 * If an error occurred trying to convert the over the wire
401 * attributes to a vattr, then simply mark the attribute cache as
402 * timed out.
403 */
404 void
405 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
406 {
407 rnode_t *rp;
408 struct vattr va;
409
410 if (!fattr3_to_vattr(vp, na, &va)) {
411 rp = VTOR(vp);
412 mutex_enter(&rp->r_statelock);
413 if (rp->r_mtime <= t)
414 nfs_attrcache_va(vp, &va);
415 mutex_exit(&rp->r_statelock);
416 } else {
417 PURGE_ATTRCACHE(vp);
418 }
419 }
420
421 /*
422 * Do a cache check based on attributes returned over the wire. The
423 * new attributes are cached.
424 *
425 * If an error occurred trying to convert the over the wire attributes
426 * to a vattr, then just return that error.
427 *
428  * As a side effect, the vattr argument is filled in with the converted
429 * attributes.
430 */
431 int
432 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
433 cred_t *cr)
434 {
435 int error;
436
437 error = nattr_to_vattr(vp, na, vap);
438 if (error)
439 return (error);
440 nfs_attr_cache(vp, vap, t, cr);
441 return (0);
442 }
443
444 /*
445 * Do a cache check based on attributes returned over the wire. The
446 * new attributes are cached.
447 *
448 * If an error occurred trying to convert the over the wire attributes
449 * to a vattr, then just return that error.
450 *
451  * As a side effect, the vattr argument is filled in with the converted
452 * attributes.
453 */
454 int
455 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
456 {
457 int error;
458
459 error = fattr3_to_vattr(vp, na, vap);
460 if (error)
461 return (error);
462 nfs_attr_cache(vp, vap, t, cr);
463 return (0);
464 }
465
466 /*
467 * Use the passed in virtual attributes to check to see whether the
468 * data and metadata caches are valid, cache the new attributes, and
469 * then do the cache invalidation if required.
470 *
471 * The cache validation and caching of the new attributes is done
472 * atomically via the use of the mutex, r_statelock. If required,
473 * the cache invalidation is done atomically w.r.t. the cache
474 * validation and caching of the attributes via the pseudo lock,
475 * r_serial.
476 *
477 * This routine is used to do cache validation and attributes caching
478 * for operations with a single set of post operation attributes.
479 */
480 void
481 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
482 {
483 rnode_t *rp;
484 int mtime_changed = 0;
485 int ctime_changed = 0;
486 vsecattr_t *vsp;
487 int was_serial;
488 len_t preattr_rsize;
489 boolean_t writeattr_set = B_FALSE;
490 boolean_t cachepurge_set = B_FALSE;
491
492 rp = VTOR(vp);
493
494 mutex_enter(&rp->r_statelock);
495
496 if (rp->r_serial != curthread) {
497 klwp_t *lwp = ttolwp(curthread);
498
499 was_serial = 0;
500 if (lwp != NULL)
501 lwp->lwp_nostop++;
502 while (rp->r_serial != NULL) {
503 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
504 mutex_exit(&rp->r_statelock);
505 if (lwp != NULL)
506 lwp->lwp_nostop--;
507 return;
508 }
509 }
510 if (lwp != NULL)
511 lwp->lwp_nostop--;
512 } else
513 was_serial = 1;
514
515 if (rp->r_mtime > t) {
516 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
517 PURGE_ATTRCACHE_LOCKED(rp);
518 mutex_exit(&rp->r_statelock);
519 return;
520 }
521
522 /*
523 	 * The write thread, after writing data to a file on the remote
524 	 * server, always sets RWRITEATTR to indicate that the file on the
525 	 * remote server was modified with a WRITE operation and will have
526 	 * marked the attribute cache as timed out. If RWRITEATTR
527 * is set, then do not check for mtime and ctime change.
528 */
529 if (!(rp->r_flags & RWRITEATTR)) {
530 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
531 mtime_changed = 1;
532
533 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
534 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
535 ctime_changed = 1;
536 } else {
537 writeattr_set = B_TRUE;
538 }
539
540 preattr_rsize = rp->r_size;
541
542 nfs_attrcache_va(vp, vap);
543
544 /*
545 	 * If we have updated the file size in nfs_attrcache_va, then as
546 	 * soon as we drop r_statelock we will be in the middle of purging
547 	 * all of our caches and updating them. It is possible for another
548 	 * thread to pick up this new file size and read in zeroed data.
549 	 * Stall other threads till the cache purge is complete.
550 */
551 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
552 /*
553 * If RWRITEATTR was set and we have updated the file
554 		 * size, the server's returned file size is not necessarily
555 		 * the result of this client's WRITE. We need to purge
556 * all caches.
557 */
558 if (writeattr_set)
559 mtime_changed = 1;
560
561 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
562 rp->r_flags |= RINCACHEPURGE;
563 cachepurge_set = B_TRUE;
564 }
565 }
566
567 if (!mtime_changed && !ctime_changed) {
568 mutex_exit(&rp->r_statelock);
569 return;
570 }
571
572 rp->r_serial = curthread;
573
574 mutex_exit(&rp->r_statelock);
575
576 if (mtime_changed)
577 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
578
579 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
580 mutex_enter(&rp->r_statelock);
581 rp->r_flags &= ~RINCACHEPURGE;
582 cv_broadcast(&rp->r_cv);
583 mutex_exit(&rp->r_statelock);
584 cachepurge_set = B_FALSE;
585 }
586
587 if (ctime_changed) {
588 (void) nfs_access_purge_rp(rp);
589 if (rp->r_secattr != NULL) {
590 mutex_enter(&rp->r_statelock);
591 vsp = rp->r_secattr;
592 rp->r_secattr = NULL;
593 mutex_exit(&rp->r_statelock);
594 if (vsp != NULL)
595 nfs_acl_free(vsp);
596 }
597 }
598
599 if (!was_serial) {
600 mutex_enter(&rp->r_statelock);
601 rp->r_serial = NULL;
602 cv_broadcast(&rp->r_cv);
603 mutex_exit(&rp->r_statelock);
604 }
605 }
606
607 /*
608 * Use the passed in "before" virtual attributes to check to see
609 * whether the data and metadata caches are valid, cache the "after"
610 * new attributes, and then do the cache invalidation if required.
611 *
612 * The cache validation and caching of the new attributes is done
613 * atomically via the use of the mutex, r_statelock. If required,
614 * the cache invalidation is done atomically w.r.t. the cache
615 * validation and caching of the attributes via the pseudo lock,
616 * r_serial.
617 *
618 * This routine is used to do cache validation and attributes caching
619 * for operations with both pre operation attributes and post operation
620 * attributes.
621 */
622 static void
623 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
624 cred_t *cr)
625 {
626 rnode_t *rp;
627 int mtime_changed = 0;
628 int ctime_changed = 0;
629 vsecattr_t *vsp;
630 int was_serial;
631 len_t preattr_rsize;
632 boolean_t writeattr_set = B_FALSE;
633 boolean_t cachepurge_set = B_FALSE;
634
635 rp = VTOR(vp);
636
637 mutex_enter(&rp->r_statelock);
638
639 if (rp->r_serial != curthread) {
640 klwp_t *lwp = ttolwp(curthread);
641
642 was_serial = 0;
643 if (lwp != NULL)
644 lwp->lwp_nostop++;
645 while (rp->r_serial != NULL) {
646 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
647 mutex_exit(&rp->r_statelock);
648 if (lwp != NULL)
649 lwp->lwp_nostop--;
650 return;
651 }
652 }
653 if (lwp != NULL)
654 lwp->lwp_nostop--;
655 } else
656 was_serial = 1;
657
658 if (rp->r_mtime > t) {
659 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size))
660 PURGE_ATTRCACHE_LOCKED(rp);
661 mutex_exit(&rp->r_statelock);
662 return;
663 }
664
665 /*
666 	 * The write thread, after writing data to a file on the remote
667 	 * server, always sets RWRITEATTR to indicate that the file on the
668 	 * remote server was modified with a WRITE operation and will have
669 	 * marked the attribute cache as timed out. If RWRITEATTR
670 * is set, then do not check for mtime and ctime change.
671 */
672 if (!(rp->r_flags & RWRITEATTR)) {
673 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
674 mtime_changed = 1;
675
676 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
677 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
678 ctime_changed = 1;
679 } else {
680 writeattr_set = B_TRUE;
681 }
682
683 preattr_rsize = rp->r_size;
684
685 nfs_attrcache_va(vp, avap);
686
687 /*
688 	 * If we have updated the file size in nfs_attrcache_va, then as
689 	 * soon as we drop r_statelock we will be in the middle of purging
690 	 * all of our caches and updating them. It is possible for another
691 	 * thread to pick up this new file size and read in zeroed data.
692 	 * Stall other threads till the cache purge is complete.
693 */
694 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
695 /*
696 * If RWRITEATTR was set and we have updated the file
697 		 * size, the server's returned file size is not necessarily
698 		 * the result of this client's WRITE. We need to purge
699 * all caches.
700 */
701 if (writeattr_set)
702 mtime_changed = 1;
703
704 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
705 rp->r_flags |= RINCACHEPURGE;
706 cachepurge_set = B_TRUE;
707 }
708 }
709
710 if (!mtime_changed && !ctime_changed) {
711 mutex_exit(&rp->r_statelock);
712 return;
713 }
714
715 rp->r_serial = curthread;
716
717 mutex_exit(&rp->r_statelock);
718
719 if (mtime_changed)
720 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
721
722 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
723 mutex_enter(&rp->r_statelock);
724 rp->r_flags &= ~RINCACHEPURGE;
725 cv_broadcast(&rp->r_cv);
726 mutex_exit(&rp->r_statelock);
727 cachepurge_set = B_FALSE;
728 }
729
730 if (ctime_changed) {
731 (void) nfs_access_purge_rp(rp);
732 if (rp->r_secattr != NULL) {
733 mutex_enter(&rp->r_statelock);
734 vsp = rp->r_secattr;
735 rp->r_secattr = NULL;
736 mutex_exit(&rp->r_statelock);
737 if (vsp != NULL)
738 nfs_acl_free(vsp);
739 }
740 }
741
742 if (!was_serial) {
743 mutex_enter(&rp->r_statelock);
744 rp->r_serial = NULL;
745 cv_broadcast(&rp->r_cv);
746 mutex_exit(&rp->r_statelock);
747 }
748 }
749
750 /*
751 * Set attributes cache for given vnode using virtual attributes.
752 *
753 * Set the timeout value on the attribute cache and fill it
754 * with the passed in attributes.
755 *
756 * The caller must be holding r_statelock.
757 */
758 void
759 nfs_attrcache_va(vnode_t *vp, struct vattr *va)
760 {
761 rnode_t *rp;
762 mntinfo_t *mi;
763 hrtime_t delta;
764 hrtime_t now;
765
766 rp = VTOR(vp);
767
768 ASSERT(MUTEX_HELD(&rp->r_statelock));
769
770 now = gethrtime();
771
772 mi = VTOMI(vp);
773
774 /*
775 * Delta is the number of nanoseconds that we will
776 * cache the attributes of the file. It is based on
777 * the number of nanoseconds since the last time that
778 * we detected a change. The assumption is that files
779 * that changed recently are likely to change again.
780 * There is a minimum and a maximum for regular files
781 * and for directories which is enforced though.
782 *
783 * Using the time since last change was detected
784 * eliminates direct comparison or calculation
785 * using mixed client and server times. NFS does
786 * not make any assumptions regarding the client
787 * and server clocks being synchronized.
788 */
789 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
790 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
791 va->va_size != rp->r_attr.va_size)
792 rp->r_mtime = now;
793
794 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
795 delta = 0;
796 else {
797 delta = now - rp->r_mtime;
798 if (vp->v_type == VDIR) {
799 if (delta < mi->mi_acdirmin)
800 delta = mi->mi_acdirmin;
801 else if (delta > mi->mi_acdirmax)
802 delta = mi->mi_acdirmax;
803 } else {
804 if (delta < mi->mi_acregmin)
805 delta = mi->mi_acregmin;
806 else if (delta > mi->mi_acregmax)
807 delta = mi->mi_acregmax;
808 }
809 }
810 rp->r_attrtime = now + delta;
811 rp->r_attr = *va;
812 /*
813 * Update the size of the file if there is no cached data or if
814 * the cached data is clean and there is no data being written
815 * out.
816 */
817 if (rp->r_size != va->va_size &&
818 (!vn_has_cached_data(vp) ||
819 (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
820 rp->r_size = va->va_size;
821 nfs_setswaplike(vp, va);
822 rp->r_flags &= ~RWRITEATTR;
823 }
824
825 /*
826 * Fill in attribute from the cache.
827 * If valid, then return 0 to indicate that no error occurred,
828 * otherwise return 1 to indicate that an error occurred.
829 */
830 static int
831 nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
832 {
833 rnode_t *rp;
834 uint_t mask = vap->va_mask;
835
836 rp = VTOR(vp);
837 mutex_enter(&rp->r_statelock);
838 if (ATTRCACHE_VALID(vp)) {
839 /*
840 * Cached attributes are valid
841 */
842 *vap = rp->r_attr;
843 /*
844 * Set the caller's va_mask to the set of attributes
845 * that were requested ANDed with the attributes that
846 * are available. If attributes were requested that
847 * are not available, those bits must be turned off
848 		 * in the caller's va_mask.
849 */
850 vap->va_mask &= mask;
851 mutex_exit(&rp->r_statelock);
852 return (0);
853 }
854 mutex_exit(&rp->r_statelock);
855 return (1);
856 }
857
858 /*
859 * Get attributes over-the-wire and update attributes cache
860 * if no error occurred in the over-the-wire operation.
861 * Return 0 if successful, otherwise error.
862 */
863 int
864 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
865 {
866 int error;
867 struct nfsattrstat ns;
868 int douprintf;
869 mntinfo_t *mi;
870 failinfo_t fi;
871 hrtime_t t;
872
873 mi = VTOMI(vp);
874 fi.vp = vp;
875 fi.fhp = NULL; /* no need to update, filehandle not copied */
876 fi.copyproc = nfscopyfh;
877 fi.lookupproc = nfslookup;
878 fi.xattrdirproc = acl_getxattrdir2;
879
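	/*
	 * If the NFS_ACL protocol is enabled for this mount, try the
	 * combined getattr that it provides first.  acl_getattr2_otw() is
	 * expected to clear MI_ACL if the server turns out not to support
	 * the ACL protocol, which is why the flag is checked again before
	 * returning; otherwise fall through to the plain RFS_GETATTR call
	 * below.
	 */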
880 if (mi->mi_flags & MI_ACL) {
881 error = acl_getattr2_otw(vp, vap, cr);
882 if (mi->mi_flags & MI_ACL)
883 return (error);
884 }
885
886 douprintf = 1;
887
888 t = gethrtime();
889
890 error = rfs2call(mi, RFS_GETATTR,
891 xdr_fhandle, (caddr_t)VTOFH(vp),
892 xdr_attrstat, (caddr_t)&ns, cr,
893 &douprintf, &ns.ns_status, 0, &fi);
894
895 if (!error) {
896 error = geterrno(ns.ns_status);
897 if (!error)
898 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
899 else {
900 PURGE_STALE_FH(error, vp, cr);
901 }
902 }
903
904 return (error);
905 }
906
907 /*
908  * Return either cached or remote attributes. If we get remote attributes,
909  * use them to check and invalidate the caches, then cache the new attributes.
910 */
911 int
912 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
913 {
914 int error;
915 rnode_t *rp;
916
917 /*
918 * If we've got cached attributes, we're done, otherwise go
919 * to the server to get attributes, which will update the cache
920 * in the process.
921 */
922 error = nfs_getattr_cache(vp, vap);
923 if (error)
924 error = nfs_getattr_otw(vp, vap, cr);
925
926 /* Return the client's view of file size */
927 rp = VTOR(vp);
928 mutex_enter(&rp->r_statelock);
929 vap->va_size = rp->r_size;
930 mutex_exit(&rp->r_statelock);
931
932 return (error);
933 }
934
935 /*
936 * Get attributes over-the-wire and update attributes cache
937 * if no error occurred in the over-the-wire operation.
938 * Return 0 if successful, otherwise error.
939 */
940 int
941 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
942 {
943 int error;
944 GETATTR3args args;
945 GETATTR3vres res;
946 int douprintf;
947 failinfo_t fi;
948 hrtime_t t;
949
950 args.object = *VTOFH3(vp);
951 fi.vp = vp;
952 fi.fhp = (caddr_t)&args.object;
953 fi.copyproc = nfs3copyfh;
954 fi.lookupproc = nfs3lookup;
955 fi.xattrdirproc = acl_getxattrdir3;
956 res.fres.vp = vp;
957 res.fres.vap = vap;
958
959 douprintf = 1;
960
961 t = gethrtime();
962
963 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
964 xdr_nfs_fh3, (caddr_t)&args,
965 xdr_GETATTR3vres, (caddr_t)&res, cr,
966 &douprintf, &res.status, 0, &fi);
967
968 if (error)
969 return (error);
970
971 error = geterrno3(res.status);
972 if (error) {
973 PURGE_STALE_FH(error, vp, cr);
974 return (error);
975 }
976
977 /*
978 * Catch status codes that indicate fattr3 to vattr translation failure
979 */
980 if (res.fres.status)
981 return (res.fres.status);
982
983 nfs_attr_cache(vp, vap, t, cr);
984 return (0);
985 }
986
987 /*
988  * Return either cached or remote attributes. If we get remote attributes,
989  * use them to check and invalidate the caches, then cache the new attributes.
990 */
991 int
992 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
993 {
994 int error;
995 rnode_t *rp;
996
997 /*
998 * If we've got cached attributes, we're done, otherwise go
999 * to the server to get attributes, which will update the cache
1000 * in the process.
1001 */
1002 error = nfs_getattr_cache(vp, vap);
1003 if (error)
1004 error = nfs3_getattr_otw(vp, vap, cr);
1005
1006 /* Return the client's view of file size */
1007 rp = VTOR(vp);
1008 mutex_enter(&rp->r_statelock);
1009 vap->va_size = rp->r_size;
1010 mutex_exit(&rp->r_statelock);
1011
1012 return (error);
1013 }
1014
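/*
 * Map NFS Version 2 over-the-wire file types to vnode types.
 */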
1015 vtype_t nf_to_vt[] = {
1016 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
1017 };
1018 /*
1019 * Convert NFS Version 2 over the network attributes to the local
1020 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
1021 * network representation and the local representation is done here.
1022 * Returns 0 for success, error if failed due to overflow.
1023 */
1024 int
1025 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
1026 {
1027 /* overflow in time attributes? */
1028 #ifndef _LP64
1029 if (!NFS2_FATTR_TIME_OK(na))
1030 return (EOVERFLOW);
1031 #endif
1032
1033 vap->va_mask = AT_ALL;
1034
1035 if (na->na_type < NFNON || na->na_type > NFSOC)
1036 vap->va_type = VBAD;
1037 else
1038 vap->va_type = nf_to_vt[na->na_type];
1039 vap->va_mode = na->na_mode;
1040 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
1041 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
1042 vap->va_fsid = vp->v_vfsp->vfs_dev;
1043 vap->va_nodeid = na->na_nodeid;
1044 vap->va_nlink = na->na_nlink;
1045 vap->va_size = na->na_size; /* keep for cache validation */
1046 /*
1047 * nfs protocol defines times as unsigned so don't extend sign,
1048 * unless sysadmin set nfs_allow_preepoch_time.
1049 */
1050 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
1051 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
1052 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
1053 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
1054 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
1055 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
1056 /*
1057 * Shannon's law - uncompress the received dev_t
1058 	 * if the top half of it is zero, indicating a response
1059 * from an `older style' OS. Except for when it is a
1060 * `new style' OS sending the maj device of zero,
1061 * in which case the algorithm still works because the
1062 * fact that it is a new style server
1063 * is hidden by the minor device not being greater
1064 * than 255 (a requirement in this case).
1065 */
1066 if ((na->na_rdev & 0xffff0000) == 0)
1067 vap->va_rdev = nfsv2_expdev(na->na_rdev);
1068 else
1069 vap->va_rdev = expldev(na->na_rdev);
1070
1071 vap->va_nblocks = na->na_blocks;
1072 switch (na->na_type) {
1073 case NFBLK:
1074 vap->va_blksize = DEV_BSIZE;
1075 break;
1076
1077 case NFCHR:
1078 vap->va_blksize = MAXBSIZE;
1079 break;
1080
1081 case NFSOC:
1082 default:
1083 vap->va_blksize = na->na_blocksize;
1084 break;
1085 }
1086 /*
1087 * This bit of ugliness is a hack to preserve the
1088 * over-the-wire protocols for named-pipe vnodes.
1089 * It remaps the special over-the-wire type to the
1090 * VFIFO type. (see note in nfs.h)
1091 */
1092 if (NA_ISFIFO(na)) {
1093 vap->va_type = VFIFO;
1094 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
1095 vap->va_rdev = 0;
1096 vap->va_blksize = na->na_blocksize;
1097 }
1098 vap->va_seq = 0;
1099 return (0);
1100 }
1101
1102 /*
1103 * Convert NFS Version 3 over the network attributes to the local
1104 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
1105 * network representation and the local representation is done here.
1106 */
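/*
 * Map NFS Version 3 over-the-wire file types to vnode types.
 */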
1107 vtype_t nf3_to_vt[] = {
1108 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
1109 };
1110
1111 int
1112 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
1113 {
1114
1115 #ifndef _LP64
1116 /* overflow in time attributes? */
1117 if (!NFS3_FATTR_TIME_OK(na))
1118 return (EOVERFLOW);
1119 #endif
1120 if (!NFS3_SIZE_OK(na->size))
1121 /* file too big */
1122 return (EFBIG);
1123
1124 vap->va_mask = AT_ALL;
1125
1126 if (na->type < NF3REG || na->type > NF3FIFO)
1127 vap->va_type = VBAD;
1128 else
1129 vap->va_type = nf3_to_vt[na->type];
1130 vap->va_mode = na->mode;
1131 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
1132 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
1133 vap->va_fsid = vp->v_vfsp->vfs_dev;
1134 vap->va_nodeid = na->fileid;
1135 vap->va_nlink = na->nlink;
1136 vap->va_size = na->size;
1137
1138 /*
1139 * nfs protocol defines times as unsigned so don't extend sign,
1140 * unless sysadmin set nfs_allow_preepoch_time.
1141 */
1142 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
1143 vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
1144 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
1145 vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
1146 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
1147 vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
1148
1149 switch (na->type) {
1150 case NF3BLK:
1151 vap->va_rdev = makedevice(na->rdev.specdata1,
1152 na->rdev.specdata2);
1153 vap->va_blksize = DEV_BSIZE;
1154 vap->va_nblocks = 0;
1155 break;
1156 case NF3CHR:
1157 vap->va_rdev = makedevice(na->rdev.specdata1,
1158 na->rdev.specdata2);
1159 vap->va_blksize = MAXBSIZE;
1160 vap->va_nblocks = 0;
1161 break;
1162 case NF3REG:
1163 case NF3DIR:
1164 case NF3LNK:
1165 vap->va_rdev = 0;
1166 vap->va_blksize = MAXBSIZE;
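		/*
		 * Convert the server's count of bytes used into
		 * DEV_BSIZE blocks, rounding up.
		 */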
1167 vap->va_nblocks = (u_longlong_t)
1168 ((na->used + (size3)DEV_BSIZE - (size3)1) /
1169 (size3)DEV_BSIZE);
1170 break;
1171 case NF3SOCK:
1172 case NF3FIFO:
1173 default:
1174 vap->va_rdev = 0;
1175 vap->va_blksize = MAXBSIZE;
1176 vap->va_nblocks = 0;
1177 break;
1178 }
1179 vap->va_seq = 0;
1180 return (0);
1181 }
1182
1183 /*
1184 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1185 * for the demand-based allocation of async threads per-mount. The
1186 * nfs_async_timeout is the amount of time a thread will live after it
1187 * becomes idle, unless new I/O requests are received before the thread
1188 * dies. See nfs_async_putpage and nfs_async_start.
1189 */
1190
1191 int nfs_async_timeout = -1; /* uninitialized */
1192
1193 static void nfs_async_start(struct vfs *);
1194 static void nfs_async_pgops_start(struct vfs *);
1195 static void nfs_async_common_start(struct vfs *, int);
1196
1197 static void
1198 free_async_args(struct nfs_async_reqs *args)
1199 {
1200 rnode_t *rp;
1201
1202 if (args->a_io != NFS_INACTIVE) {
1203 rp = VTOR(args->a_vp);
1204 mutex_enter(&rp->r_statelock);
1205 rp->r_count--;
1206 if (args->a_io == NFS_PUTAPAGE ||
1207 args->a_io == NFS_PAGEIO)
1208 rp->r_awcount--;
1209 cv_broadcast(&rp->r_cv);
1210 mutex_exit(&rp->r_statelock);
1211 VN_RELE(args->a_vp);
1212 }
1213 crfree(args->a_cred);
1214 kmem_free(args, sizeof (*args));
1215 }
1216
1217 /*
1218 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1219 * pageout(), running in the global zone, have legitimate reasons to do
1220 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1221  * use of a per-mount "asynchronous requests manager thread" which is
1222 * signaled by the various asynchronous work routines when there is
1223 * asynchronous work to be done. It is responsible for creating new
1224 * worker threads if necessary, and notifying existing worker threads
1225 * that there is work to be done.
1226 *
1227 * In other words, it will "take the specifications from the customers and
1228 * give them to the engineers."
1229 *
1230 * Worker threads die off of their own accord if they are no longer
1231 * needed.
1232 *
1233 * This thread is killed when the zone is going away or the filesystem
1234 * is being unmounted.
1235 */
1236 void
1237 nfs_async_manager(vfs_t *vfsp)
1238 {
1239 callb_cpr_t cprinfo;
1240 mntinfo_t *mi;
1241 uint_t max_threads;
1242
1243 mi = VFTOMI(vfsp);
1244
1245 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1246 "nfs_async_manager");
1247
1248 mutex_enter(&mi->mi_async_lock);
1249 /*
1250 * We want to stash the max number of threads that this mount was
1251 * allowed so we can use it later when the variable is set to zero as
1252 * part of the zone/mount going away.
1253 *
1254 * We want to be able to create at least one thread to handle
1255 * asynchronous inactive calls.
1256 */
1257 max_threads = MAX(mi->mi_max_threads, 1);
1258 /*
1259 * We don't want to wait for mi_max_threads to go to zero, since that
1260 * happens as part of a failed unmount, but this thread should only
1261 * exit when the mount/zone is really going away.
1262 *
1263 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
1264 * attempted: the various _async_*() functions know to do things
1265 * inline if mi_max_threads == 0. Henceforth we just drain out the
1266 * outstanding requests.
1267 *
1268 * Note that we still create zthreads even if we notice the zone is
1269 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
1270 * shutdown sequence to take slightly longer in some cases, but
1271 * doesn't violate the protocol, as all threads will exit as soon as
1272 * they're done processing the remaining requests.
1273 */
1274 for (;;) {
1275 while (mi->mi_async_req_count > 0) {
1276 /*
1277 * Paranoia: If the mount started out having
1278 * (mi->mi_max_threads == 0), and the value was
1279 * later changed (via a debugger or somesuch),
1280 * we could be confused since we will think we
1281 * can't create any threads, and the calling
1282 * code (which looks at the current value of
1283 * mi->mi_max_threads, now non-zero) thinks we
1284 * can.
1285 *
1286 * So, because we're paranoid, we create threads
1287 * up to the maximum of the original and the
1288 * current value. This means that future
1289 * (debugger-induced) lowerings of
1290 * mi->mi_max_threads are ignored for our
1291 * purposes, but who told them they could change
1292 * random values on a live kernel anyhow?
1293 */
1294 if (mi->mi_threads[NFS_ASYNC_QUEUE] <
1295 MAX(mi->mi_max_threads, max_threads)) {
1296 mi->mi_threads[NFS_ASYNC_QUEUE]++;
1297 mutex_exit(&mi->mi_async_lock);
1298 VFS_HOLD(vfsp); /* hold for new thread */
1299 (void) zthread_create(NULL, 0, nfs_async_start,
1300 vfsp, 0, minclsyspri);
1301 mutex_enter(&mi->mi_async_lock);
1302 } else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] <
1303 NUM_ASYNC_PGOPS_THREADS) {
1304 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++;
1305 mutex_exit(&mi->mi_async_lock);
1306 VFS_HOLD(vfsp); /* hold for new thread */
1307 (void) zthread_create(NULL, 0,
1308 nfs_async_pgops_start, vfsp, 0,
1309 minclsyspri);
1310 mutex_enter(&mi->mi_async_lock);
1311 }
1312 NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1313 ASSERT(mi->mi_async_req_count != 0);
1314 mi->mi_async_req_count--;
1315 }
1316
1317 mutex_enter(&mi->mi_lock);
1318 if (mi->mi_flags & MI_ASYNC_MGR_STOP) {
1319 mutex_exit(&mi->mi_lock);
1320 break;
1321 }
1322 mutex_exit(&mi->mi_lock);
1323
1324 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1325 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1326 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1327 }
1328 /*
1329 * Let everyone know we're done.
1330 */
1331 mi->mi_manager_thread = NULL;
1332 cv_broadcast(&mi->mi_async_cv);
1333
1334 /*
1335 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1336 * since CALLB_CPR_EXIT is actually responsible for releasing
1337 * 'mi_async_lock'.
1338 */
1339 CALLB_CPR_EXIT(&cprinfo);
1340 VFS_RELE(vfsp); /* release thread's hold */
1341 zthread_exit();
1342 }
1343
1344 /*
1345 * Signal (and wait for) the async manager thread to clean up and go away.
1346 */
1347 void
1348 nfs_async_manager_stop(vfs_t *vfsp)
1349 {
1350 mntinfo_t *mi = VFTOMI(vfsp);
1351
1352 mutex_enter(&mi->mi_async_lock);
1353 mutex_enter(&mi->mi_lock);
1354 mi->mi_flags |= MI_ASYNC_MGR_STOP;
1355 mutex_exit(&mi->mi_lock);
1356 cv_broadcast(&mi->mi_async_reqs_cv);
1357 while (mi->mi_manager_thread != NULL)
1358 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1359 mutex_exit(&mi->mi_async_lock);
1360 }
1361
1362 int
1363 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1364 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1365 u_offset_t, caddr_t, struct seg *, cred_t *))
1366 {
1367 rnode_t *rp;
1368 mntinfo_t *mi;
1369 struct nfs_async_reqs *args;
1370
1371 rp = VTOR(vp);
1372 ASSERT(rp->r_freef == NULL);
1373
1374 mi = VTOMI(vp);
1375
1376 /*
1377 * If addr falls in a different segment, don't bother doing readahead.
1378 */
1379 if (addr >= seg->s_base + seg->s_size)
1380 return (-1);
1381
1382 /*
1383 * If we can't allocate a request structure, punt on the readahead.
1384 */
1385 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1386 return (-1);
1387
1388 /*
1389 * If a lock operation is pending, don't initiate any new
1390 * readaheads. Otherwise, bump r_count to indicate the new
1391 * asynchronous I/O.
1392 */
1393 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1394 kmem_free(args, sizeof (*args));
1395 return (-1);
1396 }
1397 mutex_enter(&rp->r_statelock);
1398 rp->r_count++;
1399 mutex_exit(&rp->r_statelock);
1400 nfs_rw_exit(&rp->r_lkserlock);
1401
1402 args->a_next = NULL;
1403 #ifdef DEBUG
1404 args->a_queuer = curthread;
1405 #endif
1406 VN_HOLD(vp);
1407 args->a_vp = vp;
1408 ASSERT(cr != NULL);
1409 crhold(cr);
1410 args->a_cred = cr;
1411 args->a_io = NFS_READ_AHEAD;
1412 args->a_nfs_readahead = readahead;
1413 args->a_nfs_blkoff = blkoff;
1414 args->a_nfs_seg = seg;
1415 args->a_nfs_addr = addr;
1416
1417 mutex_enter(&mi->mi_async_lock);
1418
1419 /*
1420 * If asyncio has been disabled, don't bother readahead.
1421 */
1422 if (mi->mi_max_threads == 0) {
1423 mutex_exit(&mi->mi_async_lock);
1424 goto noasync;
1425 }
1426
1427 /*
1428 * Link request structure into the async list and
1429 * wakeup async thread to do the i/o.
1430 */
1431 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
1432 mi->mi_async_reqs[NFS_READ_AHEAD] = args;
1433 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1434 } else {
1435 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
1436 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1437 }
1438
1439 if (mi->mi_io_kstats) {
1440 mutex_enter(&mi->mi_lock);
1441 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1442 mutex_exit(&mi->mi_lock);
1443 }
1444
1445 mi->mi_async_req_count++;
1446 ASSERT(mi->mi_async_req_count != 0);
1447 cv_signal(&mi->mi_async_reqs_cv);
1448 mutex_exit(&mi->mi_async_lock);
1449 return (0);
1450
1451 noasync:
1452 mutex_enter(&rp->r_statelock);
1453 rp->r_count--;
1454 cv_broadcast(&rp->r_cv);
1455 mutex_exit(&rp->r_statelock);
1456 VN_RELE(vp);
1457 crfree(cr);
1458 kmem_free(args, sizeof (*args));
1459 return (-1);
1460 }
1461
1462 int
1463 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1464 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1465 u_offset_t, size_t, int, cred_t *))
1466 {
1467 rnode_t *rp;
1468 mntinfo_t *mi;
1469 struct nfs_async_reqs *args;
1470
1471 ASSERT(flags & B_ASYNC);
1472 ASSERT(vp->v_vfsp != NULL);
1473
1474 rp = VTOR(vp);
1475 ASSERT(rp->r_count > 0);
1476
1477 mi = VTOMI(vp);
1478
1479 /*
1480 * If we can't allocate a request structure, do the putpage
1481 * operation synchronously in this thread's context.
1482 */
1483 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1484 goto noasync;
1485
1486 args->a_next = NULL;
1487 #ifdef DEBUG
1488 args->a_queuer = curthread;
1489 #endif
1490 VN_HOLD(vp);
1491 args->a_vp = vp;
1492 ASSERT(cr != NULL);
1493 crhold(cr);
1494 args->a_cred = cr;
1495 args->a_io = NFS_PUTAPAGE;
1496 args->a_nfs_putapage = putapage;
1497 args->a_nfs_pp = pp;
1498 args->a_nfs_off = off;
1499 args->a_nfs_len = (uint_t)len;
1500 args->a_nfs_flags = flags;
1501
1502 mutex_enter(&mi->mi_async_lock);
1503
1504 /*
1505 * If asyncio has been disabled, then make a synchronous request.
1506 	 * This check is done a second time in case async io was disabled
1507 * while this thread was blocked waiting for memory pressure to
1508 * reduce or for the queue to drain.
1509 */
1510 if (mi->mi_max_threads == 0) {
1511 mutex_exit(&mi->mi_async_lock);
1512 goto noasync;
1513 }
1514
1515 /*
1516 * Link request structure into the async list and
1517 * wakeup async thread to do the i/o.
1518 */
1519 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
1520 mi->mi_async_reqs[NFS_PUTAPAGE] = args;
1521 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1522 } else {
1523 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
1524 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1525 }
1526
1527 mutex_enter(&rp->r_statelock);
1528 rp->r_count++;
1529 rp->r_awcount++;
1530 mutex_exit(&rp->r_statelock);
1531
1532 if (mi->mi_io_kstats) {
1533 mutex_enter(&mi->mi_lock);
1534 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1535 mutex_exit(&mi->mi_lock);
1536 }
1537
1538 mi->mi_async_req_count++;
1539 ASSERT(mi->mi_async_req_count != 0);
1540 cv_signal(&mi->mi_async_reqs_cv);
1541 mutex_exit(&mi->mi_async_lock);
1542 return (0);
1543
1544 noasync:
1545 if (args != NULL) {
1546 VN_RELE(vp);
1547 crfree(cr);
1548 kmem_free(args, sizeof (*args));
1549 }
1550
1551 if (curproc == proc_pageout || curproc == proc_fsflush) {
1552 /*
1553 * If we get here in the context of the pageout/fsflush,
1554 * we refuse to do a sync write, because this may hang
1555 * pageout (and the machine). In this case, we just
1556 * re-mark the page as dirty and punt on the page.
1557 *
1558 * Make sure B_FORCE isn't set. We can re-mark the
1559 * pages as dirty and unlock the pages in one swoop by
1560 * passing in B_ERROR to pvn_write_done(). However,
1561 * we should make sure B_FORCE isn't set - we don't
1562 * want the page tossed before it gets written out.
1563 */
1564 if (flags & B_FORCE)
1565 flags &= ~(B_INVAL | B_FORCE);
1566 pvn_write_done(pp, flags | B_ERROR);
1567 return (0);
1568 }
1569 if (nfs_zone() != mi->mi_zone) {
1570 /*
1571 * So this was a cross-zone sync putpage. We pass in B_ERROR
1572 * to pvn_write_done() to re-mark the pages as dirty and unlock
1573 * them.
1574 *
1575 * We don't want to clear B_FORCE here as the caller presumably
1576 * knows what they're doing if they set it.
1577 */
1578 pvn_write_done(pp, flags | B_ERROR);
1579 return (EPERM);
1580 }
1581 return ((*putapage)(vp, pp, off, len, flags, cr));
1582 }
1583
1584 int
1585 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1586 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1587 size_t, int, cred_t *))
1588 {
1589 rnode_t *rp;
1590 mntinfo_t *mi;
1591 struct nfs_async_reqs *args;
1592
1593 ASSERT(flags & B_ASYNC);
1594 ASSERT(vp->v_vfsp != NULL);
1595
1596 rp = VTOR(vp);
1597 ASSERT(rp->r_count > 0);
1598
1599 mi = VTOMI(vp);
1600
1601 /*
1602 * If we can't allocate a request structure, do the pageio
1603 * request synchronously in this thread's context.
1604 */
1605 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1606 goto noasync;
1607
1608 args->a_next = NULL;
1609 #ifdef DEBUG
1610 args->a_queuer = curthread;
1611 #endif
1612 VN_HOLD(vp);
1613 args->a_vp = vp;
1614 ASSERT(cr != NULL);
1615 crhold(cr);
1616 args->a_cred = cr;
1617 args->a_io = NFS_PAGEIO;
1618 args->a_nfs_pageio = pageio;
1619 args->a_nfs_pp = pp;
1620 args->a_nfs_off = io_off;
1621 args->a_nfs_len = (uint_t)io_len;
1622 args->a_nfs_flags = flags;
1623
1624 mutex_enter(&mi->mi_async_lock);
1625
1626 /*
1627 * If asyncio has been disabled, then make a synchronous request.
1628 	 * This check is done a second time in case async io was disabled
1629 * while this thread was blocked waiting for memory pressure to
1630 * reduce or for the queue to drain.
1631 */
1632 if (mi->mi_max_threads == 0) {
1633 mutex_exit(&mi->mi_async_lock);
1634 goto noasync;
1635 }
1636
1637 /*
1638 * Link request structure into the async list and
1639 * wakeup async thread to do the i/o.
1640 */
1641 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1642 mi->mi_async_reqs[NFS_PAGEIO] = args;
1643 mi->mi_async_tail[NFS_PAGEIO] = args;
1644 } else {
1645 mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1646 mi->mi_async_tail[NFS_PAGEIO] = args;
1647 }
1648
1649 mutex_enter(&rp->r_statelock);
1650 rp->r_count++;
1651 rp->r_awcount++;
1652 mutex_exit(&rp->r_statelock);
1653
1654 if (mi->mi_io_kstats) {
1655 mutex_enter(&mi->mi_lock);
1656 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1657 mutex_exit(&mi->mi_lock);
1658 }
1659
1660 mi->mi_async_req_count++;
1661 ASSERT(mi->mi_async_req_count != 0);
1662 cv_signal(&mi->mi_async_reqs_cv);
1663 mutex_exit(&mi->mi_async_lock);
1664 return (0);
1665
1666 noasync:
1667 if (args != NULL) {
1668 VN_RELE(vp);
1669 crfree(cr);
1670 kmem_free(args, sizeof (*args));
1671 }
1672
1673 /*
1674 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1675 * the page list), for writes we do it synchronously, except for
1676 * proc_pageout/proc_fsflush as described below.
1677 */
1678 if (flags & B_READ) {
1679 pvn_read_done(pp, flags | B_ERROR);
1680 return (0);
1681 }
1682
1683 if (curproc == proc_pageout || curproc == proc_fsflush) {
1684 /*
1685 * If we get here in the context of the pageout/fsflush,
1686 * we refuse to do a sync write, because this may hang
1687 * pageout/fsflush (and the machine). In this case, we just
1688 * re-mark the page as dirty and punt on the page.
1689 *
1690 * Make sure B_FORCE isn't set. We can re-mark the
1691 * pages as dirty and unlock the pages in one swoop by
1692 * passing in B_ERROR to pvn_write_done(). However,
1693 * we should make sure B_FORCE isn't set - we don't
1694 * want the page tossed before it gets written out.
1695 */
1696 if (flags & B_FORCE)
1697 flags &= ~(B_INVAL | B_FORCE);
1698 pvn_write_done(pp, flags | B_ERROR);
1699 return (0);
1700 }
1701
1702 if (nfs_zone() != mi->mi_zone) {
1703 /*
1704 * So this was a cross-zone sync pageio. We pass in B_ERROR
1705 * to pvn_write_done() to re-mark the pages as dirty and unlock
1706 * them.
1707 *
1708 * We don't want to clear B_FORCE here as the caller presumably
1709 * knows what they're doing if they set it.
1710 */
1711 pvn_write_done(pp, flags | B_ERROR);
1712 return (EPERM);
1713 }
1714 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1715 }
1716
1717 void
1718 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1719 int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1720 {
1721 rnode_t *rp;
1722 mntinfo_t *mi;
1723 struct nfs_async_reqs *args;
1724
1725 rp = VTOR(vp);
1726 ASSERT(rp->r_freef == NULL);
1727
1728 mi = VTOMI(vp);
1729
1730 /*
1731 * If we can't allocate a request structure, do the readdir
1732 * operation synchronously in this thread's context.
1733 */
1734 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1735 goto noasync;
1736
1737 args->a_next = NULL;
1738 #ifdef DEBUG
1739 args->a_queuer = curthread;
1740 #endif
1741 VN_HOLD(vp);
1742 args->a_vp = vp;
1743 ASSERT(cr != NULL);
1744 crhold(cr);
1745 args->a_cred = cr;
1746 args->a_io = NFS_READDIR;
1747 args->a_nfs_readdir = readdir;
1748 args->a_nfs_rdc = rdc;
1749
1750 mutex_enter(&mi->mi_async_lock);
1751
1752 /*
1753 * If asyncio has been disabled, then make a synchronous request.
1754 */
1755 if (mi->mi_max_threads == 0) {
1756 mutex_exit(&mi->mi_async_lock);
1757 goto noasync;
1758 }
1759
1760 /*
1761 * Link request structure into the async list and
1762 * wakeup async thread to do the i/o.
1763 */
1764 if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1765 mi->mi_async_reqs[NFS_READDIR] = args;
1766 mi->mi_async_tail[NFS_READDIR] = args;
1767 } else {
1768 mi->mi_async_tail[NFS_READDIR]->a_next = args;
1769 mi->mi_async_tail[NFS_READDIR] = args;
1770 }
1771
1772 mutex_enter(&rp->r_statelock);
1773 rp->r_count++;
1774 mutex_exit(&rp->r_statelock);
1775
1776 if (mi->mi_io_kstats) {
1777 mutex_enter(&mi->mi_lock);
1778 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1779 mutex_exit(&mi->mi_lock);
1780 }
1781
1782 mi->mi_async_req_count++;
1783 ASSERT(mi->mi_async_req_count != 0);
1784 cv_signal(&mi->mi_async_reqs_cv);
1785 mutex_exit(&mi->mi_async_lock);
1786 return;
1787
1788 noasync:
1789 if (args != NULL) {
1790 VN_RELE(vp);
1791 crfree(cr);
1792 kmem_free(args, sizeof (*args));
1793 }
1794
1795 rdc->entries = NULL;
1796 mutex_enter(&rp->r_statelock);
1797 ASSERT(rdc->flags & RDDIR);
1798 rdc->flags &= ~RDDIR;
1799 rdc->flags |= RDDIRREQ;
1800 /*
1801 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
1802 	 * is set, wake up the thread sleeping in cv_wait_sig().
1803 	 * The woken thread will reset the flag to RDDIR and will
1804 	 * continue with the readdir operation.
1805 */
1806 if (rdc->flags & RDDIRWAIT) {
1807 rdc->flags &= ~RDDIRWAIT;
1808 cv_broadcast(&rdc->cv);
1809 }
1810 mutex_exit(&rp->r_statelock);
1811 rddir_cache_rele(rdc);
1812 }
1813
1814 void
1815 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1816 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, cred_t *))
1817 {
1818 rnode_t *rp;
1819 mntinfo_t *mi;
1820 struct nfs_async_reqs *args;
1821 page_t *pp;
1822
1823 rp = VTOR(vp);
1824 mi = VTOMI(vp);
1825
1826 /*
1827 * If we can't allocate a request structure, do the commit
1828 * operation synchronously in this thread's context.
1829 */
1830 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1831 goto noasync;
1832
1833 args->a_next = NULL;
1834 #ifdef DEBUG
1835 args->a_queuer = curthread;
1836 #endif
1837 VN_HOLD(vp);
1838 args->a_vp = vp;
1839 ASSERT(cr != NULL);
1840 crhold(cr);
1841 args->a_cred = cr;
1842 args->a_io = NFS_COMMIT;
1843 args->a_nfs_commit = commit;
1844 args->a_nfs_plist = plist;
1845 args->a_nfs_offset = offset;
1846 args->a_nfs_count = count;
1847
1848 mutex_enter(&mi->mi_async_lock);
1849
1850 /*
1851 * If asyncio has been disabled, then make a synchronous request.
1852 	 * This check is done a second time in case async io was disabled
1853 * while this thread was blocked waiting for memory pressure to
1854 * reduce or for the queue to drain.
1855 */
1856 if (mi->mi_max_threads == 0) {
1857 mutex_exit(&mi->mi_async_lock);
1858 goto noasync;
1859 }
1860
1861 /*
1862 * Link request structure into the async list and
1863 * wakeup async thread to do the i/o.
1864 */
1865 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1866 mi->mi_async_reqs[NFS_COMMIT] = args;
1867 mi->mi_async_tail[NFS_COMMIT] = args;
1868 } else {
1869 mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1870 mi->mi_async_tail[NFS_COMMIT] = args;
1871 }
1872
1873 mutex_enter(&rp->r_statelock);
1874 rp->r_count++;
1875 mutex_exit(&rp->r_statelock);
1876
1877 if (mi->mi_io_kstats) {
1878 mutex_enter(&mi->mi_lock);
1879 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1880 mutex_exit(&mi->mi_lock);
1881 }
1882
1883 mi->mi_async_req_count++;
1884 ASSERT(mi->mi_async_req_count != 0);
1885 cv_signal(&mi->mi_async_reqs_cv);
1886 mutex_exit(&mi->mi_async_lock);
1887 return;
1888
1889 noasync:
1890 if (args != NULL) {
1891 VN_RELE(vp);
1892 crfree(cr);
1893 kmem_free(args, sizeof (*args));
1894 }
1895
1896 if (curproc == proc_pageout || curproc == proc_fsflush ||
1897 nfs_zone() != mi->mi_zone) {
1898 while (plist != NULL) {
1899 pp = plist;
1900 page_sub(&plist, pp);
1901 pp->p_fsdata = C_COMMIT;
1902 page_unlock(pp);
1903 }
1904 return;
1905 }
1906 (*commit)(vp, plist, offset, count, cr);
1907 }
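/*
 * Usage sketch (illustrative only; the commit routine name is
 * hypothetical): a version-specific caller that has collected a list
 * of pages needing a COMMIT would hand them off like this, falling
 * back to a synchronous commit automatically if no async thread can
 * take the work:
 *
 *	nfs_async_commit(vp, plist, offset, count, cr, my_commit_routine);
 *
 * where my_commit_routine(vp, plist, offset, count, cr) performs the
 * actual over-the-wire COMMIT and releases the pages.
 */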
1908
1909 void
1910 nfs_async_inactive(vnode_t *vp, cred_t *cr,
1911 void (*inactive)(vnode_t *, cred_t *, caller_context_t *))
1912 {
1913 mntinfo_t *mi;
1914 struct nfs_async_reqs *args;
1915
1916 mi = VTOMI(vp);
1917
1918 args = kmem_alloc(sizeof (*args), KM_SLEEP);
1919 args->a_next = NULL;
1920 #ifdef DEBUG
1921 args->a_queuer = curthread;
1922 #endif
1923 args->a_vp = vp;
1924 ASSERT(cr != NULL);
1925 crhold(cr);
1926 args->a_cred = cr;
1927 args->a_io = NFS_INACTIVE;
1928 args->a_nfs_inactive = inactive;
1929
1930 /*
1931 * Note that we don't check mi->mi_max_threads here, since we
1932 * *need* to get rid of this vnode regardless of whether someone
1933 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1934 *
1935 * The manager thread knows about this and is willing to create
1936 * at least one thread to accommodate us.
1937 */
1938 mutex_enter(&mi->mi_async_lock);
1939 if (mi->mi_manager_thread == NULL) {
1940 rnode_t *rp = VTOR(vp);
1941
1942 mutex_exit(&mi->mi_async_lock);
1943 crfree(cr); /* drop our reference */
1944 kmem_free(args, sizeof (*args));
1945 /*
1946 * We can't do an over-the-wire call since we're in the wrong
1947 * zone, so we need to clean up state as best we can and then
1948 * throw away the vnode.
1949 */
1950 mutex_enter(&rp->r_statelock);
1951 if (rp->r_unldvp != NULL) {
1952 vnode_t *unldvp;
1953 char *unlname;
1954 cred_t *unlcred;
1955
1956 unldvp = rp->r_unldvp;
1957 rp->r_unldvp = NULL;
1958 unlname = rp->r_unlname;
1959 rp->r_unlname = NULL;
1960 unlcred = rp->r_unlcred;
1961 rp->r_unlcred = NULL;
1962 mutex_exit(&rp->r_statelock);
1963
1964 VN_RELE(unldvp);
1965 kmem_free(unlname, MAXNAMELEN);
1966 crfree(unlcred);
1967 } else {
1968 mutex_exit(&rp->r_statelock);
1969 }
1970 /*
1971 * No need to explicitly throw away any cached pages. The
1972 * eventual rinactive() will attempt a synchronous
1973 * VOP_PUTPAGE() which will immediately fail since the request
1974 * is coming from the wrong zone, and then will proceed to call
1975 * nfs_invalidate_pages() which will clean things up for us.
1976 */
1977 rp_addfree(VTOR(vp), cr);
1978 return;
1979 }
1980
1981 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1982 mi->mi_async_reqs[NFS_INACTIVE] = args;
1983 } else {
1984 mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1985 }
1986 mi->mi_async_tail[NFS_INACTIVE] = args;
1987 /*
1988 * Don't increment r_count, since we're trying to get rid of the vnode.
1989 */
1990
1991 mi->mi_async_req_count++;
1992 ASSERT(mi->mi_async_req_count != 0);
1993 cv_signal(&mi->mi_async_reqs_cv);
1994 mutex_exit(&mi->mi_async_lock);
1995 }
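/*
 * Usage sketch (illustrative only): the VOP_INACTIVE path hands the
 * final-reference cleanup to this routine so that the potentially
 * blocking over-the-wire work happens in an async thread, e.g.:
 *
 *	nfs_async_inactive(vp, cr, version_specific_inactive);
 *
 * where version_specific_inactive is a placeholder name for the
 * routine that performs the real rinactive()-style teardown.
 */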
1996
1997 static void
1998 nfs_async_start(struct vfs *vfsp)
1999 {
2000 nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE);
2001 }
2002
2003 static void
2004 nfs_async_pgops_start(struct vfs *vfsp)
2005 {
2006 nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE);
2007 }
2008
2009 /*
2010 * The async queues for each mounted file system are arranged as a
2011 * set of queues, one for each async i/o type. Requests are taken
2012 * from the queues in a round-robin fashion. A number of consecutive
2013 * requests are taken from each queue before moving on to the next
2014 * queue. This functionality may allow the NFS Version 2 server to do
2015 * write clustering, even if the client is mixing writes and reads
2016 * because it will take multiple write requests from the queue
2017 * before processing any of the other async i/o types.
2018 *
2019 * XXX The nfs_async_common_start thread is unsafe in the light of the present
2020 * model defined by cpr to suspend the system. Specifically over the
2021 * wire calls are cpr-unsafe. The thread should be reevaluated in
2022 * case of future updates to the cpr model.
2023 */
2024 static void
2025 nfs_async_common_start(struct vfs *vfsp, int async_queue)
2026 {
2027 struct nfs_async_reqs *args;
2028 mntinfo_t *mi = VFTOMI(vfsp);
2029 clock_t time_left = 1;
2030 callb_cpr_t cprinfo;
2031 int i;
2032 int async_types;
2033 kcondvar_t *async_work_cv;
2034
2035 if (async_queue == NFS_ASYNC_QUEUE) {
2036 async_types = NFS_ASYNC_TYPES;
2037 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE];
2038 } else {
2039 async_types = NFS_ASYNC_PGOPS_TYPES;
2040 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE];
2041 }
2042
2043 /*
2044 * Dynamic initialization of nfs_async_timeout to allow nfs to be
2045 * built in an implementation independent manner.
2046 */
2047 if (nfs_async_timeout == -1)
2048 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
2049
2050 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
2051
2052 mutex_enter(&mi->mi_async_lock);
2053 for (;;) {
2054 /*
2055 * Find the next queue containing an entry. We start
2056 * at the current queue pointer and then round robin
2057 * through all of them until we either find a non-empty
2058 * queue or have looked through all of them.
2059 */
2060 for (i = 0; i < async_types; i++) {
2061 args = *mi->mi_async_curr[async_queue];
2062 if (args != NULL)
2063 break;
2064 mi->mi_async_curr[async_queue]++;
2065 if (mi->mi_async_curr[async_queue] ==
2066 &mi->mi_async_reqs[async_types]) {
2067 mi->mi_async_curr[async_queue] =
2068 &mi->mi_async_reqs[0];
2069 }
2070 }
2071 /*
2072		 * If we didn't find an entry, then block until woken up
2073 * again and then look through the queues again.
2074 */
2075 if (args == NULL) {
2076 /*
2077 * Exiting is considered to be safe for CPR as well
2078 */
2079 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2080
2081 /*
2082 * Wakeup thread waiting to unmount the file
2083 * system only if all async threads are inactive.
2084 *
2085 * If we've timed-out and there's nothing to do,
2086 * then get rid of this thread.
2087 */
2088 if (mi->mi_max_threads == 0 || time_left <= 0) {
2089 --mi->mi_threads[async_queue];
2090
2091 if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
2092 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0)
2093 cv_signal(&mi->mi_async_cv);
2094 CALLB_CPR_EXIT(&cprinfo);
2095 VFS_RELE(vfsp); /* release thread's hold */
2096 zthread_exit();
2097 /* NOTREACHED */
2098 }
2099 time_left = cv_reltimedwait(async_work_cv,
2100 &mi->mi_async_lock, nfs_async_timeout,
2101 TR_CLOCK_TICK);
2102
2103 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
2104
2105 continue;
2106 }
2107 time_left = 1;
2108
2109 /*
2110 * Remove the request from the async queue and then
2111 * update the current async request queue pointer. If
2112 * the current queue is empty or we have removed enough
2113 * consecutive entries from it, then reset the counter
2114 * for this queue and then move the current pointer to
2115 * the next queue.
2116 */
2117 *mi->mi_async_curr[async_queue] = args->a_next;
2118 if (*mi->mi_async_curr[async_queue] == NULL ||
2119 --mi->mi_async_clusters[args->a_io] == 0) {
2120 mi->mi_async_clusters[args->a_io] =
2121 mi->mi_async_init_clusters;
2122 mi->mi_async_curr[async_queue]++;
2123 if (mi->mi_async_curr[async_queue] ==
2124 &mi->mi_async_reqs[async_types]) {
2125 mi->mi_async_curr[async_queue] =
2126 &mi->mi_async_reqs[0];
2127 }
2128 }
2129
2130 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2131 mutex_enter(&mi->mi_lock);
2132 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2133 mutex_exit(&mi->mi_lock);
2134 }
2135
2136 mutex_exit(&mi->mi_async_lock);
2137
2138 /*
2139 * Obtain arguments from the async request structure.
2140 */
2141 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2142 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2143 args->a_nfs_addr, args->a_nfs_seg,
2144 args->a_cred);
2145 } else if (args->a_io == NFS_PUTAPAGE) {
2146 (void) (*args->a_nfs_putapage)(args->a_vp,
2147 args->a_nfs_pp, args->a_nfs_off,
2148 args->a_nfs_len, args->a_nfs_flags,
2149 args->a_cred);
2150 } else if (args->a_io == NFS_PAGEIO) {
2151 (void) (*args->a_nfs_pageio)(args->a_vp,
2152 args->a_nfs_pp, args->a_nfs_off,
2153 args->a_nfs_len, args->a_nfs_flags,
2154 args->a_cred);
2155 } else if (args->a_io == NFS_READDIR) {
2156 (void) ((*args->a_nfs_readdir)(args->a_vp,
2157 args->a_nfs_rdc, args->a_cred));
2158 } else if (args->a_io == NFS_COMMIT) {
2159 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2160 args->a_nfs_offset, args->a_nfs_count,
2161 args->a_cred);
2162 } else if (args->a_io == NFS_INACTIVE) {
2163 (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL);
2164 }
2165
2166 /*
2167 * Now, release the vnode and free the credentials
2168 * structure.
2169 */
2170 free_async_args(args);
2171 /*
2172 * Reacquire the mutex because it will be needed above.
2173 */
2174 mutex_enter(&mi->mi_async_lock);
2175 }
2176 }
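/*
 * Worked example of the round-robin scan above (illustrative, with
 * made-up numbers): assume mi_async_init_clusters is 4 and the queues
 * currently hold PUTAPAGE and READ_AHEAD requests. The worker takes up
 * to 4 consecutive PUTAPAGE requests (decrementing
 * mi_async_clusters[NFS_PUTAPAGE] each time), then advances
 * mi_async_curr to the READ_AHEAD queue, takes up to 4 of those, and
 * wraps back to &mi_async_reqs[0] when it walks off the end of the
 * array. An empty queue also advances the pointer immediately, so a
 * single request type never starves the others.
 */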
2177
2178 void
2179 nfs_async_stop(struct vfs *vfsp)
2180 {
2181 mntinfo_t *mi = VFTOMI(vfsp);
2182
2183 /*
2184 * Wait for all outstanding async operations to complete and for the
2185 * worker threads to exit.
2186 */
2187 mutex_enter(&mi->mi_async_lock);
2188 mi->mi_max_threads = 0;
2189 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2190 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2191 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0)
2192 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2193 mutex_exit(&mi->mi_async_lock);
2194 }
2195
2196 /*
2197 * nfs_async_stop_sig:
2198	 * Wait for all outstanding putpage operations to complete. If a signal
2199	 * is delivered we will abort and return non-zero. If we can put all the
2200 * pages we will return 0. This routine is called from nfs_unmount and
2201 * nfs3_unmount to make these operations interruptible.
2202 */
2203 int
2204 nfs_async_stop_sig(struct vfs *vfsp)
2205 {
2206 mntinfo_t *mi = VFTOMI(vfsp);
2207 ushort_t omax;
2208 int rval;
2209
2210 /*
2211 * Wait for all outstanding async operations to complete and for the
2212 * worker threads to exit.
2213 */
2214 mutex_enter(&mi->mi_async_lock);
2215 omax = mi->mi_max_threads;
2216 mi->mi_max_threads = 0;
2217 /*
2218 * Tell all the worker threads to exit.
2219 */
2220 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2221 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2222 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) {
2223 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2224 break;
2225 }
2226 rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2227 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */
2228 if (rval)
2229 mi->mi_max_threads = omax;
2230 mutex_exit(&mi->mi_async_lock);
2231
2232 return (rval);
2233 }
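/*
 * Usage sketch (illustrative only): an interruptible unmount path
 * would typically use the return value to back out cleanly:
 *
 *	if (nfs_async_stop_sig(vfsp))
 *		return (EINTR);
 *
 * The non-interruptible nfs_async_stop() variant above is for callers
 * that cannot back out once they have started tearing down the mount.
 */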
2234
2235 int
2236 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2237 {
2238 int pagecreate;
2239 int n;
2240 int saved_n;
2241 caddr_t saved_base;
2242 u_offset_t offset;
2243 int error;
2244 int sm_error;
2245 vnode_t *vp = RTOV(rp);
2246
2247 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2248 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2249 if (!vpm_enable) {
2250 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2251 }
2252
2253 /*
2254 * Move bytes in at most PAGESIZE chunks. We must avoid
2255 * spanning pages in uiomove() because page faults may cause
2256 * the cache to be invalidated out from under us. The r_size is not
2257 * updated until after the uiomove. If we push the last page of a
2258 * file before r_size is correct, we will lose the data written past
2259 * the current (and invalid) r_size.
2260 */
2261 do {
2262 offset = uio->uio_loffset;
2263 pagecreate = 0;
2264
2265 /*
2266 * n is the number of bytes required to satisfy the request
2267 * or the number of bytes to fill out the page.
2268 */
2269 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2270
2271 /*
2272 * Check to see if we can skip reading in the page
2273 * and just allocate the memory. We can do this
2274 * if we are going to rewrite the entire mapping
2275 * or if we are going to write to or beyond the current
2276 * end of file from the beginning of the mapping.
2277 *
2278 * The read of r_size is now protected by r_statelock.
2279 */
2280 mutex_enter(&rp->r_statelock);
2281 /*
2282 * When pgcreated is nonzero the caller has already done
2283 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2284 * segkpm this means we already have at least one page
2285 * created and mapped at base.
2286 */
2287 pagecreate = pgcreated ||
2288 ((offset & PAGEOFFSET) == 0 &&
2289 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2290
2291 mutex_exit(&rp->r_statelock);
2292 if (!vpm_enable && pagecreate) {
2293 /*
2294 * The last argument tells segmap_pagecreate() to
2295 * always lock the page, as opposed to sometimes
2296 * returning with the page locked. This way we avoid a
2297 * fault on the ensuing uiomove(), but also
2298 * more importantly (to fix bug 1094402) we can
2299 * call segmap_fault() to unlock the page in all
2300 * cases. An alternative would be to modify
2301 * segmap_pagecreate() to tell us when it is
2302 * locking a page, but that's a fairly major
2303 * interface change.
2304 */
2305 if (pgcreated == 0)
2306 (void) segmap_pagecreate(segkmap, base,
2307 (uint_t)n, 1);
2308 saved_base = base;
2309 saved_n = n;
2310 }
2311
2312 /*
2313		 * The number of bytes of data in the last page cannot be
2314		 * accurately determined while the page is being uiomove'd to
2315		 * and the size of the file is being updated.
2316 * Thus, inform threads which need to know accurately
2317 * how much data is in the last page of the file. They
2318 * will not do the i/o immediately, but will arrange for
2319 * the i/o to happen later when this modify operation
2320 * will have finished.
2321 */
2322 ASSERT(!(rp->r_flags & RMODINPROGRESS));
2323 mutex_enter(&rp->r_statelock);
2324 rp->r_flags |= RMODINPROGRESS;
2325 rp->r_modaddr = (offset & MAXBMASK);
2326 mutex_exit(&rp->r_statelock);
2327
2328 if (vpm_enable) {
2329 /*
2330 * Copy data. If new pages are created, part of
2331			 * the page that is not written will be initialized
2332 * with zeros.
2333 */
2334 error = vpm_data_copy(vp, offset, n, uio,
2335 !pagecreate, NULL, 0, S_WRITE);
2336 } else {
2337 error = uiomove(base, n, UIO_WRITE, uio);
2338 }
2339
2340 /*
2341 * r_size is the maximum number of
2342 * bytes known to be in the file.
2343 * Make sure it is at least as high as the
2344 * first unwritten byte pointed to by uio_loffset.
2345 */
2346 mutex_enter(&rp->r_statelock);
2347 if (rp->r_size < uio->uio_loffset)
2348 rp->r_size = uio->uio_loffset;
2349 rp->r_flags &= ~RMODINPROGRESS;
2350 rp->r_flags |= RDIRTY;
2351 mutex_exit(&rp->r_statelock);
2352
2353 /* n = # of bytes written */
2354 n = (int)(uio->uio_loffset - offset);
2355
2356 if (!vpm_enable) {
2357 base += n;
2358 }
2359 tcount -= n;
2360 /*
2361 * If we created pages w/o initializing them completely,
2362 * we need to zero the part that wasn't set up.
2363		 * This happens in most EOF write cases and if
2364 * we had some sort of error during the uiomove.
2365 */
2366 if (!vpm_enable && pagecreate) {
2367 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2368 (void) kzero(base, PAGESIZE - n);
2369
2370 if (pgcreated) {
2371 /*
2372 * Caller is responsible for this page,
2373 * it was not created in this loop.
2374 */
2375 pgcreated = 0;
2376 } else {
2377 /*
2378 * For bug 1094402: segmap_pagecreate locks
2379 * page. Unlock it. This also unlocks the
2380 * pages allocated by page_create_va() in
2381 * segmap_pagecreate().
2382 */
2383 sm_error = segmap_fault(kas.a_hat, segkmap,
2384 saved_base, saved_n,
2385 F_SOFTUNLOCK, S_WRITE);
2386 if (error == 0)
2387 error = sm_error;
2388 }
2389 }
2390 } while (tcount > 0 && error == 0);
2391
2392 return (error);
2393 }
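/*
 * Worked example of the chunking computation above (illustrative,
 * assuming a 4K PAGESIZE): for uio_loffset == 0x1ff0 and tcount == 64,
 *
 *	n = MIN(PAGESIZE - (offset & PAGEOFFSET), tcount)
 *	  = MIN(0x1000 - 0xff0, 64) = 16
 *
 * so the first uiomove() copies only the 16 bytes that finish the
 * current page; the next iteration starts page-aligned and can take a
 * full PAGESIZE (or whatever remains of tcount), which is also when
 * the pagecreate optimization can kick in.
 */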
2394
2395 int
2396 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2397 {
2398 rnode_t *rp;
2399 page_t *pp;
2400 u_offset_t eoff;
2401 u_offset_t io_off;
2402 size_t io_len;
2403 int error;
2404 int rdirty;
2405 int err;
2406
2407 rp = VTOR(vp);
2408 ASSERT(rp->r_count > 0);
2409
2410 if (!vn_has_cached_data(vp))
2411 return (0);
2412
2413 ASSERT(vp->v_type != VCHR);
2414
2415 /*
2416 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2417 * writes. B_FORCE is set to force the VM system to actually
2418 * invalidate the pages, even if the i/o failed. The pages
2419 * need to get invalidated because they can't be written out
2420 * because there isn't any space left on either the server's
2421 * file system or in the user's disk quota. The B_FREE bit
2422 * is cleared to avoid confusion as to whether this is a
2423 * request to place the page on the freelist or to destroy
2424 * it.
2425 */
2426 if ((rp->r_flags & ROUTOFSPACE) ||
2427 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2428 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2429
2430 if (len == 0) {
2431 /*
2432 * If doing a full file synchronous operation, then clear
2433 * the RDIRTY bit. If a page gets dirtied while the flush
2434 * is happening, then RDIRTY will get set again. The
2435 * RDIRTY bit must get cleared before the flush so that
2436 * we don't lose this information.
2437 *
2438 * If there are no full file async write operations
2439 * pending and RDIRTY bit is set, clear it.
2440 */
2441 if (off == (u_offset_t)0 &&
2442 !(flags & B_ASYNC) &&
2443 (rp->r_flags & RDIRTY)) {
2444 mutex_enter(&rp->r_statelock);
2445 rdirty = (rp->r_flags & RDIRTY);
2446 rp->r_flags &= ~RDIRTY;
2447 mutex_exit(&rp->r_statelock);
2448 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2449 mutex_enter(&rp->r_statelock);
2450 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) {
2451 rdirty = (rp->r_flags & RDIRTY);
2452 rp->r_flags &= ~RDIRTY;
2453 }
2454 mutex_exit(&rp->r_statelock);
2455 } else
2456 rdirty = 0;
2457
2458 /*
2459 * Search the entire vp list for pages >= off, and flush
2460 * the dirty pages.
2461 */
2462 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2463 flags, cr);
2464
2465 /*
2466 * If an error occurred and the file was marked as dirty
2467 * before and we aren't forcibly invalidating pages, then
2468 * reset the RDIRTY flag.
2469 */
2470 if (error && rdirty &&
2471 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2472 mutex_enter(&rp->r_statelock);
2473 rp->r_flags |= RDIRTY;
2474 mutex_exit(&rp->r_statelock);
2475 }
2476 } else {
2477 /*
2478 * Do a range from [off...off + len) looking for pages
2479 * to deal with.
2480 */
2481 error = 0;
2482 #ifdef lint
2483 io_len = 0;
2484 #endif
2485 eoff = off + len;
2486 mutex_enter(&rp->r_statelock);
2487 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2488 io_off += io_len) {
2489 mutex_exit(&rp->r_statelock);
2490 /*
2491 * If we are not invalidating, synchronously
2492			 * freeing, or writing pages, use the routine
2493 * page_lookup_nowait() to prevent reclaiming
2494 * them from the free list.
2495 */
2496 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2497 pp = page_lookup(vp, io_off,
2498 (flags & (B_INVAL | B_FREE)) ?
2499 SE_EXCL : SE_SHARED);
2500 } else {
2501 pp = page_lookup_nowait(vp, io_off,
2502 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2503 }
2504
2505 if (pp == NULL || !pvn_getdirty(pp, flags))
2506 io_len = PAGESIZE;
2507 else {
2508 err = (*rp->r_putapage)(vp, pp, &io_off,
2509 &io_len, flags, cr);
2510 if (!error)
2511 error = err;
2512 /*
2513 * "io_off" and "io_len" are returned as
2514 * the range of pages we actually wrote.
2515 * This allows us to skip ahead more quickly
2516 * since several pages may've been dealt
2517 * with by this iteration of the loop.
2518 */
2519 }
2520 mutex_enter(&rp->r_statelock);
2521 }
2522 mutex_exit(&rp->r_statelock);
2523 }
2524
2525 return (error);
2526 }
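/*
 * Usage sketch (illustrative only): callers select the behavior with
 * the standard pvn flags, e.g.
 *
 *	(void) nfs_putpages(vp, (u_offset_t)0, 0, 0, cr);
 *
 * flushes every dirty page of the file synchronously, while passing
 * B_ASYNC queues the writes, and B_INVAL | B_FORCE throws the pages
 * away even if the write fails (the ROUTOFSPACE case handled above).
 */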
2527
2528 void
2529 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2530 {
2531 rnode_t *rp;
2532
2533 rp = VTOR(vp);
2534 mutex_enter(&rp->r_statelock);
2535 while (rp->r_flags & RTRUNCATE)
2536 cv_wait(&rp->r_cv, &rp->r_statelock);
2537 rp->r_flags |= RTRUNCATE;
2538 if (off == (u_offset_t)0) {
2539 rp->r_flags &= ~RDIRTY;
2540 if (!(rp->r_flags & RSTALE))
2541 rp->r_error = 0;
2542 }
2543 rp->r_truncaddr = off;
2544 mutex_exit(&rp->r_statelock);
2545 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2546 B_INVAL | B_TRUNC, cr);
2547 mutex_enter(&rp->r_statelock);
2548 rp->r_flags &= ~RTRUNCATE;
2549 cv_broadcast(&rp->r_cv);
2550 mutex_exit(&rp->r_statelock);
2551 }
2552
2553 static int nfs_write_error_to_cons_only = 0;
2554 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1)
2555
2556 /*
2557 * Print a file handle
2558 */
2559 void
2560 nfs_printfhandle(nfs_fhandle *fhp)
2561 {
2562 int *ip;
2563 char *buf;
2564 size_t bufsize;
2565 char *cp;
2566
2567 /*
2568 * 13 == "(file handle:"
2569	 * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2570 * 1 == ' '
2571 * 8 == maximum strlen of "%x"
2572 * 3 == ")\n\0"
2573 */
2574 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2575 buf = kmem_alloc(bufsize, KM_NOSLEEP);
2576 if (buf == NULL)
2577 return;
2578
2579 cp = buf;
2580 (void) strcpy(cp, "(file handle:");
2581 while (*cp != '\0')
2582 cp++;
2583 for (ip = (int *)fhp->fh_buf;
2584 ip < (int *)&fhp->fh_buf[fhp->fh_len];
2585 ip++) {
2586 (void) sprintf(cp, " %x", *ip);
2587 while (*cp != '\0')
2588 cp++;
2589 }
2590 (void) strcpy(cp, ")\n");
2591
2592 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2593
2594 kmem_free(buf, bufsize);
2595 }
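/*
 * Example of the resulting console line (illustrative values only):
 *
 *	(file handle: 1540002 2 a0000 2c132 48df4455 a0000 2 48df4450)
 *
 * i.e. the raw handle is dumped as space-separated hex ints, which is
 * mainly useful for matching a client-side error against server-side
 * file handle debugging output.
 */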
2596
2597 /*
2598 * Notify the system administrator that an NFS write error has
2599 * occurred.
2600 */
2601
2602 /* seconds between ENOSPC/EDQUOT messages */
2603 clock_t nfs_write_error_interval = 5;
2604
2605 void
2606 nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2607 {
2608 mntinfo_t *mi;
2609 clock_t now;
2610
2611 mi = VTOMI(vp);
2612 /*
2613 * In case of forced unmount or zone shutdown, do not print any
2614 * messages since it can flood the console with error messages.
2615 */
2616 if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2617 return;
2618
2619 /*
2620 * No use in flooding the console with ENOSPC
2621 * messages from the same file system.
2622 */
2623 now = ddi_get_lbolt();
2624 if ((error != ENOSPC && error != EDQUOT) ||
2625 now - mi->mi_printftime > 0) {
2626 zoneid_t zoneid = mi->mi_zone->zone_id;
2627
2628 #ifdef DEBUG
2629 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2630 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2631 #else
2632 nfs_perror(error, "NFS write error on host %s: %m.\n",
2633 VTOR(vp)->r_server->sv_hostname, NULL);
2634 #endif
2635 if (error == ENOSPC || error == EDQUOT) {
2636 zcmn_err(zoneid, CE_CONT,
2637 MSG("^File: userid=%d, groupid=%d\n"),
2638 crgetuid(cr), crgetgid(cr));
2639 if (crgetuid(CRED()) != crgetuid(cr) ||
2640 crgetgid(CRED()) != crgetgid(cr)) {
2641 zcmn_err(zoneid, CE_CONT,
2642 MSG("^User: userid=%d, groupid=%d\n"),
2643 crgetuid(CRED()), crgetgid(CRED()));
2644 }
2645 mi->mi_printftime = now +
2646 nfs_write_error_interval * hz;
2647 }
2648 nfs_printfhandle(&VTOR(vp)->r_fh);
2649 #ifdef DEBUG
2650 if (error == EACCES) {
2651 zcmn_err(zoneid, CE_CONT,
2652 MSG("^nfs_bio: cred is%s kcred\n"),
2653 cr == kcred ? "" : " not");
2654 }
2655 #endif
2656 }
2657 }
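/*
 * Example of the throttling behavior above (illustrative output; the
 * host name and error text are placeholders): with the default
 * nfs_write_error_interval of 5 seconds, a stream of EDQUOT failures
 * against one mount produces at most one
 *
 *	NFS write error on host server1: Disc quota exceeded.
 *	File: userid=1001, groupid=10
 *
 * pair every 5 seconds, while errors other than ENOSPC/EDQUOT are
 * always printed.
 */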
2658
2659 /* ARGSUSED */
2660 static void *
2661 nfs_mi_init(zoneid_t zoneid)
2662 {
2663 struct mi_globals *mig;
2664
2665 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2666 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2667 list_create(&mig->mig_list, sizeof (mntinfo_t),
2668 offsetof(mntinfo_t, mi_zone_node));
2669 mig->mig_destructor_called = B_FALSE;
2670 return (mig);
2671 }
2672
2673 /*
2674 * Callback routine to tell all NFS mounts in the zone to stop creating new
2675 * threads. Existing threads should exit.
2676 */
2677 /* ARGSUSED */
2678 static void
2679 nfs_mi_shutdown(zoneid_t zoneid, void *data)
2680 {
2681 struct mi_globals *mig = data;
2682 mntinfo_t *mi;
2683
2684 ASSERT(mig != NULL);
2685 again:
2686 mutex_enter(&mig->mig_lock);
2687 for (mi = list_head(&mig->mig_list); mi != NULL;
2688 mi = list_next(&mig->mig_list, mi)) {
2689
2690 /*
2691 * If we've done the shutdown work for this FS, skip.
2692 * Once we go off the end of the list, we're done.
2693 */
2694 if (mi->mi_flags & MI_DEAD)
2695 continue;
2696
2697 /*
2698 * We will do work, so not done. Get a hold on the FS.
2699 */
2700 VFS_HOLD(mi->mi_vfsp);
2701
2702 /*
2703 * purge the DNLC for this filesystem
2704 */
2705 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2706
2707 mutex_enter(&mi->mi_async_lock);
2708 /*
2709 * Tell existing async worker threads to exit.
2710 */
2711 mi->mi_max_threads = 0;
2712 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2713 /*
2714 * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2715 * getting ready to exit when it's done with its current work.
2716 * Also set MI_DEAD to note we've acted on this FS.
2717 */
2718 mutex_enter(&mi->mi_lock);
2719 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2720 mutex_exit(&mi->mi_lock);
2721 /*
2722 * Wake up the async manager thread.
2723 */
2724 cv_broadcast(&mi->mi_async_reqs_cv);
2725 mutex_exit(&mi->mi_async_lock);
2726
2727 /*
2728 * Drop lock and release FS, which may change list, then repeat.
2729 * We're done when every mi has been done or the list is empty.
2730 */
2731 mutex_exit(&mig->mig_lock);
2732 VFS_RELE(mi->mi_vfsp);
2733 goto again;
2734 }
2735 mutex_exit(&mig->mig_lock);
2736 }
2737
2738 static void
2739 nfs_mi_free_globals(struct mi_globals *mig)
2740 {
2741 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2742 mutex_destroy(&mig->mig_lock);
2743 kmem_free(mig, sizeof (*mig));
2744
2745 }
2746
2747 /* ARGSUSED */
2748 static void
2749 nfs_mi_destroy(zoneid_t zoneid, void *data)
2750 {
2751 struct mi_globals *mig = data;
2752
2753 ASSERT(mig != NULL);
2754 mutex_enter(&mig->mig_lock);
2755 if (list_head(&mig->mig_list) != NULL) {
2756 /* Still waiting for VFS_FREEVFS() */
2757 mig->mig_destructor_called = B_TRUE;
2758 mutex_exit(&mig->mig_lock);
2759 return;
2760 }
2761 nfs_mi_free_globals(mig);
2762 }
2763
2764 /*
2765 * Add an NFS mount to the per-zone list of NFS mounts.
2766 */
2767 void
2768 nfs_mi_zonelist_add(mntinfo_t *mi)
2769 {
2770 struct mi_globals *mig;
2771
2772 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2773 mutex_enter(&mig->mig_lock);
2774 list_insert_head(&mig->mig_list, mi);
2775 mutex_exit(&mig->mig_lock);
2776 }
2777
2778 /*
2779 * Remove an NFS mount from the per-zone list of NFS mounts.
2780 */
2781 static void
2782 nfs_mi_zonelist_remove(mntinfo_t *mi)
2783 {
2784 struct mi_globals *mig;
2785
2786 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2787 mutex_enter(&mig->mig_lock);
2788 list_remove(&mig->mig_list, mi);
2789 /*
2790 * We can be called asynchronously by VFS_FREEVFS() after the zone
2791 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2792 * mi globals.
2793 */
2794 if (list_head(&mig->mig_list) == NULL &&
2795 mig->mig_destructor_called == B_TRUE) {
2796 nfs_mi_free_globals(mig);
2797 return;
2798 }
2799 mutex_exit(&mig->mig_lock);
2800 }
2801
2802 /*
2803 * NFS Client initialization routine. This routine should only be called
2804 * once. It performs the following tasks:
2805	 * - Initialize all global locks
2806 * - Call sub-initialization routines (localize access to variables)
2807 */
2808 int
2809 nfs_clntinit(void)
2810 {
2811 #ifdef DEBUG
2812 static boolean_t nfs_clntup = B_FALSE;
2813 #endif
2814 int error;
2815
2816 #ifdef DEBUG
2817 ASSERT(nfs_clntup == B_FALSE);
2818 #endif
2819
2820 error = nfs_subrinit();
2821 if (error)
2822 return (error);
2823
2824 error = nfs_vfsinit();
2825 if (error) {
2826 /*
2827 * Cleanup nfs_subrinit() work
2828 */
2829 nfs_subrfini();
2830 return (error);
2831 }
2832 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2833 nfs_mi_destroy);
2834
2835 nfs4_clnt_init();
2836
2837 nfscmd_init();
2838
2839 #ifdef DEBUG
2840 nfs_clntup = B_TRUE;
2841 #endif
2842
2843 return (0);
2844 }
2845
2846 /*
2847 * This routine is only called if the NFS Client has been initialized but
2848 * the module failed to be installed. This routine will cleanup the previously
2849 * allocated/initialized work.
2850 */
2851 void
2852 nfs_clntfini(void)
2853 {
2854 (void) zone_key_delete(mi_list_key);
2855 nfs_subrfini();
2856 nfs_vfsfini();
2857 nfs4_clnt_fini();
2858 nfscmd_fini();
2859 }
2860
2861 /*
2862 * nfs_lockrelease:
2863 *
2864 * Release any locks on the given vnode that are held by the current
2865 * process.
2866 */
2867 void
2868 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2869 {
2870 flock64_t ld;
2871 struct shrlock shr;
2872 char *buf;
2873 int remote_lock_possible;
2874 int ret;
2875
2876 ASSERT((uintptr_t)vp > KERNELBASE);
2877
2878 /*
2879 * Generate an explicit unlock operation for the entire file. As a
2880 * partial optimization, only generate the unlock if there is a
2881 * lock registered for the file. We could check whether this
2882 * particular process has any locks on the file, but that would
2883 * require the local locking code to provide yet another query
2884 * routine. Note that no explicit synchronization is needed here.
2885 * At worst, flk_has_remote_locks() will return a false positive,
2886 * in which case the unlock call wastes time but doesn't harm
2887 * correctness.
2888 *
2889 * In addition, an unlock request is generated if the process
2890 * is listed as possibly having a lock on the file because the
2891 * server and client lock managers may have gotten out of sync.
2892 * N.B. It is important to make sure nfs_remove_locking_id() is
2893 * called here even if flk_has_remote_locks(vp) reports true.
2894 * If it is not called and there is an entry on the process id
2895 * list, that entry will never get removed.
2896 */
2897 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2898 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2899 if (remote_lock_possible || flk_has_remote_locks(vp)) {
2900 ld.l_type = F_UNLCK; /* set to unlock entire file */
2901 ld.l_whence = 0; /* unlock from start of file */
2902 ld.l_start = 0;
2903 ld.l_len = 0; /* do entire file */
2904 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr,
2905 NULL);
2906
2907 if (ret != 0) {
2908 /*
2909 * If VOP_FRLOCK fails, make sure we unregister
2910 * local locks before we continue.
2911 */
2912 ld.l_pid = ttoproc(curthread)->p_pid;
2913 lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2914 #ifdef DEBUG
2915 nfs_perror(ret,
2916 "NFS lock release error on vp %p: %m.\n",
2917 (void *)vp, NULL);
2918 #endif
2919 }
2920
2921 /*
2922 * The call to VOP_FRLOCK may put the pid back on the
2923 * list. We need to remove it.
2924 */
2925 (void) nfs_remove_locking_id(vp, RLMPL_PID,
2926 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2927 }
2928
2929 /*
2930 * As long as the vp has a share matching our pid,
2931 * pluck it off and unshare it. There are circumstances in
2932 * which the call to nfs_remove_locking_id() may put the
2933 * owner back on the list, in which case we simply do a
2934 * redundant and harmless unshare.
2935 */
2936 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2937 while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2938 (char *)NULL, buf, &shr.s_own_len)) {
2939 shr.s_owner = buf;
2940 shr.s_access = 0;
2941 shr.s_deny = 0;
2942 shr.s_sysid = 0;
2943 shr.s_pid = curproc->p_pid;
2944
2945 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL);
2946 #ifdef DEBUG
2947 if (ret != 0) {
2948 nfs_perror(ret,
2949 "NFS share release error on vp %p: %m.\n",
2950 (void *)vp, NULL);
2951 }
2952 #endif
2953 }
2954 kmem_free(buf, MAX_SHR_OWNER_LEN);
2955 }
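/*
 * Usage sketch (illustrative only): the close path for a process's
 * last reference typically calls
 *
 *	nfs_lockrelease(vp, flag, offset, cr);
 *
 * passing the same flag and offset the close was issued with, so that
 * the generated F_UNLCK and F_UNSHARE requests go through the normal
 * VOP_FRLOCK/VOP_SHRLOCK paths.
 */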
2956
2957 /*
2958 * nfs_lockcompletion:
2959 *
2960 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2961	 * as non-cacheable (set VNOCACHE bit).
2962 */
2963
2964 void
2965 nfs_lockcompletion(vnode_t *vp, int cmd)
2966 {
2967 #ifdef DEBUG
2968 rnode_t *rp = VTOR(vp);
2969
2970 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2971 #endif
2972
2973 if (cmd == F_SETLK || cmd == F_SETLKW) {
2974 if (!lm_safemap(vp)) {
2975 mutex_enter(&vp->v_lock);
2976 vp->v_flag |= VNOCACHE;
2977 mutex_exit(&vp->v_lock);
2978 } else {
2979 mutex_enter(&vp->v_lock);
2980 vp->v_flag &= ~VNOCACHE;
2981 mutex_exit(&vp->v_lock);
2982 }
2983 }
2984 /*
2985 * The cached attributes of the file are stale after acquiring
2986 * the lock on the file. They were updated when the file was
2987 * opened, but not updated when the lock was acquired. Therefore the
2988 * cached attributes are invalidated after the lock is obtained.
2989 */
2990 PURGE_ATTRCACHE(vp);
2991 }
2992
2993 /*
2994 * The lock manager holds state making it possible for the client
2995 * and server to be out of sync. For example, if the response from
2996 * the server granting a lock request is lost, the server will think
2997 * the lock is granted and the client will think the lock is lost.
2998	 * The client can tell when it is not certain whether it is in sync
2999	 * with the server.
3000 *
3001 * To deal with this, a list of processes for which the client is
3002 * not sure if the server holds a lock is attached to the rnode.
3003 * When such a process closes the rnode, an unlock request is sent
3004 * to the server to unlock the entire file.
3005 *
3006	 * The list is kept as a singly linked, NULL-terminated list.
3007 * Because it is only added to under extreme error conditions, the
3008 * list shouldn't get very big. DEBUG kernels print a message if
3009	 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily
3010	 * chosen to be 8, but can be tuned at runtime.
3011 */
3012 #ifdef DEBUG
3013 /* int nfs_lmpl_high_water = 8; */
3014 int nfs_lmpl_high_water = 128;
3015 int nfs_cnt_add_locking_id = 0;
3016 int nfs_len_add_locking_id = 0;
3017 #endif /* DEBUG */
3018
3019 /*
3020 * Record that the nfs lock manager server may be holding a lock on
3021 * a vnode for a process.
3022 *
3023 * Because the nfs lock manager server holds state, it is possible
3024 * for the server to get out of sync with the client. This routine is called
3025 * from the client when it is no longer sure if the server is in sync
3026 * with the client. nfs_lockrelease() will then notice this and send
3027	 * an unlock request when the file is closed.
3028 */
3029 void
3030 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
3031 {
3032 rnode_t *rp;
3033 lmpl_t *new;
3034 lmpl_t *cur;
3035 lmpl_t **lmplp;
3036 #ifdef DEBUG
3037 int list_len = 1;
3038 #endif /* DEBUG */
3039
3040 #ifdef DEBUG
3041 ++nfs_cnt_add_locking_id;
3042 #endif /* DEBUG */
3043 /*
3044 * allocate new lmpl_t now so we don't sleep
3045 * later after grabbing mutexes
3046 */
3047 ASSERT(len < MAX_SHR_OWNER_LEN);
3048 new = kmem_alloc(sizeof (*new), KM_SLEEP);
3049 new->lmpl_type = type;
3050 new->lmpl_pid = pid;
3051 new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
3052 bcopy(id, new->lmpl_owner, len);
3053 new->lmpl_own_len = len;
3054 new->lmpl_next = (lmpl_t *)NULL;
3055 #ifdef DEBUG
3056 if (type == RLMPL_PID) {
3057 ASSERT(len == sizeof (pid_t));
3058 ASSERT(pid == *(pid_t *)new->lmpl_owner);
3059 } else {
3060 ASSERT(type == RLMPL_OWNER);
3061 }
3062 #endif
3063
3064 rp = VTOR(vp);
3065 mutex_enter(&rp->r_statelock);
3066
3067 /*
3068 * Add this id to the list for this rnode only if the
3069 * rnode is active and the id is not already there.
3070 */
3071 ASSERT(rp->r_flags & RHASHED);
3072 lmplp = &(rp->r_lmpl);
3073 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3074 if (cur->lmpl_pid == pid &&
3075 cur->lmpl_type == type &&
3076 cur->lmpl_own_len == len &&
3077 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
3078 kmem_free(new->lmpl_owner, len);
3079 kmem_free(new, sizeof (*new));
3080 break;
3081 }
3082 lmplp = &cur->lmpl_next;
3083 #ifdef DEBUG
3084 ++list_len;
3085 #endif /* DEBUG */
3086 }
3087 if (cur == (lmpl_t *)NULL) {
3088 *lmplp = new;
3089 #ifdef DEBUG
3090 if (list_len > nfs_len_add_locking_id) {
3091 nfs_len_add_locking_id = list_len;
3092 }
3093 if (list_len > nfs_lmpl_high_water) {
3094 cmn_err(CE_WARN, "nfs_add_locking_id: long list "
3095 "vp=%p is %d", (void *)vp, list_len);
3096 }
3097 #endif /* DEBUG */
3098 }
3099
3100 #ifdef DEBUG
3101 if (share_debug) {
3102 int nitems = 0;
3103 int npids = 0;
3104 int nowners = 0;
3105
3106 /*
3107		 * Count the number of things left on r_lmpl after the add.
3108 */
3109 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3110 cur = cur->lmpl_next) {
3111 nitems++;
3112 if (cur->lmpl_type == RLMPL_PID) {
3113 npids++;
3114 } else if (cur->lmpl_type == RLMPL_OWNER) {
3115 nowners++;
3116 } else {
3117 cmn_err(CE_PANIC, "nfs_add_locking_id: "
3118 "unrecognized lmpl_type %d",
3119 cur->lmpl_type);
3120 }
3121 }
3122
3123 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
3124 "OWNs = %d items left on r_lmpl\n",
3125 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
3126 }
3127 #endif
3128
3129 mutex_exit(&rp->r_statelock);
3130 }
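/*
 * Usage sketch (illustrative only): when a lock or share RPC ends in a
 * state where the server may or may not hold the lock (e.g. an
 * interrupted request), the caller records the uncertainty so that the
 * close-time cleanup in nfs_lockrelease() fires:
 *
 *	pid_t pid = ttoproc(curthread)->p_pid;
 *
 *	nfs_add_locking_id(vp, pid, RLMPL_PID, (char *)&pid,
 *	    sizeof (pid_t));
 */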
3131
3132 /*
3133 * Remove an id from the lock manager id list.
3134 *
3135 * If the id is not in the list return 0. If it was found and
3136 * removed, return 1.
3137 */
3138 static int
3139 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
3140 {
3141 lmpl_t *cur;
3142 lmpl_t **lmplp;
3143 rnode_t *rp;
3144 int rv = 0;
3145
3146 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
3147
3148 rp = VTOR(vp);
3149
3150 mutex_enter(&rp->r_statelock);
3151 ASSERT(rp->r_flags & RHASHED);
3152 lmplp = &(rp->r_lmpl);
3153
3154 /*
3155 * Search through the list and remove the entry for this id
3156 * if it is there. The special case id == NULL allows removal
3157 * of the first share on the r_lmpl list belonging to the
3158 * current process (if any), without regard to further details
3159 * of its identity.
3160 */
3161 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3162 if (cur->lmpl_type == type &&
3163 cur->lmpl_pid == curproc->p_pid &&
3164 (id == (char *)NULL ||
3165 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
3166 *lmplp = cur->lmpl_next;
3167 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3168 if (rid != NULL) {
3169 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3170 *rlen = cur->lmpl_own_len;
3171 }
3172 kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3173 kmem_free(cur, sizeof (*cur));
3174 rv = 1;
3175 break;
3176 }
3177 lmplp = &cur->lmpl_next;
3178 }
3179
3180 #ifdef DEBUG
3181 if (share_debug) {
3182 int nitems = 0;
3183 int npids = 0;
3184 int nowners = 0;
3185
3186 /*
3187 * Count the number of things left on r_lmpl after the remove.
3188 */
3189 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3190 cur = cur->lmpl_next) {
3191 nitems++;
3192 if (cur->lmpl_type == RLMPL_PID) {
3193 npids++;
3194 } else if (cur->lmpl_type == RLMPL_OWNER) {
3195 nowners++;
3196 } else {
3197 cmn_err(CE_PANIC,
3198 "nrli: unrecognized lmpl_type %d",
3199 cur->lmpl_type);
3200 }
3201 }
3202
3203 cmn_err(CE_CONT,
3204 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
3205 (type == RLMPL_PID) ? "P" : "O",
3206 npids,
3207 nowners,
3208 nitems);
3209 }
3210 #endif
3211
3212 mutex_exit(&rp->r_statelock);
3213 return (rv);
3214 }
3215
3216 void
3217 nfs_free_mi(mntinfo_t *mi)
3218 {
3219 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3220 ASSERT(mi->mi_manager_thread == NULL);
3221 ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
3222 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0);
3223
3224 /*
3225 * Remove the node from the global list before we start tearing it down.
3226 */
3227 nfs_mi_zonelist_remove(mi);
3228 if (mi->mi_klmconfig) {
3229 lm_free_config(mi->mi_klmconfig);
3230 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3231 }
3232 mutex_destroy(&mi->mi_lock);
3233 mutex_destroy(&mi->mi_remap_lock);
3234 mutex_destroy(&mi->mi_async_lock);
3235 mutex_destroy(&mi->mi_rnodes_lock);
3236 cv_destroy(&mi->mi_failover_cv);
3237 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]);
3238 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]);
3239 cv_destroy(&mi->mi_async_reqs_cv);
3240 cv_destroy(&mi->mi_async_cv);
3241 list_destroy(&mi->mi_rnodes);
3242 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS);
3243 kmem_free(mi, sizeof (*mi));
3244 }
3245
3246 static int
3247 mnt_kstat_update(kstat_t *ksp, int rw)
3248 {
3249 mntinfo_t *mi;
3250 struct mntinfo_kstat *mik;
3251 vfs_t *vfsp;
3252 int i;
3253
3254 /* this is a read-only kstat. Bail out on a write */
3255 if (rw == KSTAT_WRITE)
3256 return (EACCES);
3257
3258 /*
3259 * We don't want to wait here as kstat_chain_lock could be held by
3260 * dounmount(). dounmount() takes vfs_reflock before the chain lock
3261 * and thus could lead to a deadlock.
3262 */
3263 vfsp = (struct vfs *)ksp->ks_private;
3264
3265
3266 mi = VFTOMI(vfsp);
3267
3268 mik = (struct mntinfo_kstat *)ksp->ks_data;
3269
3270 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3271 mik->mik_vers = (uint32_t)mi->mi_vers;
3272 mik->mik_flags = mi->mi_flags;
3273 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3274 mik->mik_curread = (uint32_t)mi->mi_curread;
3275 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3276 mik->mik_retrans = mi->mi_retrans;
3277 mik->mik_timeo = mi->mi_timeo;
3278 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3279 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3280 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3281 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
3282 for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3283 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3284 mik->mik_timers[i].deviate =
3285 (uint32_t)mi->mi_timers[i].rt_deviate;
3286 mik->mik_timers[i].rtxcur =
3287 (uint32_t)mi->mi_timers[i].rt_rtxcur;
3288 }
3289 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3290 mik->mik_failover = (uint32_t)mi->mi_failover;
3291 mik->mik_remap = (uint32_t)mi->mi_remap;
3292 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3293
3294 return (0);
3295 }
3296
3297 void
3298 nfs_mnt_kstat_init(struct vfs *vfsp)
3299 {
3300 mntinfo_t *mi = VFTOMI(vfsp);
3301
3302 /*
3303 * Create the version specific kstats.
3304 *
3305 * PSARC 2001/697 Contract Private Interface
3306 * All nfs kstats are under SunMC contract
3307 * Please refer to the PSARC listed above and contact
3308 * SunMC before making any changes!
3309 *
3310 * Changes must be reviewed by Solaris File Sharing
3311 * Changes must be communicated to contract-2001-697@sun.com
3312 *
3313 */
3314
3315 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3316 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3317 if (mi->mi_io_kstats) {
3318 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3319 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3320 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3321 kstat_install(mi->mi_io_kstats);
3322 }
3323
3324 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3325 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3326 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3327 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3328 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3329 mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3330 mi->mi_ro_kstats->ks_private = (void *)vfsp;
3331 kstat_install(mi->mi_ro_kstats);
3332 }
3333 }
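/*
 * Observability note (illustrative command line; <minor> is a
 * placeholder for the mount's minor device number): the kstats created
 * above show up under the "nfs" module, so something like
 *
 *	kstat -m nfs -i <minor> -n mntinfo
 *
 * from userland dumps the mntinfo_kstat snapshot filled in by
 * mnt_kstat_update(), while the KSTAT_TYPE_IO kstat feeds the
 * per-mount I/O statistics reported by iostat(1M).
 */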
3334
3335 nfs_delmapcall_t *
3336 nfs_init_delmapcall()
3337 {
3338 nfs_delmapcall_t *delmap_call;
3339
3340 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3341 delmap_call->call_id = curthread;
3342 delmap_call->error = 0;
3343
3344 return (delmap_call);
3345 }
3346
3347 void
3348 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
3349 {
3350 kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3351 }
3352
3353 /*
3354 * Searches for the current delmap caller (based on curthread) in the list of
3355 * callers. If it is found, we remove it and free the delmap caller.
3356 * Returns:
3357 * 0 if the caller wasn't found
3358 * 1 if the caller was found, removed and freed. *errp is set to what
3359 * the result of the delmap was.
3360 */
3361 int
3362 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3363 {
3364 nfs_delmapcall_t *delmap_call;
3365
3366 /*
3367 * If the list doesn't exist yet, we create it and return
3368 * that the caller wasn't found. No list = no callers.
3369 */
3370 mutex_enter(&rp->r_statelock);
3371 if (!(rp->r_flags & RDELMAPLIST)) {
3372 /* The list does not exist */
3373 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3374 offsetof(nfs_delmapcall_t, call_node));
3375 rp->r_flags |= RDELMAPLIST;
3376 mutex_exit(&rp->r_statelock);
3377 return (0);
3378 } else {
3379 /* The list exists so search it */
3380 for (delmap_call = list_head(&rp->r_indelmap);
3381 delmap_call != NULL;
3382 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3383 if (delmap_call->call_id == curthread) {
3384 /* current caller is in the list */
3385 *errp = delmap_call->error;
3386 list_remove(&rp->r_indelmap, delmap_call);
3387 mutex_exit(&rp->r_statelock);
3388 nfs_free_delmapcall(delmap_call);
3389 return (1);
3390 }
3391 }
3392 }
3393 mutex_exit(&rp->r_statelock);
3394 return (0);
3395 }
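/*
 * Usage sketch (illustrative only; the exact guard logic in the real
 * delmap VOP implementations differs): the helpers above are paired
 * roughly as follows:
 *
 *	delmap_call = nfs_init_delmapcall();
 *	mutex_enter(&rp->r_statelock);
 *	list_insert_tail(&rp->r_indelmap, delmap_call);
 *	mutex_exit(&rp->r_statelock);
 *	... kick off the async delmap work, which stores its result in
 *	    delmap_call->error ...
 *
 * and a later call into the same VOP by the same thread uses
 * nfs_find_and_delete_delmapcall(rp, &error) to pick up that result
 * instead of redoing the work.
 */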
3396