1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1991, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Rick Macklem at The University of Guelph.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35
36 #include <sys/cdefs.h>
37 /*
38 * Socket operations for use by nfs
39 */
40
41 #include "opt_kgssapi.h"
42 #include "opt_nfs.h"
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
47 #include <sys/limits.h>
48 #include <sys/lock.h>
49 #include <sys/malloc.h>
50 #include <sys/mbuf.h>
51 #include <sys/mount.h>
52 #include <sys/mutex.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/syscallsubr.h>
56 #include <sys/sysctl.h>
57 #include <sys/syslog.h>
58 #include <sys/vnode.h>
59
60 #include <rpc/rpc.h>
61 #include <rpc/krpc.h>
62
63 #include <kgssapi/krb5/kcrypto.h>
64
65 #include <fs/nfs/nfsport.h>
66
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>

/*
 * DTrace hook pointers for NFS client RPCs.  newnfs_request() tests the
 * start hook against NULL before calling it, so nothing is traced while
 * the pointer is NULL.
 */
dtrace_nfsclient_nfs23_start_probe_func_t
		dtrace_nfscl_nfs234_start_probe;

dtrace_nfsclient_nfs23_done_probe_func_t
		dtrace_nfscl_nfs234_done_probe;

/*
 * Registered probes by RPC type.
 * newnfs_request() only fires the start hook when the looked-up probe id
 * is non-zero, so a zero entry means "no probe for this procedure".
 *
 * NOTE(review): these tables hold NFSV41_NPROCS + 1 entries, yet
 * newnfs_request() indexes the NFSv4 table with nd_procnum, which also
 * indexes the NFSV42_NPROCS-sized nfscl_use_gss[] table.  Confirm that no
 * procedure number >= NFSV41_NPROCS + 1 can reach that lookup.
 */
uint32_t	nfscl_nfs2_start_probes[NFSV41_NPROCS + 1];
uint32_t	nfscl_nfs2_done_probes[NFSV41_NPROCS + 1];

uint32_t	nfscl_nfs3_start_probes[NFSV41_NPROCS + 1];
uint32_t	nfscl_nfs3_done_probes[NFSV41_NPROCS + 1];

uint32_t	nfscl_nfs4_start_probes[NFSV41_NPROCS + 1];
uint32_t	nfscl_nfs4_done_probes[NFSV41_NPROCS + 1];
#endif
88
/*
 * Lock declarations.  Each of these names is a macro supplied by the NFS
 * headers (presumably expanding to a declaration of the matching lock;
 * confirm against nfsport.h).
 */
NFSSTATESPINLOCK;
NFSREQSPINLOCK;
NFSDLOCKMUTEX;
NFSCLSTATEMUTEX;

/* State defined in other parts of the NFS code. */
extern struct nfsstatsv1 nfsstatsv1;
/* Queue onto which newnfs_request() chains outstanding NFSv4 requests. */
extern struct nfsreqhead nfsd_reqq;
extern int nfscl_ticks;
extern void (*ncl_call_invalcaches)(struct vnode *);
/* Tested and adjusted under NFSD_LOCK() while nfscbd_pool is in use. */
extern int nfs_numnfscbd;
extern int nfscl_debuglevel;
/* newnfs_connect() derives the DS connection timeout from lease / 4. */
extern int nfsrv_lease;

/* Service pool handed to svc_vc_create_backchannel() by newnfs_connect(). */
SVCPOOL		*nfscbd_pool;
/* vfs.nfs.bufpackets; newnfs_connect() clamps it to the range 2..64. */
int		nfs_bufpackets = 4;
/* Non-zero lets callbacks to LCL_GSS clients use RPCSEC_GSS flavours. */
static int	nfsrv_gsscallbackson = 0;
/* The variables below back the vfs.nfs sysctls of the same names. */
static int	nfs_reconnects;
static int	nfs3_jukebox_delay = 10;
static int	nfs_skip_wcc_data_onerr = 1;
/* Retry limit applied to DS connections made with cred == NULL. */
static int	nfs_dsretries = 2;
/*
 * NOTE(review): presumably the upper bound for the "try later" retry delay
 * used by newnfs_request(); the use is outside the visible code - confirm.
 */
static struct timespec	nfs_trylater_max = {
	.tv_sec =	NFS_TRYLATERDEL,
	.tv_nsec =	0,
};
112
113 SYSCTL_DECL(_vfs_nfs);
114
115 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
116 "Buffer reservation size 2 < x < 64");
117 SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
118 "Number of times the nfs client has had to reconnect");
119 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
120 "Number of seconds to delay a retry after receiving EJUKEBOX");
121 SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
122 "Disable weak cache consistency checking when server returns an error");
123 SYSCTL_INT(_vfs_nfs, OID_AUTO, dsretries, CTLFLAG_RW, &nfs_dsretries, 0,
124 "Number of retries for a DS RPC before failure");
125
/*
 * Forward declarations of file-local helpers.  nfs_down() and nfs_up() are
 * called from nfs_feedback() to report a server as "not responding" and
 * "is alive again" respectively.
 */
static void	nfs_down(struct nfsmount *, struct thread *, const char *,
    int, int);
static void	nfs_up(struct nfsmount *, struct thread *, const char *,
    int, int);
static int	nfs_msg(struct thread *, const char *, const char *, int);
131
/*
 * A reference counted RPC auth handle tagged with the uid it belongs to.
 * NOTE(review): no use of this structure is visible in this part of the
 * file; it may be a leftover - confirm before relying on it.
 */
struct nfs_cached_auth {
	int		ca_refs; /* refcount, including 1 from the cache */
	uid_t		ca_uid;	 /* uid that corresponds to this auth */
	AUTH		*ca_auth; /* RPC auth handle */
};
137
/*
 * Translation table from the procedure number carried in nd_procnum to the
 * NFSv2 procedure number sent on the wire.  newnfs_request() consults it
 * for ND_NFSV2 requests whose nd_procnum is below NFS_V3NPROCS and uses
 * NFSV2PROC_NOOP for anything larger.  The table is positional, so the
 * entry order must not change; the number in each comment is the slot
 * index.  Slots holding NFSV2PROC_NOOP presumably correspond to procedures
 * that have no NFSv2 equivalent (see the NFSv3 procedure list to confirm).
 */
static int nfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,		/*  0 */
	NFSV2PROC_GETATTR,	/*  1 */
	NFSV2PROC_SETATTR,	/*  2 */
	NFSV2PROC_LOOKUP,	/*  3 */
	NFSV2PROC_NOOP,		/*  4 */
	NFSV2PROC_READLINK,	/*  5 */
	NFSV2PROC_READ,		/*  6 */
	NFSV2PROC_WRITE,	/*  7 */
	NFSV2PROC_CREATE,	/*  8 */
	NFSV2PROC_MKDIR,	/*  9 */
	NFSV2PROC_SYMLINK,	/* 10 */
	NFSV2PROC_CREATE,	/* 11 (shares CREATE with slot 8) */
	NFSV2PROC_REMOVE,	/* 12 */
	NFSV2PROC_RMDIR,	/* 13 */
	NFSV2PROC_RENAME,	/* 14 */
	NFSV2PROC_LINK,		/* 15 */
	NFSV2PROC_READDIR,	/* 16 */
	NFSV2PROC_NOOP,		/* 17 */
	NFSV2PROC_STATFS,	/* 18 */
	NFSV2PROC_NOOP,		/* 19 */
	NFSV2PROC_NOOP,		/* 20 */
	NFSV2PROC_NOOP,		/* 21 */
};
162
/*
 * This static array indicates whether an NFSv4 RPC should use
 * RPCSEC_GSS, if the mount indicates that via sec=krb5[ip].
 * System RPCs that do not use file handles are false in this
 * array so that they will use AUTH_SYS when the "syskrb5" mount
 * option is specified along with "sec=krb5[ip]".
 *
 * The array is indexed by nd_procnum (see newnfs_request()), so it is
 * positional: the number in each comment is the slot index and the
 * entry order must not change.
 */
static bool nfscl_use_gss[NFSV42_NPROCS] = {
	true,	/*  0 */
	true,	/*  1 */
	true,	/*  2 */
	true,	/*  3 */
	true,	/*  4 */
	true,	/*  5 */
	true,	/*  6 */
	true,	/*  7 */
	true,	/*  8 */
	true,	/*  9 */
	true,	/* 10 */
	true,	/* 11 */
	true,	/* 12 */
	true,	/* 13 */
	true,	/* 14 */
	true,	/* 15 */
	true,	/* 16 */
	true,	/* 17 */
	true,	/* 18 */
	true,	/* 19 */
	true,	/* 20 */
	true,	/* 21 */
	true,	/* 22 */
	false,	/* 23: SetClientID */
	false,	/* 24: SetClientIDConfirm */
	true,	/* 25 */
	true,	/* 26 */
	true,	/* 27 */
	true,	/* 28 */
	true,	/* 29 */
	true,	/* 30 */
	true,	/* 31 */
	false,	/* 32: Renew */
	true,	/* 33 */
	false,	/* 34: ReleaseLockOwn */
	true,	/* 35 */
	true,	/* 36 */
	true,	/* 37 */
	true,	/* 38 */
	true,	/* 39 */
	true,	/* 40 */
	false,	/* 41: ExchangeID */
	false,	/* 42: CreateSession */
	false,	/* 43: DestroySession */
	false,	/* 44: DestroyClientID */
	false,	/* 45: FreeStateID */
	true,	/* 46 */
	true,	/* 47 */
	true,	/* 48 */
	true,	/* 49 */
	false,	/* 50: ReclaimComplete */
	true,	/* 51 */
	true,	/* 52 */
	true,	/* 53 */
	true,	/* 54 */
	true,	/* 55 */
	true,	/* 56 */
	true,	/* 57 */
	true,	/* 58 */
	true,	/* 59 */
	true,	/* 60 */
	true,	/* 61 */
	true,	/* 62 */
	true,	/* 63 */
	true,	/* 64 */
	false,	/* 65: BindConnectionToSession */
	true,	/* 66 */
	true,	/* 67 */
	true,	/* 68 */
	true,	/* 69 */
};
243
244 /*
245 * Initialize sockets and congestion for a new NFS connection.
246 * We do not free the sockaddr if error.
247 * Which arguments are set to NULL indicate what kind of call it is.
248 * cred == NULL --> a call to connect to a pNFS DS
249 * nmp == NULL --> indicates an upcall to userland or a NFSv4.0 callback
250 */
251 int
newnfs_connect(struct nfsmount * nmp,struct nfssockreq * nrp,struct ucred * cred,NFSPROC_T * p,int callback_retry_mult,bool dotls,struct __rpc_client ** clipp)252 newnfs_connect(struct nfsmount *nmp, struct nfssockreq *nrp,
253 struct ucred *cred, NFSPROC_T *p, int callback_retry_mult, bool dotls,
254 struct __rpc_client **clipp)
255 {
256 int rcvreserve, sndreserve;
257 int pktscale, pktscalesav;
258 struct sockaddr *saddr;
259 struct ucred *origcred;
260 CLIENT *client;
261 struct netconfig *nconf;
262 struct socket *so;
263 int one = 1, retries, error = 0;
264 struct thread *td = curthread;
265 SVCXPRT *xprt;
266 struct timeval timo;
267 uint64_t tval;
268
269 /*
270 * We need to establish the socket using the credentials of
271 * the mountpoint. Some parts of this process (such as
272 * sobind() and soconnect()) will use the curent thread's
273 * credential instead of the socket credential. To work
274 * around this, temporarily change the current thread's
275 * credential to that of the mountpoint.
276 *
277 * XXX: It would be better to explicitly pass the correct
278 * credential to sobind() and soconnect().
279 */
280 origcred = td->td_ucred;
281
282 /*
283 * Use the credential in nr_cred, if not NULL.
284 */
285 if (nrp->nr_cred != NULL)
286 td->td_ucred = nrp->nr_cred;
287 else
288 td->td_ucred = cred;
289 saddr = nrp->nr_nam;
290
291 if (saddr->sa_family == AF_INET)
292 if (nrp->nr_sotype == SOCK_DGRAM)
293 nconf = getnetconfigent("udp");
294 else
295 nconf = getnetconfigent("tcp");
296 else
297 if (nrp->nr_sotype == SOCK_DGRAM)
298 nconf = getnetconfigent("udp6");
299 else
300 nconf = getnetconfigent("tcp6");
301
302 pktscale = nfs_bufpackets;
303 if (pktscale < 2)
304 pktscale = 2;
305 if (pktscale > 64)
306 pktscale = 64;
307 pktscalesav = pktscale;
308 /*
309 * soreserve() can fail if sb_max is too small, so shrink pktscale
310 * and try again if there is an error.
311 * Print a log message suggesting increasing sb_max.
312 * Creating a socket and doing this is necessary since, if the
313 * reservation sizes are too large and will make soreserve() fail,
314 * the connection will work until a large send is attempted and
315 * then it will loop in the krpc code.
316 */
317 so = NULL;
318 saddr = NFSSOCKADDR(nrp->nr_nam, struct sockaddr *);
319 error = socreate(saddr->sa_family, &so, nrp->nr_sotype,
320 nrp->nr_soproto, td->td_ucred, td);
321 if (error != 0)
322 goto out;
323 do {
324 if (error != 0 && pktscale > 2) {
325 if (nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
326 pktscale == pktscalesav) {
327 /*
328 * Suggest vfs.nfs.bufpackets * maximum RPC message,
329 * adjusted for the sb_max->sb_max_adj conversion of
330 * MCLBYTES / (MSIZE + MCLBYTES) as the minimum setting
331 * for kern.ipc.maxsockbuf.
332 */
333 tval = (NFS_MAXBSIZE + NFS_MAXXDR) * nfs_bufpackets;
334 tval *= MSIZE + MCLBYTES;
335 tval += MCLBYTES - 1; /* Round up divide by MCLBYTES. */
336 tval /= MCLBYTES;
337 printf("Consider increasing kern.ipc.maxsockbuf to a "
338 "minimum of %ju to support %ubyte NFS I/O\n",
339 (uintmax_t)tval, NFS_MAXBSIZE);
340 }
341 pktscale--;
342 }
343 if (nrp->nr_sotype == SOCK_DGRAM) {
344 if (nmp != NULL) {
345 sndreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
346 pktscale;
347 rcvreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
348 pktscale;
349 } else {
350 sndreserve = rcvreserve = 1024 * pktscale;
351 }
352 } else {
353 if (nrp->nr_sotype != SOCK_STREAM)
354 panic("nfscon sotype");
355 if (nmp != NULL) {
356 sndreserve = (NFS_MAXBSIZE + NFS_MAXXDR) *
357 pktscale;
358 rcvreserve = (NFS_MAXBSIZE + NFS_MAXXDR) *
359 pktscale;
360 } else {
361 sndreserve = rcvreserve = 1024 * pktscale;
362 }
363 }
364 error = soreserve(so, sndreserve, rcvreserve);
365 if (error != 0 && nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
366 pktscale <= 2)
367 printf("Must increase kern.ipc.maxsockbuf or reduce"
368 " rsize, wsize\n");
369 } while (error != 0 && pktscale > 2);
370 soclose(so);
371 if (error != 0)
372 goto out;
373
374 client = clnt_reconnect_create(nconf, saddr, nrp->nr_prog,
375 nrp->nr_vers, sndreserve, rcvreserve);
376 CLNT_CONTROL(client, CLSET_WAITCHAN, "nfsreq");
377 if (nmp != NULL) {
378 if ((nmp->nm_flag & NFSMNT_INT))
379 CLNT_CONTROL(client, CLSET_INTERRUPTIBLE, &one);
380 if ((nmp->nm_flag & NFSMNT_RESVPORT))
381 CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
382 if (NFSHASTLS(nmp)) {
383 CLNT_CONTROL(client, CLSET_TLS, &one);
384 if (nmp->nm_tlscertname != NULL)
385 CLNT_CONTROL(client, CLSET_TLSCERTNAME,
386 nmp->nm_tlscertname);
387 }
388 if (NFSHASSOFT(nmp)) {
389 if (nmp->nm_sotype == SOCK_DGRAM)
390 /*
391 * For UDP, the large timeout for a reconnect
392 * will be set to "nm_retry * nm_timeo / 2", so
393 * we only want to do 2 reconnect timeout
394 * retries.
395 */
396 retries = 2;
397 else
398 retries = nmp->nm_retry;
399 } else
400 retries = INT_MAX;
401 if (NFSHASNFSV4N(nmp)) {
402 if (cred != NULL) {
403 if (NFSHASSOFT(nmp)) {
404 /*
405 * This should be a DS mount.
406 * Use CLSET_TIMEOUT to set the timeout
407 * for connections to DSs instead of
408 * specifying a timeout on each RPC.
409 * This is done so that SO_SNDTIMEO
410 * is set on the TCP socket as well
411 * as specifying a time limit when
412 * waiting for an RPC reply. Useful
413 * if the send queue for the TCP
414 * connection has become constipated,
415 * due to a failed DS.
416 * The choice of lease_duration / 4 is
417 * fairly arbitrary, but seems to work
418 * ok, with a lower bound of 10sec.
419 */
420 timo.tv_sec = nfsrv_lease / 4;
421 if (timo.tv_sec < 10)
422 timo.tv_sec = 10;
423 timo.tv_usec = 0;
424 CLNT_CONTROL(client, CLSET_TIMEOUT,
425 &timo);
426 }
427 /*
428 * Make sure the nfscbd_pool doesn't get
429 * destroyed while doing this.
430 */
431 NFSD_LOCK();
432 if (nfs_numnfscbd > 0) {
433 nfs_numnfscbd++;
434 NFSD_UNLOCK();
435 xprt = svc_vc_create_backchannel(
436 nfscbd_pool);
437 CLNT_CONTROL(client, CLSET_BACKCHANNEL,
438 xprt);
439 NFSD_LOCK();
440 nfs_numnfscbd--;
441 if (nfs_numnfscbd == 0)
442 wakeup(&nfs_numnfscbd);
443 }
444 NFSD_UNLOCK();
445 } else {
446 /*
447 * cred == NULL for a DS connect.
448 * For connects to a DS, set a retry limit
449 * so that failed DSs will be detected.
450 * This is ok for NFSv4.1, since a DS does
451 * not maintain open/lock state and is the
452 * only case where using a "soft" mount is
453 * recommended for NFSv4.
454 * For mounts from the MDS to DS, this is done
455 * via mount options, but that is not the case
456 * here. The retry limit here can be adjusted
457 * via the sysctl vfs.nfs.dsretries.
458 * See the comment above w.r.t. timeout.
459 */
460 timo.tv_sec = nfsrv_lease / 4;
461 if (timo.tv_sec < 10)
462 timo.tv_sec = 10;
463 timo.tv_usec = 0;
464 CLNT_CONTROL(client, CLSET_TIMEOUT, &timo);
465 retries = nfs_dsretries;
466 }
467 }
468 } else {
469 /*
470 * Three cases:
471 * - Null RPC callback to client
472 * - Non-Null RPC callback to client, wait a little longer
473 * - upcalls to nfsuserd and gssd (clp == NULL)
474 */
475 if (callback_retry_mult == 0) {
476 retries = NFSV4_UPCALLRETRY;
477 CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
478 } else {
479 retries = NFSV4_CALLBACKRETRY * callback_retry_mult;
480 }
481 if (dotls)
482 CLNT_CONTROL(client, CLSET_TLS, &one);
483 }
484 CLNT_CONTROL(client, CLSET_RETRIES, &retries);
485
486 if (nmp != NULL) {
487 /*
488 * For UDP, there are 2 timeouts:
489 * - CLSET_RETRY_TIMEOUT sets the initial timeout for the timer
490 * that does a retransmit of an RPC request using the same
491 * socket and xid. This is what you normally want to do,
492 * since NFS servers depend on "same xid" for their
493 * Duplicate Request Cache.
494 * - timeout specified in CLNT_CALL_MBUF(), which specifies when
495 * retransmits on the same socket should fail and a fresh
496 * socket created. Each of these timeouts counts as one
497 * CLSET_RETRIES as set above.
498 * Set the initial retransmit timeout for UDP. This timeout
499 * doesn't exist for TCP and the following call just fails,
500 * which is ok.
501 */
502 timo.tv_sec = nmp->nm_timeo / NFS_HZ;
503 timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
504 CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, &timo);
505 }
506
507 /*
508 * *clipp is &nrp->nr_client or &nm_aconn[nmp->nm_nextaconn].
509 * The latter case is for additional connections specified by the
510 * "nconnect" mount option. nr_mtx etc is used for these additional
511 * connections, as well as nr_client in the nfssockreq
512 * structure for the mount.
513 */
514 mtx_lock(&nrp->nr_mtx);
515 if (*clipp != NULL) {
516 mtx_unlock(&nrp->nr_mtx);
517 /*
518 * Someone else already connected.
519 */
520 CLNT_RELEASE(client);
521 } else {
522 *clipp = client;
523 /*
524 * Protocols that do not require connections may be optionally
525 * left unconnected for servers that reply from a port other
526 * than NFS_PORT.
527 */
528 if (nmp == NULL || (nmp->nm_flag & NFSMNT_NOCONN) == 0) {
529 mtx_unlock(&nrp->nr_mtx);
530 CLNT_CONTROL(client, CLSET_CONNECT, &one);
531 } else
532 mtx_unlock(&nrp->nr_mtx);
533 }
534
535 out:
536 /* Restore current thread's credentials. */
537 td->td_ucred = origcred;
538
539 NFSEXITCODE(error);
540 return (error);
541 }
542
543 /*
544 * NFS disconnect. Clean up and unlink.
545 */
546 void
newnfs_disconnect(struct nfsmount * nmp,struct nfssockreq * nrp)547 newnfs_disconnect(struct nfsmount *nmp, struct nfssockreq *nrp)
548 {
549 CLIENT *client, *aconn[NFS_MAXNCONN - 1];
550 int i;
551
552 mtx_lock(&nrp->nr_mtx);
553 if (nrp->nr_client != NULL) {
554 client = nrp->nr_client;
555 nrp->nr_client = NULL;
556 if (nmp != NULL && nmp->nm_aconnect > 0) {
557 for (i = 0; i < nmp->nm_aconnect; i++) {
558 aconn[i] = nmp->nm_aconn[i];
559 nmp->nm_aconn[i] = NULL;
560 }
561 }
562 mtx_unlock(&nrp->nr_mtx);
563 rpc_gss_secpurge_call(client);
564 CLNT_CLOSE(client);
565 CLNT_RELEASE(client);
566 if (nmp != NULL && nmp->nm_aconnect > 0) {
567 for (i = 0; i < nmp->nm_aconnect; i++) {
568 if (aconn[i] != NULL) {
569 rpc_gss_secpurge_call(aconn[i]);
570 CLNT_CLOSE(aconn[i]);
571 CLNT_RELEASE(aconn[i]);
572 }
573 }
574 }
575 } else {
576 mtx_unlock(&nrp->nr_mtx);
577 }
578 }
579
580 static AUTH *
nfs_getauth(struct nfssockreq * nrp,int secflavour,char * clnt_principal,char * srv_principal,gss_OID mech_oid,struct ucred * cred)581 nfs_getauth(struct nfssockreq *nrp, int secflavour, char *clnt_principal,
582 char *srv_principal, gss_OID mech_oid, struct ucred *cred)
583 {
584 rpc_gss_service_t svc;
585 AUTH *auth;
586
587 switch (secflavour) {
588 case RPCSEC_GSS_KRB5:
589 case RPCSEC_GSS_KRB5I:
590 case RPCSEC_GSS_KRB5P:
591 if (!mech_oid) {
592 if (!rpc_gss_mech_to_oid_call("kerberosv5", &mech_oid))
593 return (NULL);
594 }
595 if (secflavour == RPCSEC_GSS_KRB5)
596 svc = rpc_gss_svc_none;
597 else if (secflavour == RPCSEC_GSS_KRB5I)
598 svc = rpc_gss_svc_integrity;
599 else
600 svc = rpc_gss_svc_privacy;
601
602 if (clnt_principal == NULL) {
603 NFSCL_DEBUG(1, "nfs_getauth: clnt princ=NULL, "
604 "srv princ=%s\n", srv_principal);
605 auth = rpc_gss_secfind_call(nrp->nr_client, cred,
606 srv_principal, mech_oid, svc);
607 } else {
608 NFSCL_DEBUG(1, "nfs_getauth: clnt princ=%s "
609 "srv princ=%s\n", clnt_principal, srv_principal);
610 auth = rpc_gss_seccreate_call(nrp->nr_client, cred,
611 clnt_principal, srv_principal, "kerberosv5",
612 svc, NULL, NULL, NULL);
613 return (auth);
614 }
615 if (auth != NULL)
616 return (auth);
617 /* fallthrough */
618 case AUTH_SYS:
619 default:
620 return (authunix_create(cred));
621 }
622 }
623
/*
 * Callback from the RPC code to generate up/down notifications.
 *
 * This structure is the argument handed to nfs_feedback(); newnfs_request()
 * fills one in for each RPC done on behalf of a mount.
 */

struct nfs_feedback_arg {
	struct nfsmount *nf_mount;	/* mount the RPC is being done for */
	int		nf_lastmsg;	/* last tprintf */
	int		nf_tprintfmsg;	/* set TRUE once nfs_down() is called */
	struct thread	*nf_td;		/* passed to nfs_down()/nfs_up() */
};
634
635 static void
nfs_feedback(int type,int proc,void * arg)636 nfs_feedback(int type, int proc, void *arg)
637 {
638 struct nfs_feedback_arg *nf = (struct nfs_feedback_arg *) arg;
639 struct nfsmount *nmp = nf->nf_mount;
640 time_t now;
641
642 switch (type) {
643 case FEEDBACK_REXMIT2:
644 case FEEDBACK_RECONNECT:
645 now = NFSD_MONOSEC;
646 if (nf->nf_lastmsg + nmp->nm_tprintf_delay < now) {
647 nfs_down(nmp, nf->nf_td,
648 "not responding", 0, NFSSTA_TIMEO);
649 nf->nf_tprintfmsg = TRUE;
650 nf->nf_lastmsg = now;
651 }
652 break;
653
654 case FEEDBACK_OK:
655 nfs_up(nf->nf_mount, nf->nf_td,
656 "is alive again", NFSSTA_TIMEO, nf->nf_tprintfmsg);
657 break;
658 }
659 }
660
661 /*
662 * newnfs_request - goes something like this
663 * - does the rpc by calling the krpc layer
664 * - break down rpc header and return with nfs reply
665 * nb: always frees up nd_mreq mbuf list
666 */
667 int
newnfs_request(struct nfsrv_descript * nd,struct nfsmount * nmp,struct nfsclient * clp,struct nfssockreq * nrp,vnode_t vp,struct thread * td,struct ucred * cred,u_int32_t prog,u_int32_t vers,u_char * retsum,int toplevel,u_int64_t * xidp,struct nfsclsession * dssep)668 newnfs_request(struct nfsrv_descript *nd, struct nfsmount *nmp,
669 struct nfsclient *clp, struct nfssockreq *nrp, vnode_t vp,
670 struct thread *td, struct ucred *cred, u_int32_t prog, u_int32_t vers,
671 u_char *retsum, int toplevel, u_int64_t *xidp, struct nfsclsession *dssep)
672 {
673 uint32_t retseq, retval, retval0, slotseq, *tl;
674 int i = 0, j = 0, opcnt, set_sigset = 0, slot;
675 int error = 0, usegssname = 0, secflavour = AUTH_SYS;
676 int freeslot, maxslot, reterr, slotpos, timeo;
677 u_int16_t procnum;
678 u_int nextconn;
679 struct nfs_feedback_arg nf;
680 struct timeval timo;
681 AUTH *auth;
682 struct rpc_callextra ext;
683 enum clnt_stat stat;
684 struct nfsreq *rep = NULL;
685 char *srv_principal = NULL, *clnt_principal = NULL;
686 sigset_t oldset;
687 struct ucred *authcred;
688 struct nfsclsession *sep;
689 uint8_t sessionid[NFSX_V4SESSIONID];
690 bool nextconn_set;
691 struct timespec trylater_delay, ts, waituntil;
692
693 /* Initially 1msec. */
694 trylater_delay.tv_sec = 0;
695 trylater_delay.tv_nsec = 1000000;
696 sep = dssep;
697 if (xidp != NULL)
698 *xidp = 0;
699 /* Reject requests while attempting a forced unmount. */
700 if (nmp != NULL && NFSCL_FORCEDISM(nmp->nm_mountp)) {
701 m_freem(nd->nd_mreq);
702 return (ESTALE);
703 }
704
705 /*
706 * Set authcred, which is used to acquire RPC credentials to
707 * the cred argument, by default. The crhold() should not be
708 * necessary, but will ensure that some future code change
709 * doesn't result in the credential being free'd prematurely.
710 */
711 authcred = crhold(cred);
712
713 /* For client side interruptible mounts, mask off the signals. */
714 if (nmp != NULL && td != NULL && NFSHASINT(nmp)) {
715 newnfs_set_sigmask(td, &oldset);
716 set_sigset = 1;
717 }
718
719 /*
720 * If not already connected call newnfs_connect now.
721 */
722 if (nrp->nr_client == NULL)
723 newnfs_connect(nmp, nrp, cred, td, 0, false, &nrp->nr_client);
724
725 /*
726 * If the "nconnect" mount option was specified and this RPC is
727 * one that can have a large RPC message and is being done through
728 * the NFS/MDS server, use an additional connection. (When the RPC is
729 * being done through the server/MDS, nrp == &nmp->nm_sockreq.)
730 * The "nconnect" mount option normally has minimal effect when the
731 * "pnfs" mount option is specified, since only Readdir RPCs are
732 * normally done through the NFS/MDS server.
733 */
734 nextconn_set = false;
735 if (nmp != NULL && nmp->nm_aconnect > 0 && nrp == &nmp->nm_sockreq &&
736 (nd->nd_procnum == NFSPROC_READ ||
737 nd->nd_procnum == NFSPROC_READDIR ||
738 nd->nd_procnum == NFSPROC_READDIRPLUS ||
739 nd->nd_procnum == NFSPROC_WRITE)) {
740 nextconn = atomic_fetchadd_int(&nmp->nm_nextaconn, 1);
741 nextconn %= nmp->nm_aconnect;
742 nextconn_set = true;
743 if (nmp->nm_aconn[nextconn] == NULL)
744 newnfs_connect(nmp, nrp, cred, td, 0, false,
745 &nmp->nm_aconn[nextconn]);
746 }
747
748 /*
749 * For a client side mount, nmp is != NULL and clp == NULL. For
750 * server calls (callbacks or upcalls), nmp == NULL.
751 */
752 if (clp != NULL) {
753 NFSLOCKSTATE();
754 if ((clp->lc_flags & LCL_GSS) && nfsrv_gsscallbackson) {
755 secflavour = RPCSEC_GSS_KRB5;
756 if (nd->nd_procnum != NFSPROC_NULL) {
757 if (clp->lc_flags & LCL_GSSINTEGRITY)
758 secflavour = RPCSEC_GSS_KRB5I;
759 else if (clp->lc_flags & LCL_GSSPRIVACY)
760 secflavour = RPCSEC_GSS_KRB5P;
761 }
762 }
763 NFSUNLOCKSTATE();
764 } else if (nmp != NULL && NFSHASKERB(nmp) &&
765 nd->nd_procnum != NFSPROC_NULL && (!NFSHASSYSKRB5(nmp) ||
766 nfscl_use_gss[nd->nd_procnum])) {
767 if (NFSHASALLGSSNAME(nmp) && nmp->nm_krbnamelen > 0)
768 nd->nd_flag |= ND_USEGSSNAME;
769 if ((nd->nd_flag & ND_USEGSSNAME) != 0) {
770 /*
771 * If there is a client side host based credential,
772 * use that, otherwise use the system uid, if set.
773 * The system uid is in the nmp->nm_sockreq.nr_cred
774 * credentials.
775 */
776 if (nmp->nm_krbnamelen > 0) {
777 usegssname = 1;
778 clnt_principal = nmp->nm_krbname;
779 } else if (nmp->nm_uid != (uid_t)-1) {
780 KASSERT(nmp->nm_sockreq.nr_cred != NULL,
781 ("newnfs_request: NULL nr_cred"));
782 crfree(authcred);
783 authcred = crhold(nmp->nm_sockreq.nr_cred);
784 }
785 } else if (nmp->nm_krbnamelen == 0 &&
786 nmp->nm_uid != (uid_t)-1 && cred->cr_uid == (uid_t)0) {
787 /*
788 * If there is no host based principal name and
789 * the system uid is set and this is root, use the
790 * system uid, since root won't have user
791 * credentials in a credentials cache file.
792 * The system uid is in the nmp->nm_sockreq.nr_cred
793 * credentials.
794 */
795 KASSERT(nmp->nm_sockreq.nr_cred != NULL,
796 ("newnfs_request: NULL nr_cred"));
797 crfree(authcred);
798 authcred = crhold(nmp->nm_sockreq.nr_cred);
799 }
800 if (NFSHASINTEGRITY(nmp))
801 secflavour = RPCSEC_GSS_KRB5I;
802 else if (NFSHASPRIVACY(nmp))
803 secflavour = RPCSEC_GSS_KRB5P;
804 else
805 secflavour = RPCSEC_GSS_KRB5;
806 if (nrp->nr_srvprinc[0] == '\0')
807 srv_principal = NFSMNT_SRVKRBNAME(nmp);
808 else
809 srv_principal = nrp->nr_srvprinc;
810 } else if (nmp != NULL && (!NFSHASKERB(nmp) || NFSHASSYSKRB5(nmp)) &&
811 nd->nd_procnum != NFSPROC_NULL &&
812 (nd->nd_flag & ND_USEGSSNAME) != 0) {
813 /*
814 * Use the uid that did the mount when the RPC is doing
815 * NFSv4 system operations, as indicated by the
816 * ND_USEGSSNAME flag, for the AUTH_SYS case.
817 * The credentials in nm_sockreq.nr_cred were used for the
818 * mount.
819 */
820 KASSERT(nmp->nm_sockreq.nr_cred != NULL,
821 ("newnfs_request: NULL nr_cred"));
822 crfree(authcred);
823 authcred = crhold(nmp->nm_sockreq.nr_cred);
824 }
825
826 if (nmp != NULL) {
827 bzero(&nf, sizeof(struct nfs_feedback_arg));
828 nf.nf_mount = nmp;
829 nf.nf_td = td;
830 nf.nf_lastmsg = NFSD_MONOSEC -
831 ((nmp->nm_tprintf_delay)-(nmp->nm_tprintf_initial_delay));
832 }
833
834 if (nd->nd_procnum == NFSPROC_NULL)
835 auth = authnone_create();
836 else if (usegssname) {
837 /*
838 * For this case, the authenticator is held in the
839 * nfssockreq structure, so don't release the reference count
840 * held on it. --> Don't AUTH_DESTROY() it in this function.
841 */
842 if (nrp->nr_auth == NULL)
843 nrp->nr_auth = nfs_getauth(nrp, secflavour,
844 clnt_principal, srv_principal, NULL, authcred);
845 else
846 rpc_gss_refresh_auth_call(nrp->nr_auth);
847 auth = nrp->nr_auth;
848 } else
849 auth = nfs_getauth(nrp, secflavour, NULL,
850 srv_principal, NULL, authcred);
851 crfree(authcred);
852 if (auth == NULL) {
853 m_freem(nd->nd_mreq);
854 if (set_sigset)
855 newnfs_restore_sigmask(td, &oldset);
856 return (EACCES);
857 }
858 bzero(&ext, sizeof(ext));
859 ext.rc_auth = auth;
860 if (nmp != NULL) {
861 ext.rc_feedback = nfs_feedback;
862 ext.rc_feedback_arg = &nf;
863 }
864
865 procnum = nd->nd_procnum;
866 if ((nd->nd_flag & ND_NFSV4) &&
867 nd->nd_procnum != NFSPROC_NULL &&
868 nd->nd_procnum != NFSV4PROC_CBCOMPOUND)
869 procnum = NFSV4PROC_COMPOUND;
870
871 if (nmp != NULL) {
872 NFSINCRGLOBAL(nfsstatsv1.rpcrequests);
873
874 /* Map the procnum to the old NFSv2 one, as required. */
875 if ((nd->nd_flag & ND_NFSV2) != 0) {
876 if (nd->nd_procnum < NFS_V3NPROCS)
877 procnum = nfsv2_procid[nd->nd_procnum];
878 else
879 procnum = NFSV2PROC_NOOP;
880 }
881
882 /*
883 * Now only used for the R_DONTRECOVER case, but until that is
884 * supported within the krpc code, I need to keep a queue of
885 * outstanding RPCs for nfsv4 client requests.
886 */
887 if ((nd->nd_flag & ND_NFSV4) && procnum == NFSV4PROC_COMPOUND)
888 rep = malloc(sizeof(struct nfsreq),
889 M_NFSDREQ, M_WAITOK);
890 #ifdef KDTRACE_HOOKS
891 if (dtrace_nfscl_nfs234_start_probe != NULL) {
892 uint32_t probe_id;
893 int probe_procnum;
894
895 if (nd->nd_flag & ND_NFSV4) {
896 probe_id =
897 nfscl_nfs4_start_probes[nd->nd_procnum];
898 probe_procnum = nd->nd_procnum;
899 } else if (nd->nd_flag & ND_NFSV3) {
900 probe_id = nfscl_nfs3_start_probes[procnum];
901 probe_procnum = procnum;
902 } else {
903 probe_id =
904 nfscl_nfs2_start_probes[nd->nd_procnum];
905 probe_procnum = procnum;
906 }
907 if (probe_id != 0)
908 (dtrace_nfscl_nfs234_start_probe)
909 (probe_id, vp, nd->nd_mreq, cred,
910 probe_procnum);
911 }
912 #endif
913 }
914 freeslot = -1; /* Set to slot that needs to be free'd */
915 tryagain:
916 slot = -1; /* Slot that needs a sequence# increment. */
917 /*
918 * This timeout specifies when a new socket should be created,
919 * along with new xid values. For UDP, this should be done
920 * infrequently, since retransmits of RPC requests should normally
921 * use the same xid.
922 */
923 if (nmp == NULL) {
924 if (clp == NULL) {
925 timo.tv_sec = NFSV4_UPCALLTIMEO;
926 timo.tv_usec = 0;
927 } else {
928 timo.tv_sec = NFSV4_CALLBACKTIMEO / 1000;
929 timo.tv_usec = NFSV4_CALLBACKTIMEO * 1000;
930 }
931 } else {
932 if (nrp->nr_sotype != SOCK_DGRAM) {
933 timo.tv_usec = 0;
934 if ((nmp->nm_flag & NFSMNT_NFSV4))
935 timo.tv_sec = INT_MAX;
936 else
937 timo.tv_sec = NFS_TCPTIMEO;
938 } else {
939 if (NFSHASSOFT(nmp)) {
940 /*
941 * CLSET_RETRIES is set to 2, so this should be
942 * half of the total timeout required.
943 */
944 timeo = nmp->nm_retry * nmp->nm_timeo / 2;
945 if (timeo < 1)
946 timeo = 1;
947 timo.tv_sec = timeo / NFS_HZ;
948 timo.tv_usec = (timeo % NFS_HZ) * 1000000 /
949 NFS_HZ;
950 } else {
951 /* For UDP hard mounts, use a large value. */
952 timo.tv_sec = NFS_MAXTIMEO / NFS_HZ;
953 timo.tv_usec = 0;
954 }
955 }
956
957 if (rep != NULL) {
958 rep->r_flags = 0;
959 rep->r_nmp = nmp;
960 /*
961 * Chain request into list of outstanding requests.
962 */
963 NFSLOCKREQ();
964 TAILQ_INSERT_TAIL(&nfsd_reqq, rep, r_chain);
965 NFSUNLOCKREQ();
966 }
967 }
968
969 nd->nd_mrep = NULL;
970 if (clp != NULL && sep != NULL)
971 stat = clnt_bck_call(nrp->nr_client, &ext, procnum,
972 nd->nd_mreq, &nd->nd_mrep, timo, sep->nfsess_xprt);
973 else if (nextconn_set)
974 /*
975 * When there are multiple TCP connections, send the
976 * RPCs with large messages on the alternate TCP
977 * connection(s) in a round robin fashion.
978 * The small RPC messages are sent on the default
979 * TCP connection because they do not require much
980 * network bandwidth and separating them from the
981 * large RPC messages avoids them getting "log jammed"
982 * behind several large RPC messages.
983 */
984 stat = CLNT_CALL_MBUF(nmp->nm_aconn[nextconn],
985 &ext, procnum, nd->nd_mreq, &nd->nd_mrep, timo);
986 else
987 stat = CLNT_CALL_MBUF(nrp->nr_client, &ext, procnum,
988 nd->nd_mreq, &nd->nd_mrep, timo);
989 NFSCL_DEBUG(2, "clnt call=%d\n", stat);
990
991 if (rep != NULL) {
992 /*
993 * RPC done, unlink the request.
994 */
995 NFSLOCKREQ();
996 TAILQ_REMOVE(&nfsd_reqq, rep, r_chain);
997 NFSUNLOCKREQ();
998 }
999
1000 /*
1001 * If there was a successful reply and a tprintf msg.
1002 * tprintf a response.
1003 */
1004 if (stat == RPC_SUCCESS) {
1005 error = 0;
1006 } else if (stat == RPC_TIMEDOUT) {
1007 NFSINCRGLOBAL(nfsstatsv1.rpctimeouts);
1008 error = ETIMEDOUT;
1009 } else if (stat == RPC_VERSMISMATCH) {
1010 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1011 error = EOPNOTSUPP;
1012 } else if (stat == RPC_PROGVERSMISMATCH) {
1013 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1014 error = EPROTONOSUPPORT;
1015 } else if (stat == RPC_CANTSEND || stat == RPC_CANTRECV ||
1016 stat == RPC_SYSTEMERROR || stat == RPC_INTR) {
1017 /* Check for a session slot that needs to be free'd. */
1018 if ((nd->nd_flag & (ND_NFSV41 | ND_HASSLOTID)) ==
1019 (ND_NFSV41 | ND_HASSLOTID) && nmp != NULL &&
1020 nd->nd_procnum != NFSPROC_NULL) {
1021 /*
1022 * This should only occur when either the MDS or
1023 * a client has an RPC against a DS fail.
1024 * This happens because these cases use "soft"
1025 * connections that can time out and fail.
1026 * The slot used for this RPC is now in a
1027 * non-deterministic state, but if the slot isn't
1028 * free'd, threads can get stuck waiting for a slot.
1029 */
1030 if (sep == NULL)
1031 sep = nfsmnt_mdssession(nmp);
1032 /*
1033 * Bump the sequence# out of range, so that reuse of
1034 * this slot will result in an NFSERR_SEQMISORDERED
1035 * error and not a bogus cached RPC reply.
1036 */
1037 mtx_lock(&sep->nfsess_mtx);
1038 sep->nfsess_slotseq[nd->nd_slotid] += 10;
1039 sep->nfsess_badslots |= (0x1ULL << nd->nd_slotid);
1040 mtx_unlock(&sep->nfsess_mtx);
1041 /* And free the slot. */
1042 nfsv4_freeslot(sep, nd->nd_slotid, true);
1043 }
1044 if (stat == RPC_INTR)
1045 error = EINTR;
1046 else {
1047 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1048 error = ENXIO;
1049 }
1050 } else if (stat == RPC_AUTHERROR) {
1051 /* Check for a session slot that needs to be free'd. */
1052 if ((nd->nd_flag & (ND_NFSV41 | ND_HASSLOTID)) ==
1053 (ND_NFSV41 | ND_HASSLOTID) && nmp != NULL &&
1054 nd->nd_procnum != NFSPROC_NULL) {
1055 /*
1056 * This can occur when a Kerberos/RPCSEC_GSS session
1057 * expires, due to TGT expiration.
1058 * Free the slot, resetting the slot's sequence#.
1059 */
1060 if (sep == NULL)
1061 sep = nfsmnt_mdssession(nmp);
1062 nfsv4_freeslot(sep, nd->nd_slotid, true);
1063 }
1064 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1065 error = EACCES;
1066 } else {
1067 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1068 error = EACCES;
1069 }
1070 if (error) {
1071 m_freem(nd->nd_mreq);
1072 if (usegssname == 0)
1073 AUTH_DESTROY(auth);
1074 if (rep != NULL)
1075 free(rep, M_NFSDREQ);
1076 if (set_sigset)
1077 newnfs_restore_sigmask(td, &oldset);
1078 return (error);
1079 }
1080
1081 KASSERT(nd->nd_mrep != NULL, ("mrep shouldn't be NULL if no error\n"));
1082
1083 /*
1084 * Search for any mbufs that are not a multiple of 4 bytes long
1085 * or with m_data not longword aligned.
1086 * These could cause pointer alignment problems, so copy them to
1087 * well aligned mbufs.
1088 */
1089 newnfs_realign(&nd->nd_mrep, M_WAITOK);
1090 nd->nd_md = nd->nd_mrep;
1091 nd->nd_dpos = mtod(nd->nd_md, caddr_t);
1092 nd->nd_repstat = 0;
1093 if (nd->nd_procnum != NFSPROC_NULL &&
1094 nd->nd_procnum != NFSV4PROC_CBNULL) {
1095 /* If sep == NULL, set it to the default in nmp. */
1096 if (sep == NULL && nmp != NULL)
1097 sep = nfsmnt_mdssession(nmp);
1098 /*
1099 * and now the actual NFS xdr.
1100 */
1101 NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1102 nd->nd_repstat = fxdr_unsigned(u_int32_t, *tl);
1103 if (nd->nd_repstat >= 10000)
1104 NFSCL_DEBUG(1, "proc=%d reps=%d\n", (int)nd->nd_procnum,
1105 (int)nd->nd_repstat);
1106
1107 /*
1108 * Get rid of the tag, return count and SEQUENCE result for
1109 * NFSv4.
1110 */
1111 if ((nd->nd_flag & ND_NFSV4) != 0 && nd->nd_repstat !=
1112 NFSERR_MINORVERMISMATCH) {
1113 NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1114 i = fxdr_unsigned(int, *tl);
1115 error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
1116 if (error)
1117 goto nfsmout;
1118 NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1119 opcnt = fxdr_unsigned(int, *tl++);
1120 i = fxdr_unsigned(int, *tl++);
1121 j = fxdr_unsigned(int, *tl);
1122 if (j >= 10000)
1123 NFSCL_DEBUG(1, "fop=%d fst=%d\n", i, j);
1124 /*
1125 * If the first op is Sequence, free up the slot.
1126 */
1127 if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j != 0) ||
1128 (clp != NULL && i == NFSV4OP_CBSEQUENCE && j != 0)) {
1129 NFSCL_DEBUG(1, "failed seq=%d\n", j);
1130 if (sep != NULL && i == NFSV4OP_SEQUENCE &&
1131 j == NFSERR_SEQMISORDERED) {
1132 mtx_lock(&sep->nfsess_mtx);
1133 sep->nfsess_badslots |=
1134 (0x1ULL << nd->nd_slotid);
1135 mtx_unlock(&sep->nfsess_mtx);
1136 }
1137 }
1138 if (((nmp != NULL && i == NFSV4OP_SEQUENCE && j == 0) ||
1139 (clp != NULL && i == NFSV4OP_CBSEQUENCE &&
1140 j == 0)) && sep != NULL) {
1141 if (i == NFSV4OP_SEQUENCE)
1142 NFSM_DISSECT(tl, uint32_t *,
1143 NFSX_V4SESSIONID +
1144 5 * NFSX_UNSIGNED);
1145 else
1146 NFSM_DISSECT(tl, uint32_t *,
1147 NFSX_V4SESSIONID +
1148 4 * NFSX_UNSIGNED);
1149 mtx_lock(&sep->nfsess_mtx);
1150 if (bcmp(tl, sep->nfsess_sessionid,
1151 NFSX_V4SESSIONID) == 0) {
1152 tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
1153 retseq = fxdr_unsigned(uint32_t, *tl++);
1154 slot = fxdr_unsigned(int, *tl++);
1155 if ((nd->nd_flag & ND_HASSLOTID) != 0) {
1156 if (slot >= NFSV4_SLOTS ||
1157 (i == NFSV4OP_CBSEQUENCE &&
1158 slot >= NFSV4_CBSLOTS)) {
1159 printf("newnfs_request:"
1160 " Bogus slot\n");
1161 slot = nd->nd_slotid;
1162 } else if (slot !=
1163 nd->nd_slotid) {
1164 printf("newnfs_request:"
1165 " Wrong session "
1166 "srvslot=%d "
1167 "slot=%d\n", slot,
1168 nd->nd_slotid);
1169 if (i == NFSV4OP_SEQUENCE) {
1170 /*
1171 * Mark both slots as
1172 * bad, because we do
1173 * not know if the
1174 * server has advanced
1175 * the sequence# for
1176 * either of them.
1177 */
1178 sep->nfsess_badslots |=
1179 (0x1ULL << slot);
1180 sep->nfsess_badslots |=
1181 (0x1ULL <<
1182 nd->nd_slotid);
1183 }
1184 slot = nd->nd_slotid;
1185 }
1186 freeslot = slot;
1187 } else if (slot != 0) {
1188 printf("newnfs_request: Bad "
1189 "session slot=%d\n", slot);
1190 slot = 0;
1191 }
1192 if (retseq != sep->nfsess_slotseq[slot])
1193 printf("retseq diff 0x%x\n",
1194 retseq);
1195 retval0 = fxdr_unsigned(uint32_t,*tl++);
1196 retval = fxdr_unsigned(uint32_t, *tl);
1197 if ((retval + 1) < sep->nfsess_foreslots
1198 ) {
1199 sep->nfsess_foreslots = (retval
1200 + 1);
1201 nfs_resetslots(sep);
1202 } else if ((retval + 1) >
1203 sep->nfsess_foreslots) {
1204 if (retval0 > retval)
1205 printf("Sess:highest > "
1206 "target_highest\n");
1207 sep->nfsess_foreslots =
1208 (retval < NFSV4_SLOTS) ?
1209 (retval + 1) : NFSV4_SLOTS;
1210 }
1211 }
1212 mtx_unlock(&sep->nfsess_mtx);
1213
1214 /* Grab the op and status for the next one. */
1215 if (opcnt > 1) {
1216 NFSM_DISSECT(tl, uint32_t *,
1217 2 * NFSX_UNSIGNED);
1218 i = fxdr_unsigned(int, *tl++);
1219 j = fxdr_unsigned(int, *tl);
1220 }
1221 }
1222 }
1223 if (nd->nd_repstat != 0) {
1224 if (nd->nd_repstat == NFSERR_BADSESSION &&
1225 nmp != NULL && dssep == NULL &&
1226 (nd->nd_flag & ND_NFSV41) != 0) {
1227 /*
1228 * If this is a client side MDS RPC, mark
1229 * the MDS session defunct and initiate
1230 * recovery, as required.
1231 * The nfsess_defunct field is protected by
1232 * the NFSLOCKMNT()/nm_mtx lock and not the
1233 * nfsess_mtx lock to simplify its handling,
1234 * for the MDS session. This lock is also
1235 * sufficient for nfsess_sessionid, since it
1236 * never changes in the structure.
1237 */
1238 NFSCL_DEBUG(1, "Got badsession\n");
1239 NFSLOCKCLSTATE();
1240 NFSLOCKMNT(nmp);
1241 if (TAILQ_EMPTY(&nmp->nm_sess)) {
1242 NFSUNLOCKMNT(nmp);
1243 NFSUNLOCKCLSTATE();
1244 printf("If server has not rebooted, "
1245 "check NFS clients for unique "
1246 "/etc/hostid's\n");
1247 goto out;
1248 }
1249 sep = NFSMNT_MDSSESSION(nmp);
1250 if (bcmp(sep->nfsess_sessionid, nd->nd_sequence,
1251 NFSX_V4SESSIONID) == 0) {
1252 printf("Initiate recovery. If server "
1253 "has not rebooted, "
1254 "check NFS clients for unique "
1255 "/etc/hostid's\n");
1256 /* Initiate recovery. */
1257 sep->nfsess_defunct = 1;
1258 NFSCL_DEBUG(1, "Marked defunct\n");
1259 if (nmp->nm_clp != NULL) {
1260 nmp->nm_clp->nfsc_flags |=
1261 NFSCLFLAGS_RECOVER;
1262 wakeup(nmp->nm_clp);
1263 }
1264 }
1265 NFSUNLOCKCLSTATE();
1266 /*
1267 * Sleep for up to 1sec waiting for a new
1268 * session.
1269 */
1270 mtx_sleep(&nmp->nm_sess, &nmp->nm_mtx, PZERO,
1271 "nfsbadsess", hz);
1272 /*
1273 * Get the session again, in case a new one
1274 * has been created during the sleep.
1275 */
1276 sep = NFSMNT_MDSSESSION(nmp);
1277 NFSUNLOCKMNT(nmp);
1278 if ((nd->nd_flag & ND_LOOPBADSESS) != 0) {
1279 reterr = nfsv4_sequencelookup(nmp, sep,
1280 &slotpos, &maxslot, &slotseq,
1281 sessionid, true);
1282 if (reterr == 0) {
1283 /* Fill in new session info. */
1284 NFSCL_DEBUG(1,
1285 "Filling in new sequence\n");
1286 tl = nd->nd_sequence;
1287 bcopy(sessionid, tl,
1288 NFSX_V4SESSIONID);
1289 tl += NFSX_V4SESSIONID /
1290 NFSX_UNSIGNED;
1291 *tl++ = txdr_unsigned(slotseq);
1292 *tl++ = txdr_unsigned(slotpos);
1293 *tl = txdr_unsigned(maxslot);
1294 nd->nd_slotid = slotpos;
1295 nd->nd_flag |= ND_HASSLOTID;
1296 }
1297 if (reterr == NFSERR_BADSESSION ||
1298 reterr == 0) {
1299 NFSCL_DEBUG(1,
1300 "Badsession looping\n");
1301 m_freem(nd->nd_mrep);
1302 nd->nd_mrep = NULL;
1303 goto tryagain;
1304 }
1305 nd->nd_repstat = reterr;
1306 NFSCL_DEBUG(1, "Got err=%d\n", reterr);
1307 }
1308 }
1309 /*
1310 * When clp != NULL, it is a callback and all
1311 * callback operations can be retried for NFSERR_DELAY.
1312 */
1313 if (((nd->nd_repstat == NFSERR_DELAY ||
1314 nd->nd_repstat == NFSERR_GRACE) &&
1315 (nd->nd_flag & ND_NFSV4) && (clp != NULL ||
1316 (nd->nd_procnum != NFSPROC_DELEGRETURN &&
1317 nd->nd_procnum != NFSPROC_SETATTR &&
1318 nd->nd_procnum != NFSPROC_READ &&
1319 nd->nd_procnum != NFSPROC_READDS &&
1320 nd->nd_procnum != NFSPROC_WRITE &&
1321 nd->nd_procnum != NFSPROC_WRITEDS &&
1322 nd->nd_procnum != NFSPROC_OPEN &&
1323 nd->nd_procnum != NFSPROC_OPENLAYGET &&
1324 nd->nd_procnum != NFSPROC_CREATE &&
1325 nd->nd_procnum != NFSPROC_CREATELAYGET &&
1326 nd->nd_procnum != NFSPROC_OPENCONFIRM &&
1327 nd->nd_procnum != NFSPROC_OPENDOWNGRADE &&
1328 nd->nd_procnum != NFSPROC_CLOSE &&
1329 nd->nd_procnum != NFSPROC_LOCK &&
1330 nd->nd_procnum != NFSPROC_LOCKU))) ||
1331 (nd->nd_repstat == NFSERR_DELAY &&
1332 (nd->nd_flag & ND_NFSV4) == 0) ||
1333 nd->nd_repstat == NFSERR_RESOURCE ||
1334 nd->nd_repstat == NFSERR_RETRYUNCACHEDREP) {
1335 /* Clip at NFS_TRYLATERDEL. */
1336 if (timespeccmp(&trylater_delay,
1337 &nfs_trylater_max, >))
1338 trylater_delay = nfs_trylater_max;
1339 getnanouptime(&waituntil);
1340 timespecadd(&waituntil, &trylater_delay,
1341 &waituntil);
1342 do {
1343 nfs_catnap(PZERO, 0, "nfstry");
1344 getnanouptime(&ts);
1345 } while (timespeccmp(&ts, &waituntil, <));
1346 timespecadd(&trylater_delay, &trylater_delay,
1347 &trylater_delay); /* Double each time. */
1348 if (slot != -1) {
1349 mtx_lock(&sep->nfsess_mtx);
1350 sep->nfsess_slotseq[slot]++;
1351 *nd->nd_slotseq = txdr_unsigned(
1352 sep->nfsess_slotseq[slot]);
1353 mtx_unlock(&sep->nfsess_mtx);
1354 }
1355 m_freem(nd->nd_mrep);
1356 nd->nd_mrep = NULL;
1357 goto tryagain;
1358 }
1359
1360 /*
1361 * If the File Handle was stale, invalidate the
1362 * lookup cache, just in case.
1363 * (vp != NULL implies a client side call)
1364 */
1365 if (nd->nd_repstat == ESTALE && vp != NULL) {
1366 cache_purge(vp);
1367 if (ncl_call_invalcaches != NULL)
1368 (*ncl_call_invalcaches)(vp);
1369 }
1370 }
1371 if ((nd->nd_flag & ND_NFSV4) != 0) {
1372 /* Free the slot, as required. */
1373 if (freeslot != -1)
1374 nfsv4_freeslot(sep, freeslot, false);
1375 /*
1376 * If this op is Putfh, throw its results away.
1377 */
1378 if (j >= 10000)
1379 NFSCL_DEBUG(1, "nop=%d nst=%d\n", i, j);
1380 if (nmp != NULL && i == NFSV4OP_PUTFH && j == 0) {
1381 NFSM_DISSECT(tl,u_int32_t *,2 * NFSX_UNSIGNED);
1382 i = fxdr_unsigned(int, *tl++);
1383 j = fxdr_unsigned(int, *tl);
1384 if (j >= 10000)
1385 NFSCL_DEBUG(1, "n2op=%d n2st=%d\n", i,
1386 j);
1387 /*
1388 * All Compounds that do an Op that must
1389 * be in sequence consist of NFSV4OP_PUTFH
1390 * followed by one of these. As such, we
1391 * can determine if the seqid# should be
1392 * incremented, here.
1393 */
1394 if ((i == NFSV4OP_OPEN ||
1395 i == NFSV4OP_OPENCONFIRM ||
1396 i == NFSV4OP_OPENDOWNGRADE ||
1397 i == NFSV4OP_CLOSE ||
1398 i == NFSV4OP_LOCK ||
1399 i == NFSV4OP_LOCKU) &&
1400 (j == 0 ||
1401 (j != NFSERR_STALECLIENTID &&
1402 j != NFSERR_STALESTATEID &&
1403 j != NFSERR_BADSTATEID &&
1404 j != NFSERR_BADSEQID &&
1405 j != NFSERR_BADXDR &&
1406 j != NFSERR_RESOURCE &&
1407 j != NFSERR_NOFILEHANDLE)))
1408 nd->nd_flag |= ND_INCRSEQID;
1409 }
1410 /*
1411 * If this op's status is non-zero, mark
1412 * that there is no more data to process.
1413 * The exception is Setattr, which always has xdr
1414 * when it has failed.
1415 */
1416 if (j != 0 && i != NFSV4OP_SETATTR)
1417 nd->nd_flag |= ND_NOMOREDATA;
1418
1419 /*
1420 * If R_DONTRECOVER is set, replace the stale error
1421 * reply, so that recovery isn't initiated.
1422 */
1423 if ((nd->nd_repstat == NFSERR_STALECLIENTID ||
1424 nd->nd_repstat == NFSERR_BADSESSION ||
1425 nd->nd_repstat == NFSERR_STALESTATEID) &&
1426 rep != NULL && (rep->r_flags & R_DONTRECOVER))
1427 nd->nd_repstat = NFSERR_STALEDONTRECOVER;
1428 }
1429 }
1430 out:
1431
1432 #ifdef KDTRACE_HOOKS
1433 if (nmp != NULL && dtrace_nfscl_nfs234_done_probe != NULL) {
1434 uint32_t probe_id;
1435 int probe_procnum;
1436
1437 if (nd->nd_flag & ND_NFSV4) {
1438 probe_id = nfscl_nfs4_done_probes[nd->nd_procnum];
1439 probe_procnum = nd->nd_procnum;
1440 } else if (nd->nd_flag & ND_NFSV3) {
1441 probe_id = nfscl_nfs3_done_probes[procnum];
1442 probe_procnum = procnum;
1443 } else {
1444 probe_id = nfscl_nfs2_done_probes[nd->nd_procnum];
1445 probe_procnum = procnum;
1446 }
1447 if (probe_id != 0)
1448 (dtrace_nfscl_nfs234_done_probe)(probe_id, vp,
1449 nd->nd_mreq, cred, probe_procnum, 0);
1450 }
1451 #endif
1452
1453 m_freem(nd->nd_mreq);
1454 if (usegssname == 0)
1455 AUTH_DESTROY(auth);
1456 if (rep != NULL)
1457 free(rep, M_NFSDREQ);
1458 if (set_sigset)
1459 newnfs_restore_sigmask(td, &oldset);
1460 return (0);
1461 nfsmout:
1462 m_freem(nd->nd_mrep);
1463 m_freem(nd->nd_mreq);
1464 if (usegssname == 0)
1465 AUTH_DESTROY(auth);
1466 if (rep != NULL)
1467 free(rep, M_NFSDREQ);
1468 if (set_sigset)
1469 newnfs_restore_sigmask(td, &oldset);
1470 return (error);
1471 }
1472
1473 /*
1474 * Reset slots above nfsess_foreslots that are not busy.
1475 */
1476 void
nfs_resetslots(struct nfsclsession * sep)1477 nfs_resetslots(struct nfsclsession *sep)
1478 {
1479 int i;
1480 uint64_t bitval;
1481
1482 mtx_assert(&sep->nfsess_mtx, MA_OWNED);
1483 bitval = (1 << sep->nfsess_foreslots);
1484 for (i = sep->nfsess_foreslots; i < NFSV4_SLOTS; i++) {
1485 if ((sep->nfsess_slots & bitval) == 0 &&
1486 (sep->nfsess_badslots & bitval) == 0)
1487 sep->nfsess_slotseq[i] = 0;
1488 bitval <<= 1;
1489 }
1490 }
1491
1492 /*
1493 * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1494 * wait for all requests to complete. This is used by forced unmounts
1495 * to terminate any outstanding RPCs.
1496 */
1497 int
newnfs_nmcancelreqs(struct nfsmount * nmp)1498 newnfs_nmcancelreqs(struct nfsmount *nmp)
1499 {
1500 struct nfsclds *dsp;
1501 struct __rpc_client *cl;
1502 int i;
1503
1504 if (nmp->nm_sockreq.nr_client != NULL)
1505 CLNT_CLOSE(nmp->nm_sockreq.nr_client);
1506 for (i = 0; i < nmp->nm_aconnect; i++)
1507 if (nmp->nm_aconn[i] != NULL)
1508 CLNT_CLOSE(nmp->nm_aconn[i]);
1509 lookformore:
1510 NFSLOCKMNT(nmp);
1511 TAILQ_FOREACH(dsp, &nmp->nm_sess, nfsclds_list) {
1512 NFSLOCKDS(dsp);
1513 if (dsp != TAILQ_FIRST(&nmp->nm_sess) &&
1514 (dsp->nfsclds_flags & NFSCLDS_CLOSED) == 0 &&
1515 dsp->nfsclds_sockp != NULL &&
1516 dsp->nfsclds_sockp->nr_client != NULL) {
1517 dsp->nfsclds_flags |= NFSCLDS_CLOSED;
1518 cl = dsp->nfsclds_sockp->nr_client;
1519 NFSUNLOCKDS(dsp);
1520 NFSUNLOCKMNT(nmp);
1521 CLNT_CLOSE(cl);
1522 goto lookformore;
1523 }
1524 NFSUNLOCKDS(dsp);
1525 }
1526 NFSUNLOCKMNT(nmp);
1527 return (0);
1528 }
1529
/*
 * The set of signals that may interrupt an NFS operation on an intr
 * mount.  Add any further such signal to this array.
 * Note that SIGSTOP and SIGKILL cannot be masked.
 */
int newnfs_sig_set[] = {
	SIGINT, SIGTERM, SIGHUP, SIGKILL, SIGQUIT,
};
1541
1542 /*
1543 * Check to see if one of the signals in our subset is pending on
1544 * the process (in an intr mount).
1545 */
1546 static int
nfs_sig_pending(sigset_t set)1547 nfs_sig_pending(sigset_t set)
1548 {
1549 int i;
1550
1551 for (i = 0 ; i < nitems(newnfs_sig_set); i++)
1552 if (SIGISMEMBER(set, newnfs_sig_set[i]))
1553 return (1);
1554 return (0);
1555 }
1556
1557 /*
1558 * The set/restore sigmask functions are used to (temporarily) overwrite
1559 * the thread td_sigmask during an RPC call (for example). These are also
1560 * used in other places in the NFS client that might tsleep().
1561 */
1562 void
newnfs_set_sigmask(struct thread * td,sigset_t * oldset)1563 newnfs_set_sigmask(struct thread *td, sigset_t *oldset)
1564 {
1565 sigset_t newset;
1566 int i;
1567 struct proc *p;
1568
1569 SIGFILLSET(newset);
1570 if (td == NULL)
1571 td = curthread; /* XXX */
1572 p = td->td_proc;
1573 /* Remove the NFS set of signals from newset */
1574 PROC_LOCK(p);
1575 mtx_lock(&p->p_sigacts->ps_mtx);
1576 for (i = 0 ; i < nitems(newnfs_sig_set); i++) {
1577 /*
1578 * But make sure we leave the ones already masked
1579 * by the process, ie. remove the signal from the
1580 * temporary signalmask only if it wasn't already
1581 * in p_sigmask.
1582 */
1583 if (!SIGISMEMBER(td->td_sigmask, newnfs_sig_set[i]) &&
1584 !SIGISMEMBER(p->p_sigacts->ps_sigignore, newnfs_sig_set[i]))
1585 SIGDELSET(newset, newnfs_sig_set[i]);
1586 }
1587 mtx_unlock(&p->p_sigacts->ps_mtx);
1588 kern_sigprocmask(td, SIG_SETMASK, &newset, oldset,
1589 SIGPROCMASK_PROC_LOCKED);
1590 PROC_UNLOCK(p);
1591 }
1592
1593 void
newnfs_restore_sigmask(struct thread * td,sigset_t * set)1594 newnfs_restore_sigmask(struct thread *td, sigset_t *set)
1595 {
1596 if (td == NULL)
1597 td = curthread; /* XXX */
1598 kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
1599 }
1600
1601 /*
1602 * NFS wrapper to msleep(), that shoves a new p_sigmask and restores the
1603 * old one after msleep() returns.
1604 */
1605 int
newnfs_msleep(struct thread * td,void * ident,struct mtx * mtx,int priority,char * wmesg,int timo)1606 newnfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority, char *wmesg, int timo)
1607 {
1608 sigset_t oldset;
1609 int error;
1610
1611 if ((priority & PCATCH) == 0)
1612 return msleep(ident, mtx, priority, wmesg, timo);
1613 if (td == NULL)
1614 td = curthread; /* XXX */
1615 newnfs_set_sigmask(td, &oldset);
1616 error = msleep(ident, mtx, priority, wmesg, timo);
1617 newnfs_restore_sigmask(td, &oldset);
1618 return (error);
1619 }
1620
1621 /*
1622 * Test for a termination condition pending on the process.
1623 * This is used for NFSMNT_INT mounts.
1624 */
1625 int
newnfs_sigintr(struct nfsmount * nmp,struct thread * td)1626 newnfs_sigintr(struct nfsmount *nmp, struct thread *td)
1627 {
1628 struct proc *p;
1629 sigset_t tmpset;
1630
1631 /* Terminate all requests while attempting a forced unmount. */
1632 if (NFSCL_FORCEDISM(nmp->nm_mountp))
1633 return (EIO);
1634 if (!(nmp->nm_flag & NFSMNT_INT))
1635 return (0);
1636 if (td == NULL)
1637 return (0);
1638 p = td->td_proc;
1639 PROC_LOCK(p);
1640 tmpset = p->p_siglist;
1641 SIGSETOR(tmpset, td->td_siglist);
1642 SIGSETNAND(tmpset, td->td_sigmask);
1643 mtx_lock(&p->p_sigacts->ps_mtx);
1644 SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
1645 mtx_unlock(&p->p_sigacts->ps_mtx);
1646 if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist))
1647 && nfs_sig_pending(tmpset)) {
1648 PROC_UNLOCK(p);
1649 return (EINTR);
1650 }
1651 PROC_UNLOCK(p);
1652 return (0);
1653 }
1654
1655 static int
nfs_msg(struct thread * td,const char * server,const char * msg,int error)1656 nfs_msg(struct thread *td, const char *server, const char *msg, int error)
1657 {
1658 struct proc *p;
1659
1660 p = td ? td->td_proc : NULL;
1661 if (error) {
1662 tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n",
1663 server, msg, error);
1664 } else {
1665 tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
1666 }
1667 return (0);
1668 }
1669
1670 static void
nfs_down(struct nfsmount * nmp,struct thread * td,const char * msg,int error,int flags)1671 nfs_down(struct nfsmount *nmp, struct thread *td, const char *msg,
1672 int error, int flags)
1673 {
1674 if (nmp == NULL)
1675 return;
1676 mtx_lock(&nmp->nm_mtx);
1677 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
1678 nmp->nm_state |= NFSSTA_TIMEO;
1679 mtx_unlock(&nmp->nm_mtx);
1680 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1681 VQ_NOTRESP, 0);
1682 } else
1683 mtx_unlock(&nmp->nm_mtx);
1684 mtx_lock(&nmp->nm_mtx);
1685 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1686 nmp->nm_state |= NFSSTA_LOCKTIMEO;
1687 mtx_unlock(&nmp->nm_mtx);
1688 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1689 VQ_NOTRESPLOCK, 0);
1690 } else
1691 mtx_unlock(&nmp->nm_mtx);
1692 nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
1693 }
1694
1695 static void
nfs_up(struct nfsmount * nmp,struct thread * td,const char * msg,int flags,int tprintfmsg)1696 nfs_up(struct nfsmount *nmp, struct thread *td, const char *msg,
1697 int flags, int tprintfmsg)
1698 {
1699 if (nmp == NULL)
1700 return;
1701 if (tprintfmsg) {
1702 nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
1703 }
1704
1705 mtx_lock(&nmp->nm_mtx);
1706 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
1707 nmp->nm_state &= ~NFSSTA_TIMEO;
1708 mtx_unlock(&nmp->nm_mtx);
1709 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1710 VQ_NOTRESP, 1);
1711 } else
1712 mtx_unlock(&nmp->nm_mtx);
1713
1714 mtx_lock(&nmp->nm_mtx);
1715 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1716 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
1717 mtx_unlock(&nmp->nm_mtx);
1718 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1719 VQ_NOTRESPLOCK, 1);
1720 } else
1721 mtx_unlock(&nmp->nm_mtx);
1722 }
1723