1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1991, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Rick Macklem at The University of Guelph.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35
36 #include <sys/cdefs.h>
37 /*
38 * Socket operations for use by nfs
39 */
40
41 #include "opt_kgssapi.h"
42 #include "opt_nfs.h"
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
47 #include <sys/limits.h>
48 #include <sys/lock.h>
49 #include <sys/malloc.h>
50 #include <sys/mbuf.h>
51 #include <sys/mount.h>
52 #include <sys/mutex.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/syscallsubr.h>
56 #include <sys/sysctl.h>
57 #include <sys/syslog.h>
58 #include <sys/vnode.h>
59
60 #include <rpc/rpc.h>
61 #include <rpc/krpc.h>
62
63 #include <kgssapi/krb5/kcrypto.h>
64
65 #include <fs/nfs/nfsport.h>
66
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>

/*
 * DTrace hook called by newnfs_request() before an RPC is issued.  It is
 * only invoked when the pointer is non-NULL and the probe id looked up for
 * the RPC is non-zero.
 */
dtrace_nfsclient_nfs23_start_probe_func_t
		dtrace_nfscl_nfs234_start_probe;

/*
 * Companion "done" hook.
 * NOTE(review): its call site is not in the part of the file reviewed
 * here; presumably it fires once the RPC has completed -- confirm.
 */
dtrace_nfsclient_nfs23_done_probe_func_t
		dtrace_nfscl_nfs234_done_probe;

/*
 * Registered probes by RPC type.
 *
 * One start/done table per NFS version.  newnfs_request() indexes the
 * NFSv2 and NFSv4 start tables with nd_procnum and the NFSv3 start table
 * with the (possibly remapped) procnum; an entry of 0 means that no probe
 * is registered and the hook is skipped.
 *
 * NOTE(review): every table has NFSV41_NPROCS + 1 slots, whereas
 * nfscl_use_gss[] in this file is sized NFSV42_NPROCS.  Confirm that
 * nd_procnum can never exceed NFSV41_NPROCS when these tables are indexed.
 */
uint32_t	nfscl_nfs2_start_probes[NFSV41_NPROCS + 1];
uint32_t	nfscl_nfs2_done_probes[NFSV41_NPROCS + 1];

uint32_t	nfscl_nfs3_start_probes[NFSV41_NPROCS + 1];
uint32_t	nfscl_nfs3_done_probes[NFSV41_NPROCS + 1];

uint32_t	nfscl_nfs4_start_probes[NFSV41_NPROCS + 1];
uint32_t	nfscl_nfs4_done_probes[NFSV41_NPROCS + 1];
#endif
88
/*
 * Lock declarations.  Each of these is a macro supplied by the NFS port
 * headers; what it expands to is not visible in this file.
 */
NFSSTATESPINLOCK;
NFSREQSPINLOCK;
NFSDLOCKMUTEX;
NFSCLSTATEMUTEX;

/* State owned by other NFS source files. */
extern struct nfsstatsv1 nfsstatsv1;	/* RPC statistics counters */
extern struct nfsreqhead nfsd_reqq;	/* queue of outstanding NFSv4 RPCs */
extern int nfscl_ticks;
extern void (*ncl_call_invalcaches)(struct vnode *);
extern int nfs_numnfscbd;		/* guards nfscbd_pool, see newnfs_connect() */
extern int nfscl_debuglevel;
extern int nfsrv_lease;			/* lease_duration; used to derive DS timeouts */

/* Service pool handed to svc_vc_create_backchannel() by newnfs_connect(). */
SVCPOOL		*nfscbd_pool;
/* Socket buffer scale factor; newnfs_connect() clamps it to 2..64. */
int		nfs_bufpackets = 4;
/*
 * Non-zero allows RPCSEC_GSS on callbacks (tested in newnfs_request()).
 * NOTE(review): no assignment is visible in the part of the file reviewed
 * here, so it appears to stay 0 -- confirm.
 */
static int	nfsrv_gsscallbackson = 0;
/* Exported read-only as vfs.nfs.reconnects. */
static int	nfs_reconnects;
/* Exported as vfs.nfs.nfs3_jukebox_delay (seconds). */
static int	nfs3_jukebox_delay = 10;
/* Exported as vfs.nfs.skip_wcc_data_onerr. */
static int	nfs_skip_wcc_data_onerr = 1;
/* Retry limit applied by newnfs_connect() to DS connections (cred == NULL). */
static int	nfs_dsretries = 2;
/*
 * NOTE(review): presumably the ceiling for the "try later" back-off delay
 * used by newnfs_request(); its use is outside the part reviewed -- confirm.
 */
static struct timespec	nfs_trylater_max = {
	.tv_sec		= NFS_TRYLATERDEL,
	.tv_nsec	= 0,
};
112
/* Knobs and counters published under the vfs.nfs sysctl node. */
SYSCTL_DECL(_vfs_nfs);

/*
 * Read-write.  The stored value is not range checked here; newnfs_connect()
 * clamps the value it actually uses to 2..64 (bounds inclusive).
 */
SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
    "Buffer reservation size 2 < x < 64");
/* Read-only counter. */
SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
    "Number of times the nfs client has had to reconnect");
/* Read-write, in seconds. */
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
    "Number of seconds to delay a retry after receiving EJUKEBOX");
/* Read-write boolean-style flag. */
SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
    "Disable weak cache consistency checking when server returns an error");
/* Read-write; consumed by newnfs_connect() for connections to a DS. */
SYSCTL_INT(_vfs_nfs, OID_AUTO, dsretries, CTLFLAG_RW, &nfs_dsretries, 0,
    "Number of retries for a DS RPC before failure");
125
/*
 * Forward declarations of file-local message helpers.  nfs_down() and
 * nfs_up() are called from nfs_feedback() to report a server as
 * "not responding" / "is alive again".
 */
static void	nfs_down(struct nfsmount *, struct thread *, const char *,
    int, int);
static void	nfs_up(struct nfsmount *, struct thread *, const char *,
    int, int);
static int	nfs_msg(struct thread *, const char *, const char *, int);
131
/*
 * A reference counted RPC auth handle cached per uid.
 * NOTE(review): no user of this structure appears in the part of the file
 * reviewed here -- confirm it is still needed.
 */
struct nfs_cached_auth {
	int		ca_refs;     /* refcount, including 1 from the cache */
	uid_t		ca_uid;	     /* uid that corresponds to this auth */
	AUTH		*ca_auth;    /* RPC auth handle */
};
137
/*
 * Translation table from the procedure number carried in nd_procnum to the
 * NFSv2 procedure number that is put on the wire.  newnfs_request() uses it
 * for ND_NFSV2 requests when nd_procnum < NFS_V3NPROCS and falls back to
 * NFSV2PROC_NOOP otherwise.
 *
 * The per-entry annotations give the index followed by the NFSv3 procedure
 * that RFC 1813 assigns to that number.
 * NOTE(review): the names assume nd_procnum follows the RFC 1813 numbering
 * for indices 0..21 -- confirm against the NFSPROC_* definitions.
 */
static int	nfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,		/*  0: NULL */
	NFSV2PROC_GETATTR,	/*  1: GETATTR */
	NFSV2PROC_SETATTR,	/*  2: SETATTR */
	NFSV2PROC_LOOKUP,	/*  3: LOOKUP */
	NFSV2PROC_NOOP,		/*  4: ACCESS */
	NFSV2PROC_READLINK,	/*  5: READLINK */
	NFSV2PROC_READ,		/*  6: READ */
	NFSV2PROC_WRITE,	/*  7: WRITE */
	NFSV2PROC_CREATE,	/*  8: CREATE */
	NFSV2PROC_MKDIR,	/*  9: MKDIR */
	NFSV2PROC_SYMLINK,	/* 10: SYMLINK */
	NFSV2PROC_CREATE,	/* 11: MKNOD, sent as an NFSv2 CREATE */
	NFSV2PROC_REMOVE,	/* 12: REMOVE */
	NFSV2PROC_RMDIR,	/* 13: RMDIR */
	NFSV2PROC_RENAME,	/* 14: RENAME */
	NFSV2PROC_LINK,		/* 15: LINK */
	NFSV2PROC_READDIR,	/* 16: READDIR */
	NFSV2PROC_NOOP,		/* 17: READDIRPLUS */
	NFSV2PROC_STATFS,	/* 18: FSSTAT */
	NFSV2PROC_NOOP,		/* 19: FSINFO */
	NFSV2PROC_NOOP,		/* 20: PATHCONF */
	NFSV2PROC_NOOP,		/* 21: COMMIT */
};
162
/*
 * This static array indicates that a NFSv4 RPC should use
 * RPCSEC_GSS, if the mount indicates that via sec=krb5[ip].
 * System RPCs that do not use file handles will be false
 * in this array so that they will use AUTH_SYS when the
 * "syskrb5" mount option is specified, along with
 * "sec=krb5[ip]".
 *
 * The array is positional: newnfs_request() looks up
 * nfscl_use_gss[nd->nd_procnum], so an entry must be added in the right
 * place whenever a procedure number is added.  The numbers in the comments
 * below are the array indices.
 */
static bool nfscl_use_gss[NFSV42_NPROCS] = {
	/* 0 - 22 */
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	false,		/* 23: SetClientID */
	false,		/* 24: SetClientIDConfirm */
	/* 25 - 31 */
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	false,		/* 32: Renew */
	true,		/* 33 */
	false,		/* 34: ReleaseLockOwn */
	/* 35 - 40 */
	true,
	true,
	true,
	true,
	true,
	true,
	false,		/* 41: ExchangeID */
	false,		/* 42: CreateSession */
	false,		/* 43: DestroySession */
	false,		/* 44: DestroyClientID */
	false,		/* 45: FreeStateID */
	/* 46 - 49 */
	true,
	true,
	true,
	true,
	false,		/* 50: ReclaimComplete */
	/* 51 - 64 */
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	false,		/* 65: BindConnectionToSession */
	/* 66 - 70 */
	true,
	true,
	true,
	true,
	true,
};
244
/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 * Which arguments are set to NULL indicate what kind of call it is.
 * cred == NULL --> a call to connect to a pNFS DS
 * nmp == NULL --> indicates an upcall to userland or a NFSv4.0 callback
 *
 * Arguments:
 *   nmp	mount point, or NULL (see above).
 *   nrp	describes the peer (nr_nam, nr_sotype, nr_soproto, nr_prog,
 *		nr_vers); nr_cred, when set, is the credential used to
 *		create the socket and nr_mtx protects the store to *clipp.
 *   cred	credential used when nrp->nr_cred is NULL.
 *   p		not referenced by this function.
 *   callback_retry_mult
 *		only used when nmp == NULL: 0 selects the upcall retry count
 *		and a reserved port, any other value scales
 *		NFSV4_CALLBACKRETRY.
 *   dotls	only used when nmp == NULL: request TLS on the connection.
 *   clipp	where the new client handle is stored.  If it is already
 *		non-NULL by the time the handle is ready, the new handle is
 *		released and the existing one is left in place.
 *
 * Returns 0, or the error from socreate()/soreserve().  The results of the
 * CLNT_CONTROL() calls, including CLSET_CONNECT, are not checked.
 */
int
newnfs_connect(struct nfsmount *nmp, struct nfssockreq *nrp,
    struct ucred *cred, NFSPROC_T *p, int callback_retry_mult, bool dotls,
    struct __rpc_client **clipp)
{
	int rcvreserve, sndreserve;
	int pktscale, pktscalesav;
	struct sockaddr *saddr;
	struct ucred *origcred;
	CLIENT *client;
	struct netconfig *nconf;
	struct socket *so;
	int one = 1, retries, error = 0;
	struct thread *td = curthread;
	SVCXPRT *xprt;
	struct timeval timo;
	uint64_t tval;

	/*
	 * We need to establish the socket using the credentials of
	 * the mountpoint.  Some parts of this process (such as
	 * sobind() and soconnect()) will use the current thread's
	 * credential instead of the socket credential.  To work
	 * around this, temporarily change the current thread's
	 * credential to that of the mountpoint.
	 *
	 * XXX: It would be better to explicitly pass the correct
	 * credential to sobind() and soconnect().
	 */
	origcred = td->td_ucred;

	/*
	 * Use the credential in nr_cred, if not NULL.
	 */
	if (nrp->nr_cred != NULL)
		td->td_ucred = nrp->nr_cred;
	else
		td->td_ucred = cred;
	saddr = nrp->nr_nam;

	/*
	 * Pick the netconfig entry from the address family and socket type.
	 * Any family other than AF_INET is treated as IPv6.
	 */
	if (saddr->sa_family == AF_INET)
		if (nrp->nr_sotype == SOCK_DGRAM)
			nconf = getnetconfigent("udp");
		else
			nconf = getnetconfigent("tcp");
	else
		if (nrp->nr_sotype == SOCK_DGRAM)
			nconf = getnetconfigent("udp6");
		else
			nconf = getnetconfigent("tcp6");

	/* Clamp the tunable to 2..64 and remember the starting value. */
	pktscale = nfs_bufpackets;
	if (pktscale < 2)
		pktscale = 2;
	if (pktscale > 64)
		pktscale = 64;
	pktscalesav = pktscale;
	/*
	 * soreserve() can fail if sb_max is too small, so shrink pktscale
	 * and try again if there is an error.
	 * Print a log message suggesting increasing sb_max.
	 * Creating a socket and doing this is necessary since, if the
	 * reservation sizes are too large and will make soreserve() fail,
	 * the connection will work until a large send is attempted and
	 * then it will loop in the krpc code.
	 */
	so = NULL;
	saddr = NFSSOCKADDR(nrp->nr_nam, struct sockaddr *);
	/* This socket is only a probe for soreserve(); it is closed below. */
	error = socreate(saddr->sa_family, &so, nrp->nr_sotype,
	    nrp->nr_soproto, td->td_ucred, td);
	if (error != 0)
		goto out;
	do {
		/*
		 * error is 0 on the first pass, so this shrink step only
		 * runs after a failed soreserve().  The hint is printed
		 * once, on the first failure (pktscale still unchanged).
		 */
		if (error != 0 && pktscale > 2) {
			if (nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
			    pktscale == pktscalesav) {
				/*
				 * Suggest vfs.nfs.bufpackets * maximum RPC message,
				 * adjusted for the sb_max->sb_max_adj conversion of
				 * MCLBYTES / (MSIZE + MCLBYTES) as the minimum setting
				 * for kern.ipc.maxsockbuf.
				 */
				tval = (NFS_MAXBSIZE + NFS_MAXXDR) * nfs_bufpackets;
				tval *= MSIZE + MCLBYTES;
				tval += MCLBYTES - 1; /* Round up divide by MCLBYTES. */
				tval /= MCLBYTES;
				printf("Consider increasing kern.ipc.maxsockbuf to a "
				    "minimum of %ju to support %ubyte NFS I/O\n",
				    (uintmax_t)tval, NFS_MAXBSIZE);
			}
			pktscale--;
		}
		/*
		 * Size the reservation: per-packet size depends on the
		 * socket type for mounts; upcalls/callbacks (nmp == NULL)
		 * use 1024 bytes per packet.  Send and receive sizes are
		 * always equal.
		 */
		if (nrp->nr_sotype == SOCK_DGRAM) {
			if (nmp != NULL) {
				sndreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
				    pktscale;
				rcvreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) *
				    pktscale;
			} else {
				sndreserve = rcvreserve = 1024 * pktscale;
			}
		} else {
			if (nrp->nr_sotype != SOCK_STREAM)
				panic("nfscon sotype");
			if (nmp != NULL) {
				sndreserve = (NFS_MAXBSIZE + NFS_MAXXDR) *
				    pktscale;
				rcvreserve = (NFS_MAXBSIZE + NFS_MAXXDR) *
				    pktscale;
			} else {
				sndreserve = rcvreserve = 1024 * pktscale;
			}
		}
		error = soreserve(so, sndreserve, rcvreserve);
		/* Even the smallest scale failed: nothing left to shrink. */
		if (error != 0 && nmp != NULL && nrp->nr_sotype == SOCK_STREAM &&
		    pktscale <= 2)
			printf("Must increase kern.ipc.maxsockbuf or reduce"
			    " rsize, wsize\n");
	} while (error != 0 && pktscale > 2);
	soclose(so);
	if (error != 0)
		goto out;

	/* Create the real client handle with the sizes that were accepted. */
	client = clnt_reconnect_create(nconf, saddr, nrp->nr_prog,
	    nrp->nr_vers, sndreserve, rcvreserve);
	CLNT_CONTROL(client, CLSET_WAITCHAN, "nfsreq");
	if (nmp != NULL) {
		/* Mount case: apply the mount options to the handle. */
		if ((nmp->nm_flag & NFSMNT_INT))
			CLNT_CONTROL(client, CLSET_INTERRUPTIBLE, &one);
		if ((nmp->nm_flag & NFSMNT_RESVPORT))
			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
		if (NFSHASTLS(nmp)) {
			CLNT_CONTROL(client, CLSET_TLS, &one);
			if (nmp->nm_tlscertname != NULL)
				CLNT_CONTROL(client, CLSET_TLSCERTNAME,
				    nmp->nm_tlscertname);
		}
		if (NFSHASSOFT(nmp)) {
			if (nmp->nm_sotype == SOCK_DGRAM)
				/*
				 * For UDP, the large timeout for a reconnect
				 * will be set to "nm_retry * nm_timeo / 2", so
				 * we only want to do 2 reconnect timeout
				 * retries.
				 */
				retries = 2;
			else
				retries = nmp->nm_retry;
		} else
			retries = INT_MAX;
		if (NFSHASNFSV4N(nmp)) {
			if (cred != NULL) {
				if (NFSHASSOFT(nmp)) {
					/*
					 * This should be a DS mount.
					 * Use CLSET_TIMEOUT to set the timeout
					 * for connections to DSs instead of
					 * specifying a timeout on each RPC.
					 * This is done so that SO_SNDTIMEO
					 * is set on the TCP socket as well
					 * as specifying a time limit when
					 * waiting for an RPC reply.  Useful
					 * if the send queue for the TCP
					 * connection has become constipated,
					 * due to a failed DS.
					 * The choice of lease_duration / 4 is
					 * fairly arbitrary, but seems to work
					 * ok, with a lower bound of 10sec.
					 */
					timo.tv_sec = nfsrv_lease / 4;
					if (timo.tv_sec < 10)
						timo.tv_sec = 10;
					timo.tv_usec = 0;
					CLNT_CONTROL(client, CLSET_TIMEOUT,
					    &timo);
				}
				/*
				 * Make sure the nfscbd_pool doesn't get
				 * destroyed while doing this.
				 *
				 * nfs_numnfscbd is bumped while the NFSD
				 * lock is dropped for the backchannel
				 * creation, then restored; a waiter is woken
				 * if the count returns to 0.
				 */
				NFSD_LOCK();
				if (nfs_numnfscbd > 0) {
					nfs_numnfscbd++;
					NFSD_UNLOCK();
					xprt = svc_vc_create_backchannel(
					    nfscbd_pool);
					CLNT_CONTROL(client, CLSET_BACKCHANNEL,
					    xprt);
					NFSD_LOCK();
					nfs_numnfscbd--;
					if (nfs_numnfscbd == 0)
						wakeup(&nfs_numnfscbd);
				}
				NFSD_UNLOCK();
			} else {
				/*
				 * cred == NULL for a DS connect.
				 * For connects to a DS, set a retry limit
				 * so that failed DSs will be detected.
				 * This is ok for NFSv4.1, since a DS does
				 * not maintain open/lock state and is the
				 * only case where using a "soft" mount is
				 * recommended for NFSv4.
				 * For mounts from the MDS to DS, this is done
				 * via mount options, but that is not the case
				 * here.  The retry limit here can be adjusted
				 * via the sysctl vfs.nfs.dsretries.
				 * See the comment above w.r.t. timeout.
				 */
				timo.tv_sec = nfsrv_lease / 4;
				if (timo.tv_sec < 10)
					timo.tv_sec = 10;
				timo.tv_usec = 0;
				CLNT_CONTROL(client, CLSET_TIMEOUT, &timo);
				retries = nfs_dsretries;
			}
		}
	} else {
		/*
		 * Three cases:
		 * - Null RPC callback to client
		 * - Non-Null RPC callback to client, wait a little longer
		 * - upcalls to nfsuserd and gssd (clp == NULL)
		 */
		if (callback_retry_mult == 0) {
			retries = NFSV4_UPCALLRETRY;
			CLNT_CONTROL(client, CLSET_PRIVPORT, &one);
		} else {
			retries = NFSV4_CALLBACKRETRY * callback_retry_mult;
		}
		if (dotls)
			CLNT_CONTROL(client, CLSET_TLS, &one);
	}
	CLNT_CONTROL(client, CLSET_RETRIES, &retries);

	if (nmp != NULL) {
		/*
		 * For UDP, there are 2 timeouts:
		 * - CLSET_RETRY_TIMEOUT sets the initial timeout for the timer
		 *   that does a retransmit of an RPC request using the same
		 *   socket and xid. This is what you normally want to do,
		 *   since NFS servers depend on "same xid" for their
		 *   Duplicate Request Cache.
		 * - timeout specified in CLNT_CALL_MBUF(), which specifies when
		 *   retransmits on the same socket should fail and a fresh
		 *   socket created. Each of these timeouts counts as one
		 *   CLSET_RETRIES as set above.
		 * Set the initial retransmit timeout for UDP. This timeout
		 * doesn't exist for TCP and the following call just fails,
		 * which is ok.
		 */
		/* nm_timeo is in units of 1/NFS_HZ seconds. */
		timo.tv_sec = nmp->nm_timeo / NFS_HZ;
		timo.tv_usec = (nmp->nm_timeo % NFS_HZ) * 1000000 / NFS_HZ;
		CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, &timo);
	}

	/*
	 * *clipp is &nrp->nr_client or &nm_aconn[nmp->nm_nextaconn].
	 * The latter case is for additional connections specified by the
	 * "nconnect" mount option.  nr_mtx etc is used for these additional
	 * connections, as well as nr_client in the nfssockreq
	 * structure for the mount.
	 */
	mtx_lock(&nrp->nr_mtx);
	if (*clipp != NULL) {
		mtx_unlock(&nrp->nr_mtx);
		/*
		 * Someone else already connected.
		 */
		CLNT_RELEASE(client);
	} else {
		*clipp = client;
		/*
		 * Protocols that do not require connections may be optionally
		 * left unconnected for servers that reply from a port other
		 * than NFS_PORT.
		 */
		/* The mutex is always dropped before CLSET_CONNECT is issued. */
		if (nmp == NULL || (nmp->nm_flag & NFSMNT_NOCONN) == 0) {
			mtx_unlock(&nrp->nr_mtx);
			CLNT_CONTROL(client, CLSET_CONNECT, &one);
		} else
			mtx_unlock(&nrp->nr_mtx);
	}

out:
	/* Restore current thread's credentials. */
	td->td_ucred = origcred;

	NFSEXITCODE(error);
	return (error);
}
543
544 /*
545 * NFS disconnect. Clean up and unlink.
546 */
547 void
newnfs_disconnect(struct nfsmount * nmp,struct nfssockreq * nrp)548 newnfs_disconnect(struct nfsmount *nmp, struct nfssockreq *nrp)
549 {
550 CLIENT *client, *aconn[NFS_MAXNCONN - 1];
551 int i;
552
553 mtx_lock(&nrp->nr_mtx);
554 if (nrp->nr_client != NULL) {
555 client = nrp->nr_client;
556 nrp->nr_client = NULL;
557 if (nmp != NULL && nmp->nm_aconnect > 0) {
558 for (i = 0; i < nmp->nm_aconnect; i++) {
559 aconn[i] = nmp->nm_aconn[i];
560 nmp->nm_aconn[i] = NULL;
561 }
562 }
563 mtx_unlock(&nrp->nr_mtx);
564 rpc_gss_secpurge_call(client);
565 CLNT_CLOSE(client);
566 CLNT_RELEASE(client);
567 if (nmp != NULL && nmp->nm_aconnect > 0) {
568 for (i = 0; i < nmp->nm_aconnect; i++) {
569 if (aconn[i] != NULL) {
570 rpc_gss_secpurge_call(aconn[i]);
571 CLNT_CLOSE(aconn[i]);
572 CLNT_RELEASE(aconn[i]);
573 }
574 }
575 }
576 } else {
577 mtx_unlock(&nrp->nr_mtx);
578 }
579 }
580
581 static AUTH *
nfs_getauth(struct nfssockreq * nrp,int secflavour,char * clnt_principal,char * srv_principal,gss_OID mech_oid,struct ucred * cred)582 nfs_getauth(struct nfssockreq *nrp, int secflavour, char *clnt_principal,
583 char *srv_principal, gss_OID mech_oid, struct ucred *cred)
584 {
585 rpc_gss_service_t svc;
586 AUTH *auth;
587
588 switch (secflavour) {
589 case RPCSEC_GSS_KRB5:
590 case RPCSEC_GSS_KRB5I:
591 case RPCSEC_GSS_KRB5P:
592 if (!mech_oid) {
593 if (!rpc_gss_mech_to_oid_call("kerberosv5", &mech_oid))
594 return (NULL);
595 }
596 if (secflavour == RPCSEC_GSS_KRB5)
597 svc = rpc_gss_svc_none;
598 else if (secflavour == RPCSEC_GSS_KRB5I)
599 svc = rpc_gss_svc_integrity;
600 else
601 svc = rpc_gss_svc_privacy;
602
603 if (clnt_principal == NULL) {
604 NFSCL_DEBUG(1, "nfs_getauth: clnt princ=NULL, "
605 "srv princ=%s\n", srv_principal);
606 auth = rpc_gss_secfind_call(nrp->nr_client, cred,
607 srv_principal, mech_oid, svc);
608 } else {
609 NFSCL_DEBUG(1, "nfs_getauth: clnt princ=%s "
610 "srv princ=%s\n", clnt_principal, srv_principal);
611 auth = rpc_gss_seccreate_call(nrp->nr_client, cred,
612 clnt_principal, srv_principal, "kerberosv5",
613 svc, NULL, NULL, NULL);
614 return (auth);
615 }
616 if (auth != NULL)
617 return (auth);
618 /* fallthrough */
619 case AUTH_SYS:
620 default:
621 return (authunix_create(cred));
622 }
623 }
624
/*
 * Callback from the RPC code to generate up/down notifications.
 *
 * State handed to nfs_feedback() through the rc_feedback_arg field of the
 * RPC call; newnfs_request() fills one in for every request made on behalf
 * of a mount.
 */

struct nfs_feedback_arg {
	struct nfsmount *nf_mount;	/* mount the RPC belongs to */
	int		nf_lastmsg;	/* last tprintf */
	int		nf_tprintfmsg;	/* set once "not responding" was reported */
	struct thread	*nf_td;		/* thread passed to nfs_down()/nfs_up() */
};
635
636 static void
nfs_feedback(int type,int proc,void * arg)637 nfs_feedback(int type, int proc, void *arg)
638 {
639 struct nfs_feedback_arg *nf = (struct nfs_feedback_arg *) arg;
640 struct nfsmount *nmp = nf->nf_mount;
641 time_t now;
642
643 switch (type) {
644 case FEEDBACK_REXMIT2:
645 case FEEDBACK_RECONNECT:
646 now = NFSD_MONOSEC;
647 if (nf->nf_lastmsg + nmp->nm_tprintf_delay < now) {
648 nfs_down(nmp, nf->nf_td,
649 "not responding", 0, NFSSTA_TIMEO);
650 nf->nf_tprintfmsg = TRUE;
651 nf->nf_lastmsg = now;
652 }
653 break;
654
655 case FEEDBACK_OK:
656 nfs_up(nf->nf_mount, nf->nf_td,
657 "is alive again", NFSSTA_TIMEO, nf->nf_tprintfmsg);
658 break;
659 }
660 }
661
662 /*
663 * newnfs_request - goes something like this
664 * - does the rpc by calling the krpc layer
665 * - break down rpc header and return with nfs reply
666 * nb: always frees up nd_mreq mbuf list
667 */
668 int
newnfs_request(struct nfsrv_descript * nd,struct nfsmount * nmp,struct nfsclient * clp,struct nfssockreq * nrp,vnode_t vp,struct thread * td,struct ucred * cred,u_int32_t prog,u_int32_t vers,u_char * retsum,int toplevel,u_int64_t * xidp,struct nfsclsession * dssep)669 newnfs_request(struct nfsrv_descript *nd, struct nfsmount *nmp,
670 struct nfsclient *clp, struct nfssockreq *nrp, vnode_t vp,
671 struct thread *td, struct ucred *cred, u_int32_t prog, u_int32_t vers,
672 u_char *retsum, int toplevel, u_int64_t *xidp, struct nfsclsession *dssep)
673 {
674 uint32_t retseq, retval, retval0, slotseq, *tl;
675 int i = 0, j = 0, opcnt, set_sigset = 0, slot;
676 int error = 0, usegssname = 0, secflavour = AUTH_SYS;
677 int freeslot, maxslot, reterr, slotpos, timeo;
678 u_int16_t procnum;
679 u_int nextconn;
680 struct nfs_feedback_arg nf;
681 struct timeval timo;
682 AUTH *auth;
683 struct rpc_callextra ext;
684 enum clnt_stat stat;
685 struct nfsreq *rep = NULL;
686 char *srv_principal = NULL, *clnt_principal = NULL;
687 sigset_t oldset;
688 struct ucred *authcred;
689 struct nfsclsession *sep;
690 uint8_t sessionid[NFSX_V4SESSIONID];
691 bool nextconn_set;
692 struct timespec trylater_delay, ts, waituntil;
693
694 /* Initially 1msec. */
695 trylater_delay.tv_sec = 0;
696 trylater_delay.tv_nsec = 1000000;
697 sep = dssep;
698 if (xidp != NULL)
699 *xidp = 0;
700 /* Reject requests while attempting a forced unmount. */
701 if (nmp != NULL && NFSCL_FORCEDISM(nmp->nm_mountp)) {
702 m_freem(nd->nd_mreq);
703 return (ESTALE);
704 }
705
706 /*
707 * Set authcred, which is used to acquire RPC credentials to
708 * the cred argument, by default. The crhold() should not be
709 * necessary, but will ensure that some future code change
710 * doesn't result in the credential being free'd prematurely.
711 */
712 authcred = crhold(cred);
713
714 /* For client side interruptible mounts, mask off the signals. */
715 if (nmp != NULL && td != NULL && NFSHASINT(nmp)) {
716 newnfs_set_sigmask(td, &oldset);
717 set_sigset = 1;
718 }
719
720 /*
721 * If not already connected call newnfs_connect now.
722 */
723 if (nrp->nr_client == NULL)
724 newnfs_connect(nmp, nrp, cred, td, 0, false, &nrp->nr_client);
725
726 /*
727 * If the "nconnect" mount option was specified and this RPC is
728 * one that can have a large RPC message and is being done through
729 * the NFS/MDS server, use an additional connection. (When the RPC is
730 * being done through the server/MDS, nrp == &nmp->nm_sockreq.)
731 * The "nconnect" mount option normally has minimal effect when the
732 * "pnfs" mount option is specified, since only Readdir RPCs are
733 * normally done through the NFS/MDS server.
734 */
735 nextconn_set = false;
736 if (nmp != NULL && nmp->nm_aconnect > 0 && nrp == &nmp->nm_sockreq &&
737 (nd->nd_procnum == NFSPROC_READ ||
738 nd->nd_procnum == NFSPROC_READDIR ||
739 nd->nd_procnum == NFSPROC_READDIRPLUS ||
740 nd->nd_procnum == NFSPROC_WRITE)) {
741 nextconn = atomic_fetchadd_int(&nmp->nm_nextaconn, 1);
742 nextconn %= nmp->nm_aconnect;
743 nextconn_set = true;
744 if (nmp->nm_aconn[nextconn] == NULL)
745 newnfs_connect(nmp, nrp, cred, td, 0, false,
746 &nmp->nm_aconn[nextconn]);
747 }
748
749 /*
750 * For a client side mount, nmp is != NULL and clp == NULL. For
751 * server calls (callbacks or upcalls), nmp == NULL.
752 */
753 if (clp != NULL) {
754 NFSLOCKSTATE();
755 if ((clp->lc_flags & LCL_GSS) && nfsrv_gsscallbackson) {
756 secflavour = RPCSEC_GSS_KRB5;
757 if (nd->nd_procnum != NFSPROC_NULL) {
758 if (clp->lc_flags & LCL_GSSINTEGRITY)
759 secflavour = RPCSEC_GSS_KRB5I;
760 else if (clp->lc_flags & LCL_GSSPRIVACY)
761 secflavour = RPCSEC_GSS_KRB5P;
762 }
763 }
764 NFSUNLOCKSTATE();
765 } else if (nmp != NULL && NFSHASKERB(nmp) &&
766 nd->nd_procnum != NFSPROC_NULL && (!NFSHASSYSKRB5(nmp) ||
767 nfscl_use_gss[nd->nd_procnum])) {
768 if (NFSHASALLGSSNAME(nmp) && nmp->nm_krbnamelen > 0)
769 nd->nd_flag |= ND_USEGSSNAME;
770 if ((nd->nd_flag & ND_USEGSSNAME) != 0) {
771 /*
772 * If there is a client side host based credential,
773 * use that, otherwise use the system uid, if set.
774 * The system uid is in the nmp->nm_sockreq.nr_cred
775 * credentials.
776 */
777 if (nmp->nm_krbnamelen > 0) {
778 usegssname = 1;
779 clnt_principal = nmp->nm_krbname;
780 } else if (nmp->nm_uid != (uid_t)-1) {
781 KASSERT(nmp->nm_sockreq.nr_cred != NULL,
782 ("newnfs_request: NULL nr_cred"));
783 crfree(authcred);
784 authcred = crhold(nmp->nm_sockreq.nr_cred);
785 }
786 } else if (nmp->nm_krbnamelen == 0 &&
787 nmp->nm_uid != (uid_t)-1 && cred->cr_uid == (uid_t)0) {
788 /*
789 * If there is no host based principal name and
790 * the system uid is set and this is root, use the
791 * system uid, since root won't have user
792 * credentials in a credentials cache file.
793 * The system uid is in the nmp->nm_sockreq.nr_cred
794 * credentials.
795 */
796 KASSERT(nmp->nm_sockreq.nr_cred != NULL,
797 ("newnfs_request: NULL nr_cred"));
798 crfree(authcred);
799 authcred = crhold(nmp->nm_sockreq.nr_cred);
800 }
801 if (NFSHASINTEGRITY(nmp))
802 secflavour = RPCSEC_GSS_KRB5I;
803 else if (NFSHASPRIVACY(nmp))
804 secflavour = RPCSEC_GSS_KRB5P;
805 else
806 secflavour = RPCSEC_GSS_KRB5;
807 if (nrp->nr_srvprinc[0] == '\0')
808 srv_principal = NFSMNT_SRVKRBNAME(nmp);
809 else
810 srv_principal = nrp->nr_srvprinc;
811 } else if (nmp != NULL && (!NFSHASKERB(nmp) || NFSHASSYSKRB5(nmp)) &&
812 nd->nd_procnum != NFSPROC_NULL &&
813 (nd->nd_flag & ND_USEGSSNAME) != 0) {
814 /*
815 * Use the uid that did the mount when the RPC is doing
816 * NFSv4 system operations, as indicated by the
817 * ND_USEGSSNAME flag, for the AUTH_SYS case.
818 * The credentials in nm_sockreq.nr_cred were used for the
819 * mount.
820 */
821 KASSERT(nmp->nm_sockreq.nr_cred != NULL,
822 ("newnfs_request: NULL nr_cred"));
823 crfree(authcred);
824 authcred = crhold(nmp->nm_sockreq.nr_cred);
825 }
826
827 if (nmp != NULL) {
828 bzero(&nf, sizeof(struct nfs_feedback_arg));
829 nf.nf_mount = nmp;
830 nf.nf_td = td;
831 nf.nf_lastmsg = NFSD_MONOSEC -
832 ((nmp->nm_tprintf_delay)-(nmp->nm_tprintf_initial_delay));
833 }
834
835 if (nd->nd_procnum == NFSPROC_NULL)
836 auth = authnone_create();
837 else if (usegssname) {
838 /*
839 * For this case, the authenticator is held in the
840 * nfssockreq structure, so don't release the reference count
841 * held on it. --> Don't AUTH_DESTROY() it in this function.
842 */
843 if (nrp->nr_auth == NULL)
844 nrp->nr_auth = nfs_getauth(nrp, secflavour,
845 clnt_principal, srv_principal, NULL, authcred);
846 else
847 rpc_gss_refresh_auth_call(nrp->nr_auth);
848 auth = nrp->nr_auth;
849 } else
850 auth = nfs_getauth(nrp, secflavour, NULL,
851 srv_principal, NULL, authcred);
852 crfree(authcred);
853 if (auth == NULL) {
854 m_freem(nd->nd_mreq);
855 if (set_sigset)
856 newnfs_restore_sigmask(td, &oldset);
857 return (EACCES);
858 }
859 bzero(&ext, sizeof(ext));
860 ext.rc_auth = auth;
861 if (nmp != NULL) {
862 ext.rc_feedback = nfs_feedback;
863 ext.rc_feedback_arg = &nf;
864 }
865
866 procnum = nd->nd_procnum;
867 if ((nd->nd_flag & ND_NFSV4) &&
868 nd->nd_procnum != NFSPROC_NULL &&
869 nd->nd_procnum != NFSV4PROC_CBCOMPOUND)
870 procnum = NFSV4PROC_COMPOUND;
871
872 if (nmp != NULL) {
873 NFSINCRGLOBAL(nfsstatsv1.rpcrequests);
874
875 /* Map the procnum to the old NFSv2 one, as required. */
876 if ((nd->nd_flag & ND_NFSV2) != 0) {
877 if (nd->nd_procnum < NFS_V3NPROCS)
878 procnum = nfsv2_procid[nd->nd_procnum];
879 else
880 procnum = NFSV2PROC_NOOP;
881 }
882
883 /*
884 * Now only used for the R_DONTRECOVER case, but until that is
885 * supported within the krpc code, I need to keep a queue of
886 * outstanding RPCs for nfsv4 client requests.
887 */
888 if ((nd->nd_flag & ND_NFSV4) && procnum == NFSV4PROC_COMPOUND)
889 rep = malloc(sizeof(struct nfsreq),
890 M_NFSDREQ, M_WAITOK);
891 #ifdef KDTRACE_HOOKS
892 if (dtrace_nfscl_nfs234_start_probe != NULL) {
893 uint32_t probe_id;
894 int probe_procnum;
895
896 if (nd->nd_flag & ND_NFSV4) {
897 probe_id =
898 nfscl_nfs4_start_probes[nd->nd_procnum];
899 probe_procnum = nd->nd_procnum;
900 } else if (nd->nd_flag & ND_NFSV3) {
901 probe_id = nfscl_nfs3_start_probes[procnum];
902 probe_procnum = procnum;
903 } else {
904 probe_id =
905 nfscl_nfs2_start_probes[nd->nd_procnum];
906 probe_procnum = procnum;
907 }
908 if (probe_id != 0)
909 (dtrace_nfscl_nfs234_start_probe)
910 (probe_id, vp, nd->nd_mreq, cred,
911 probe_procnum);
912 }
913 #endif
914 }
915 freeslot = -1; /* Set to slot that needs to be free'd */
916 tryagain:
917 slot = -1; /* Slot that needs a sequence# increment. */
918 /*
919 * This timeout specifies when a new socket should be created,
920 * along with new xid values. For UDP, this should be done
921 * infrequently, since retransmits of RPC requests should normally
922 * use the same xid.
923 */
924 if (nmp == NULL) {
925 if (clp == NULL) {
926 timo.tv_sec = NFSV4_UPCALLTIMEO;
927 timo.tv_usec = 0;
928 } else {
929 timo.tv_sec = NFSV4_CALLBACKTIMEO / 1000;
930 timo.tv_usec = NFSV4_CALLBACKTIMEO * 1000;
931 }
932 } else {
933 if (nrp->nr_sotype != SOCK_DGRAM) {
934 timo.tv_usec = 0;
935 if ((nmp->nm_flag & NFSMNT_NFSV4))
936 timo.tv_sec = INT_MAX;
937 else
938 timo.tv_sec = NFS_TCPTIMEO;
939 } else {
940 if (NFSHASSOFT(nmp)) {
941 /*
942 * CLSET_RETRIES is set to 2, so this should be
943 * half of the total timeout required.
944 */
945 timeo = nmp->nm_retry * nmp->nm_timeo / 2;
946 if (timeo < 1)
947 timeo = 1;
948 timo.tv_sec = timeo / NFS_HZ;
949 timo.tv_usec = (timeo % NFS_HZ) * 1000000 /
950 NFS_HZ;
951 } else {
952 /* For UDP hard mounts, use a large value. */
953 timo.tv_sec = NFS_MAXTIMEO / NFS_HZ;
954 timo.tv_usec = 0;
955 }
956 }
957
958 if (rep != NULL) {
959 rep->r_flags = 0;
960 rep->r_nmp = nmp;
961 /*
962 * Chain request into list of outstanding requests.
963 */
964 NFSLOCKREQ();
965 TAILQ_INSERT_TAIL(&nfsd_reqq, rep, r_chain);
966 NFSUNLOCKREQ();
967 }
968 }
969
970 nd->nd_mrep = NULL;
971 if (clp != NULL && sep != NULL)
972 stat = clnt_bck_call(nrp->nr_client, &ext, procnum,
973 nd->nd_mreq, &nd->nd_mrep, timo, sep->nfsess_xprt);
974 else if (nextconn_set)
975 /*
976 * When there are multiple TCP connections, send the
977 * RPCs with large messages on the alternate TCP
978 * connection(s) in a round robin fashion.
979 * The small RPC messages are sent on the default
980 * TCP connection because they do not require much
981 * network bandwidth and separating them from the
982 * large RPC messages avoids them getting "log jammed"
983 * behind several large RPC messages.
984 */
985 stat = CLNT_CALL_MBUF(nmp->nm_aconn[nextconn],
986 &ext, procnum, nd->nd_mreq, &nd->nd_mrep, timo);
987 else
988 stat = CLNT_CALL_MBUF(nrp->nr_client, &ext, procnum,
989 nd->nd_mreq, &nd->nd_mrep, timo);
990 NFSCL_DEBUG(2, "clnt call=%d\n", stat);
991
992 if (rep != NULL) {
993 /*
994 * RPC done, unlink the request.
995 */
996 NFSLOCKREQ();
997 TAILQ_REMOVE(&nfsd_reqq, rep, r_chain);
998 NFSUNLOCKREQ();
999 }
1000
1001 /*
1002 * If there was a successful reply and a tprintf msg.
1003 * tprintf a response.
1004 */
1005 if (stat == RPC_SUCCESS) {
1006 error = 0;
1007 } else if (stat == RPC_TIMEDOUT) {
1008 NFSINCRGLOBAL(nfsstatsv1.rpctimeouts);
1009 error = ETIMEDOUT;
1010 } else if (stat == RPC_VERSMISMATCH) {
1011 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1012 error = EOPNOTSUPP;
1013 } else if (stat == RPC_PROGVERSMISMATCH) {
1014 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1015 error = EPROTONOSUPPORT;
1016 } else if (stat == RPC_CANTSEND || stat == RPC_CANTRECV ||
1017 stat == RPC_SYSTEMERROR || stat == RPC_INTR) {
1018 /* Check for a session slot that needs to be free'd. */
1019 if ((nd->nd_flag & (ND_NFSV41 | ND_HASSLOTID)) ==
1020 (ND_NFSV41 | ND_HASSLOTID) && nmp != NULL &&
1021 nd->nd_procnum != NFSPROC_NULL) {
1022 /*
1023 * This should only occur when either the MDS or
1024 * a client has an RPC against a DS fail.
1025 * This happens because these cases use "soft"
1026 * connections that can time out and fail.
1027 * The slot used for this RPC is now in a
1028 * non-deterministic state, but if the slot isn't
1029 * free'd, threads can get stuck waiting for a slot.
1030 */
1031 if (sep == NULL)
1032 sep = nfsmnt_mdssession(nmp);
1033 /*
1034 * Bump the sequence# out of range, so that reuse of
1035 * this slot will result in an NFSERR_SEQMISORDERED
1036 * error and not a bogus cached RPC reply.
1037 */
1038 mtx_lock(&sep->nfsess_mtx);
1039 sep->nfsess_slotseq[nd->nd_slotid] += 10;
1040 sep->nfsess_badslots |= (0x1ULL << nd->nd_slotid);
1041 mtx_unlock(&sep->nfsess_mtx);
1042 /* And free the slot. */
1043 nfsv4_freeslot(sep, nd->nd_slotid, true);
1044 }
1045 if (stat == RPC_INTR)
1046 error = EINTR;
1047 else {
1048 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1049 error = ENXIO;
1050 }
1051 } else if (stat == RPC_AUTHERROR) {
1052 /* Check for a session slot that needs to be free'd. */
1053 if ((nd->nd_flag & (ND_NFSV41 | ND_HASSLOTID)) ==
1054 (ND_NFSV41 | ND_HASSLOTID) && nmp != NULL &&
1055 nd->nd_procnum != NFSPROC_NULL) {
1056 /*
1057 * This can occur when a Kerberos/RPCSEC_GSS session
1058 * expires, due to TGT expiration.
1059 * Free the slot, resetting the slot's sequence#.
1060 */
1061 if (sep == NULL)
1062 sep = nfsmnt_mdssession(nmp);
1063 nfsv4_freeslot(sep, nd->nd_slotid, true);
1064 }
1065 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1066 error = EACCES;
1067 } else {
1068 NFSINCRGLOBAL(nfsstatsv1.rpcinvalid);
1069 error = EACCES;
1070 }
1071 if (error) {
1072 m_freem(nd->nd_mreq);
1073 if (usegssname == 0)
1074 AUTH_DESTROY(auth);
1075 if (rep != NULL)
1076 free(rep, M_NFSDREQ);
1077 if (set_sigset)
1078 newnfs_restore_sigmask(td, &oldset);
1079 return (error);
1080 }
1081
1082 KASSERT(nd->nd_mrep != NULL, ("mrep shouldn't be NULL if no error\n"));
1083
1084 /*
1085 * Search for any mbufs that are not a multiple of 4 bytes long
1086 * or with m_data not longword aligned.
1087 * These could cause pointer alignment problems, so copy them to
1088 * well aligned mbufs.
1089 */
1090 newnfs_realign(&nd->nd_mrep, M_WAITOK);
1091 nd->nd_md = nd->nd_mrep;
1092 nd->nd_dpos = mtod(nd->nd_md, caddr_t);
1093 nd->nd_repstat = 0;
1094 if (nd->nd_procnum != NFSPROC_NULL &&
1095 nd->nd_procnum != NFSV4PROC_CBNULL) {
1096 /* If sep == NULL, set it to the default in nmp. */
1097 if (sep == NULL && nmp != NULL)
1098 sep = nfsmnt_mdssession(nmp);
1099 /*
1100 * and now the actual NFS xdr.
1101 */
1102 NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1103 nd->nd_repstat = fxdr_unsigned(u_int32_t, *tl);
1104 if (nd->nd_repstat >= 10000)
1105 NFSCL_DEBUG(1, "proc=%d reps=%d\n", (int)nd->nd_procnum,
1106 (int)nd->nd_repstat);
1107
1108 /*
1109 * Get rid of the tag, return count and SEQUENCE result for
1110 * NFSv4.
1111 */
1112 if ((nd->nd_flag & ND_NFSV4) != 0 && nd->nd_repstat !=
1113 NFSERR_MINORVERMISMATCH) {
1114 NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1115 i = fxdr_unsigned(int, *tl);
1116 error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
1117 if (error)
1118 goto nfsmout;
1119 NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1120 opcnt = fxdr_unsigned(int, *tl++);
1121 i = fxdr_unsigned(int, *tl++);
1122 j = fxdr_unsigned(int, *tl);
1123 if (j >= 10000)
1124 NFSCL_DEBUG(1, "fop=%d fst=%d\n", i, j);
1125 /*
1126 * If the first op is Sequence, free up the slot.
1127 */
1128 if ((nmp != NULL && i == NFSV4OP_SEQUENCE && j != 0) ||
1129 (clp != NULL && i == NFSV4OP_CBSEQUENCE && j != 0)) {
1130 NFSCL_DEBUG(1, "failed seq=%d\n", j);
1131 if (sep != NULL && i == NFSV4OP_SEQUENCE &&
1132 j == NFSERR_SEQMISORDERED) {
1133 mtx_lock(&sep->nfsess_mtx);
1134 sep->nfsess_badslots |=
1135 (0x1ULL << nd->nd_slotid);
1136 mtx_unlock(&sep->nfsess_mtx);
1137 }
1138 }
1139 if (((nmp != NULL && i == NFSV4OP_SEQUENCE && j == 0) ||
1140 (clp != NULL && i == NFSV4OP_CBSEQUENCE &&
1141 j == 0)) && sep != NULL) {
1142 if (i == NFSV4OP_SEQUENCE)
1143 NFSM_DISSECT(tl, uint32_t *,
1144 NFSX_V4SESSIONID +
1145 5 * NFSX_UNSIGNED);
1146 else
1147 NFSM_DISSECT(tl, uint32_t *,
1148 NFSX_V4SESSIONID +
1149 4 * NFSX_UNSIGNED);
1150 mtx_lock(&sep->nfsess_mtx);
1151 if (bcmp(tl, sep->nfsess_sessionid,
1152 NFSX_V4SESSIONID) == 0) {
1153 tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
1154 retseq = fxdr_unsigned(uint32_t, *tl++);
1155 slot = fxdr_unsigned(int, *tl++);
1156 if ((nd->nd_flag & ND_HASSLOTID) != 0) {
1157 if (slot >= NFSV4_SLOTS ||
1158 (i == NFSV4OP_CBSEQUENCE &&
1159 slot >= NFSV4_CBSLOTS)) {
1160 printf("newnfs_request:"
1161 " Bogus slot\n");
1162 slot = nd->nd_slotid;
1163 } else if (slot !=
1164 nd->nd_slotid) {
1165 printf("newnfs_request:"
1166 " Wrong session "
1167 "srvslot=%d "
1168 "slot=%d\n", slot,
1169 nd->nd_slotid);
1170 if (i == NFSV4OP_SEQUENCE) {
1171 /*
1172 * Mark both slots as
1173 * bad, because we do
1174 * not know if the
1175 * server has advanced
1176 * the sequence# for
1177 * either of them.
1178 */
1179 sep->nfsess_badslots |=
1180 (0x1ULL << slot);
1181 sep->nfsess_badslots |=
1182 (0x1ULL <<
1183 nd->nd_slotid);
1184 }
1185 slot = nd->nd_slotid;
1186 }
1187 freeslot = slot;
1188 } else if (slot != 0) {
1189 printf("newnfs_request: Bad "
1190 "session slot=%d\n", slot);
1191 slot = 0;
1192 }
1193 if (retseq != sep->nfsess_slotseq[slot])
1194 printf("retseq diff 0x%x\n",
1195 retseq);
1196 retval0 = fxdr_unsigned(uint32_t,*tl++);
1197 retval = fxdr_unsigned(uint32_t, *tl);
1198 if ((retval + 1) < sep->nfsess_foreslots
1199 ) {
1200 sep->nfsess_foreslots = (retval
1201 + 1);
1202 nfs_resetslots(sep);
1203 } else if ((retval + 1) >
1204 sep->nfsess_foreslots) {
1205 if (retval0 > retval)
1206 printf("Sess:highest > "
1207 "target_highest\n");
1208 sep->nfsess_foreslots =
1209 (retval < NFSV4_SLOTS) ?
1210 (retval + 1) : NFSV4_SLOTS;
1211 }
1212 }
1213 mtx_unlock(&sep->nfsess_mtx);
1214
1215 /* Grab the op and status for the next one. */
1216 if (opcnt > 1) {
1217 NFSM_DISSECT(tl, uint32_t *,
1218 2 * NFSX_UNSIGNED);
1219 i = fxdr_unsigned(int, *tl++);
1220 j = fxdr_unsigned(int, *tl);
1221 }
1222 }
1223 }
1224 if (nd->nd_repstat != 0) {
1225 if (nd->nd_repstat == NFSERR_BADSESSION &&
1226 nmp != NULL && dssep == NULL &&
1227 (nd->nd_flag & ND_NFSV41) != 0) {
1228 /*
1229 * If this is a client side MDS RPC, mark
1230 * the MDS session defunct and initiate
1231 * recovery, as required.
1232 * The nfsess_defunct field is protected by
1233 * the NFSLOCKMNT()/nm_mtx lock and not the
1234 * nfsess_mtx lock to simplify its handling,
1235 * for the MDS session. This lock is also
1236 * sufficient for nfsess_sessionid, since it
1237 * never changes in the structure.
1238 */
1239 NFSCL_DEBUG(1, "Got badsession\n");
1240 NFSLOCKCLSTATE();
1241 NFSLOCKMNT(nmp);
1242 if (TAILQ_EMPTY(&nmp->nm_sess)) {
1243 NFSUNLOCKMNT(nmp);
1244 NFSUNLOCKCLSTATE();
1245 printf("If server has not rebooted, "
1246 "check NFS clients for unique "
1247 "/etc/hostid's\n");
1248 goto out;
1249 }
1250 sep = NFSMNT_MDSSESSION(nmp);
1251 if (bcmp(sep->nfsess_sessionid, nd->nd_sequence,
1252 NFSX_V4SESSIONID) == 0) {
1253 printf("Initiate recovery. If server "
1254 "has not rebooted, "
1255 "check NFS clients for unique "
1256 "/etc/hostid's\n");
1257 /* Initiate recovery. */
1258 sep->nfsess_defunct = 1;
1259 NFSCL_DEBUG(1, "Marked defunct\n");
1260 if (nmp->nm_clp != NULL) {
1261 nmp->nm_clp->nfsc_flags |=
1262 NFSCLFLAGS_RECOVER;
1263 wakeup(nmp->nm_clp);
1264 }
1265 }
1266 NFSUNLOCKCLSTATE();
1267 /*
1268 * Sleep for up to 1sec waiting for a new
1269 * session.
1270 */
1271 mtx_sleep(&nmp->nm_sess, &nmp->nm_mtx, PZERO,
1272 "nfsbadsess", hz);
1273 /*
1274 * Get the session again, in case a new one
1275 * has been created during the sleep.
1276 */
1277 sep = NFSMNT_MDSSESSION(nmp);
1278 NFSUNLOCKMNT(nmp);
1279 if ((nd->nd_flag & ND_LOOPBADSESS) != 0) {
1280 reterr = nfsv4_sequencelookup(nmp, sep,
1281 &slotpos, &maxslot, &slotseq,
1282 sessionid, true);
1283 if (reterr == 0) {
1284 /* Fill in new session info. */
1285 NFSCL_DEBUG(1,
1286 "Filling in new sequence\n");
1287 tl = nd->nd_sequence;
1288 bcopy(sessionid, tl,
1289 NFSX_V4SESSIONID);
1290 tl += NFSX_V4SESSIONID /
1291 NFSX_UNSIGNED;
1292 *tl++ = txdr_unsigned(slotseq);
1293 *tl++ = txdr_unsigned(slotpos);
1294 *tl = txdr_unsigned(maxslot);
1295 nd->nd_slotid = slotpos;
1296 nd->nd_flag |= ND_HASSLOTID;
1297 }
1298 if (reterr == NFSERR_BADSESSION ||
1299 reterr == 0) {
1300 NFSCL_DEBUG(1,
1301 "Badsession looping\n");
1302 m_freem(nd->nd_mrep);
1303 nd->nd_mrep = NULL;
1304 goto tryagain;
1305 }
1306 nd->nd_repstat = reterr;
1307 NFSCL_DEBUG(1, "Got err=%d\n", reterr);
1308 }
1309 }
1310 /*
1311 * When clp != NULL, it is a callback and all
1312 * callback operations can be retried for NFSERR_DELAY.
1313 */
1314 if (((nd->nd_repstat == NFSERR_DELAY ||
1315 nd->nd_repstat == NFSERR_GRACE) &&
1316 (nd->nd_flag & ND_NFSV4) && (clp != NULL ||
1317 (nd->nd_procnum != NFSPROC_DELEGRETURN &&
1318 nd->nd_procnum != NFSPROC_SETATTR &&
1319 nd->nd_procnum != NFSPROC_READ &&
1320 nd->nd_procnum != NFSPROC_READDS &&
1321 nd->nd_procnum != NFSPROC_WRITE &&
1322 nd->nd_procnum != NFSPROC_WRITEDS &&
1323 nd->nd_procnum != NFSPROC_OPEN &&
1324 nd->nd_procnum != NFSPROC_OPENLAYGET &&
1325 nd->nd_procnum != NFSPROC_CREATE &&
1326 nd->nd_procnum != NFSPROC_CREATELAYGET &&
1327 nd->nd_procnum != NFSPROC_OPENCONFIRM &&
1328 nd->nd_procnum != NFSPROC_OPENDOWNGRADE &&
1329 nd->nd_procnum != NFSPROC_CLOSE &&
1330 nd->nd_procnum != NFSPROC_LOCK &&
1331 nd->nd_procnum != NFSPROC_LOCKU))) ||
1332 (nd->nd_repstat == NFSERR_DELAY &&
1333 (nd->nd_flag & ND_NFSV4) == 0) ||
1334 nd->nd_repstat == NFSERR_RESOURCE ||
1335 nd->nd_repstat == NFSERR_RETRYUNCACHEDREP) {
1336 /* Clip at NFS_TRYLATERDEL. */
1337 if (timespeccmp(&trylater_delay,
1338 &nfs_trylater_max, >))
1339 trylater_delay = nfs_trylater_max;
1340 getnanouptime(&waituntil);
1341 timespecadd(&waituntil, &trylater_delay,
1342 &waituntil);
1343 do {
1344 nfs_catnap(PZERO, 0, "nfstry");
1345 getnanouptime(&ts);
1346 } while (timespeccmp(&ts, &waituntil, <));
1347 timespecadd(&trylater_delay, &trylater_delay,
1348 &trylater_delay); /* Double each time. */
1349 if (slot != -1) {
1350 mtx_lock(&sep->nfsess_mtx);
1351 sep->nfsess_slotseq[slot]++;
1352 *nd->nd_slotseq = txdr_unsigned(
1353 sep->nfsess_slotseq[slot]);
1354 mtx_unlock(&sep->nfsess_mtx);
1355 }
1356 m_freem(nd->nd_mrep);
1357 nd->nd_mrep = NULL;
1358 goto tryagain;
1359 }
1360
1361 /*
1362 * If the File Handle was stale, invalidate the
1363 * lookup cache, just in case.
1364 * (vp != NULL implies a client side call)
1365 */
1366 if (nd->nd_repstat == ESTALE && vp != NULL) {
1367 cache_purge(vp);
1368 if (ncl_call_invalcaches != NULL)
1369 (*ncl_call_invalcaches)(vp);
1370 }
1371 }
1372 if ((nd->nd_flag & ND_NFSV4) != 0) {
1373 /* Free the slot, as required. */
1374 if (freeslot != -1)
1375 nfsv4_freeslot(sep, freeslot, false);
1376 /*
1377 * If this op is Putfh, throw its results away.
1378 */
1379 if (j >= 10000)
1380 NFSCL_DEBUG(1, "nop=%d nst=%d\n", i, j);
1381 if (nmp != NULL && i == NFSV4OP_PUTFH && j == 0) {
1382 NFSM_DISSECT(tl,u_int32_t *,2 * NFSX_UNSIGNED);
1383 i = fxdr_unsigned(int, *tl++);
1384 j = fxdr_unsigned(int, *tl);
1385 if (j >= 10000)
1386 NFSCL_DEBUG(1, "n2op=%d n2st=%d\n", i,
1387 j);
1388 /*
1389 * All Compounds that do an Op that must
1390 * be in sequence consist of NFSV4OP_PUTFH
1391 * followed by one of these. As such, we
1392 * can determine if the seqid# should be
1393 * incremented, here.
1394 */
1395 if ((i == NFSV4OP_OPEN ||
1396 i == NFSV4OP_OPENCONFIRM ||
1397 i == NFSV4OP_OPENDOWNGRADE ||
1398 i == NFSV4OP_CLOSE ||
1399 i == NFSV4OP_LOCK ||
1400 i == NFSV4OP_LOCKU) &&
1401 (j == 0 ||
1402 (j != NFSERR_STALECLIENTID &&
1403 j != NFSERR_STALESTATEID &&
1404 j != NFSERR_BADSTATEID &&
1405 j != NFSERR_BADSEQID &&
1406 j != NFSERR_BADXDR &&
1407 j != NFSERR_RESOURCE &&
1408 j != NFSERR_NOFILEHANDLE)))
1409 nd->nd_flag |= ND_INCRSEQID;
1410 }
1411 /*
1412 * If this op's status is non-zero, mark
1413 * that there is no more data to process.
1414 * The exception is Setattr, which always has xdr
1415 * when it has failed.
1416 */
1417 if (j != 0 && i != NFSV4OP_SETATTR)
1418 nd->nd_flag |= ND_NOMOREDATA;
1419
1420 /*
1421 * If R_DONTRECOVER is set, replace the stale error
1422 * reply, so that recovery isn't initiated.
1423 */
1424 if ((nd->nd_repstat == NFSERR_STALECLIENTID ||
1425 nd->nd_repstat == NFSERR_BADSESSION ||
1426 nd->nd_repstat == NFSERR_STALESTATEID) &&
1427 rep != NULL && (rep->r_flags & R_DONTRECOVER))
1428 nd->nd_repstat = NFSERR_STALEDONTRECOVER;
1429 }
1430 }
1431 out:
1432
1433 #ifdef KDTRACE_HOOKS
1434 if (nmp != NULL && dtrace_nfscl_nfs234_done_probe != NULL) {
1435 uint32_t probe_id;
1436 int probe_procnum;
1437
1438 if (nd->nd_flag & ND_NFSV4) {
1439 probe_id = nfscl_nfs4_done_probes[nd->nd_procnum];
1440 probe_procnum = nd->nd_procnum;
1441 } else if (nd->nd_flag & ND_NFSV3) {
1442 probe_id = nfscl_nfs3_done_probes[procnum];
1443 probe_procnum = procnum;
1444 } else {
1445 probe_id = nfscl_nfs2_done_probes[nd->nd_procnum];
1446 probe_procnum = procnum;
1447 }
1448 if (probe_id != 0)
1449 (dtrace_nfscl_nfs234_done_probe)(probe_id, vp,
1450 nd->nd_mreq, cred, probe_procnum, 0);
1451 }
1452 #endif
1453
1454 m_freem(nd->nd_mreq);
1455 if (usegssname == 0)
1456 AUTH_DESTROY(auth);
1457 if (rep != NULL)
1458 free(rep, M_NFSDREQ);
1459 if (set_sigset)
1460 newnfs_restore_sigmask(td, &oldset);
1461 return (0);
1462 nfsmout:
1463 m_freem(nd->nd_mrep);
1464 m_freem(nd->nd_mreq);
1465 if (usegssname == 0)
1466 AUTH_DESTROY(auth);
1467 if (rep != NULL)
1468 free(rep, M_NFSDREQ);
1469 if (set_sigset)
1470 newnfs_restore_sigmask(td, &oldset);
1471 return (error);
1472 }
1473
1474 /*
1475 * Reset slots above nfsess_foreslots that are not busy.
1476 */
1477 void
nfs_resetslots(struct nfsclsession * sep)1478 nfs_resetslots(struct nfsclsession *sep)
1479 {
1480 int i;
1481 uint64_t bitval;
1482
1483 mtx_assert(&sep->nfsess_mtx, MA_OWNED);
1484 bitval = (1 << sep->nfsess_foreslots);
1485 for (i = sep->nfsess_foreslots; i < NFSV4_SLOTS; i++) {
1486 if ((sep->nfsess_slots & bitval) == 0 &&
1487 (sep->nfsess_badslots & bitval) == 0)
1488 sep->nfsess_slotseq[i] = 0;
1489 bitval <<= 1;
1490 }
1491 }
1492
1493 /*
1494 * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1495 * wait for all requests to complete. This is used by forced unmounts
1496 * to terminate any outstanding RPCs.
1497 */
1498 int
newnfs_nmcancelreqs(struct nfsmount * nmp)1499 newnfs_nmcancelreqs(struct nfsmount *nmp)
1500 {
1501 struct nfsclds *dsp;
1502 struct __rpc_client *cl;
1503 int i;
1504
1505 if (nmp->nm_sockreq.nr_client != NULL)
1506 CLNT_CLOSE(nmp->nm_sockreq.nr_client);
1507 for (i = 0; i < nmp->nm_aconnect; i++)
1508 if (nmp->nm_aconn[i] != NULL)
1509 CLNT_CLOSE(nmp->nm_aconn[i]);
1510 lookformore:
1511 NFSLOCKMNT(nmp);
1512 TAILQ_FOREACH(dsp, &nmp->nm_sess, nfsclds_list) {
1513 NFSLOCKDS(dsp);
1514 if (dsp != TAILQ_FIRST(&nmp->nm_sess) &&
1515 (dsp->nfsclds_flags & NFSCLDS_CLOSED) == 0 &&
1516 dsp->nfsclds_sockp != NULL &&
1517 dsp->nfsclds_sockp->nr_client != NULL) {
1518 dsp->nfsclds_flags |= NFSCLDS_CLOSED;
1519 cl = dsp->nfsclds_sockp->nr_client;
1520 NFSUNLOCKDS(dsp);
1521 NFSUNLOCKMNT(nmp);
1522 CLNT_CLOSE(cl);
1523 goto lookformore;
1524 }
1525 NFSUNLOCKDS(dsp);
1526 }
1527 NFSUNLOCKMNT(nmp);
1528 return (0);
1529 }
1530
/*
 * The signals that are allowed to interrupt an NFS operation on an intr
 * mount.  To make another signal interrupt NFS operations, add it to
 * this table.  SIGSTOP and SIGKILL cannot be masked.
 */
int newnfs_sig_set[] = { SIGINT, SIGTERM, SIGHUP, SIGKILL, SIGQUIT };
1542
1543 /*
1544 * Check to see if one of the signals in our subset is pending on
1545 * the process (in an intr mount).
1546 */
1547 static int
nfs_sig_pending(sigset_t set)1548 nfs_sig_pending(sigset_t set)
1549 {
1550 int i;
1551
1552 for (i = 0 ; i < nitems(newnfs_sig_set); i++)
1553 if (SIGISMEMBER(set, newnfs_sig_set[i]))
1554 return (1);
1555 return (0);
1556 }
1557
1558 /*
1559 * The set/restore sigmask functions are used to (temporarily) overwrite
1560 * the thread td_sigmask during an RPC call (for example). These are also
1561 * used in other places in the NFS client that might tsleep().
1562 */
1563 void
newnfs_set_sigmask(struct thread * td,sigset_t * oldset)1564 newnfs_set_sigmask(struct thread *td, sigset_t *oldset)
1565 {
1566 sigset_t newset;
1567 int i;
1568 struct proc *p;
1569
1570 SIGFILLSET(newset);
1571 if (td == NULL)
1572 td = curthread; /* XXX */
1573 p = td->td_proc;
1574 /* Remove the NFS set of signals from newset */
1575 PROC_LOCK(p);
1576 mtx_lock(&p->p_sigacts->ps_mtx);
1577 for (i = 0 ; i < nitems(newnfs_sig_set); i++) {
1578 /*
1579 * But make sure we leave the ones already masked
1580 * by the process, ie. remove the signal from the
1581 * temporary signalmask only if it wasn't already
1582 * in p_sigmask.
1583 */
1584 if (!SIGISMEMBER(td->td_sigmask, newnfs_sig_set[i]) &&
1585 !SIGISMEMBER(p->p_sigacts->ps_sigignore, newnfs_sig_set[i]))
1586 SIGDELSET(newset, newnfs_sig_set[i]);
1587 }
1588 mtx_unlock(&p->p_sigacts->ps_mtx);
1589 kern_sigprocmask(td, SIG_SETMASK, &newset, oldset,
1590 SIGPROCMASK_PROC_LOCKED);
1591 PROC_UNLOCK(p);
1592 }
1593
1594 void
newnfs_restore_sigmask(struct thread * td,sigset_t * set)1595 newnfs_restore_sigmask(struct thread *td, sigset_t *set)
1596 {
1597 if (td == NULL)
1598 td = curthread; /* XXX */
1599 kern_sigprocmask(td, SIG_SETMASK, set, NULL, 0);
1600 }
1601
1602 /*
1603 * NFS wrapper to msleep(), that shoves a new p_sigmask and restores the
1604 * old one after msleep() returns.
1605 */
1606 int
newnfs_msleep(struct thread * td,void * ident,struct mtx * mtx,int priority,char * wmesg,int timo)1607 newnfs_msleep(struct thread *td, void *ident, struct mtx *mtx, int priority, char *wmesg, int timo)
1608 {
1609 sigset_t oldset;
1610 int error;
1611
1612 if ((priority & PCATCH) == 0)
1613 return msleep(ident, mtx, priority, wmesg, timo);
1614 if (td == NULL)
1615 td = curthread; /* XXX */
1616 newnfs_set_sigmask(td, &oldset);
1617 error = msleep(ident, mtx, priority, wmesg, timo);
1618 newnfs_restore_sigmask(td, &oldset);
1619 return (error);
1620 }
1621
1622 /*
1623 * Test for a termination condition pending on the process.
1624 * This is used for NFSMNT_INT mounts.
1625 */
1626 int
newnfs_sigintr(struct nfsmount * nmp,struct thread * td)1627 newnfs_sigintr(struct nfsmount *nmp, struct thread *td)
1628 {
1629 struct proc *p;
1630 sigset_t tmpset;
1631
1632 /* Terminate all requests while attempting a forced unmount. */
1633 if (NFSCL_FORCEDISM(nmp->nm_mountp))
1634 return (EIO);
1635 if (!(nmp->nm_flag & NFSMNT_INT))
1636 return (0);
1637 if (td == NULL)
1638 return (0);
1639 p = td->td_proc;
1640 PROC_LOCK(p);
1641 tmpset = p->p_siglist;
1642 SIGSETOR(tmpset, td->td_siglist);
1643 SIGSETNAND(tmpset, td->td_sigmask);
1644 mtx_lock(&p->p_sigacts->ps_mtx);
1645 SIGSETNAND(tmpset, p->p_sigacts->ps_sigignore);
1646 mtx_unlock(&p->p_sigacts->ps_mtx);
1647 if ((SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist))
1648 && nfs_sig_pending(tmpset)) {
1649 PROC_UNLOCK(p);
1650 return (EINTR);
1651 }
1652 PROC_UNLOCK(p);
1653 return (0);
1654 }
1655
1656 static int
nfs_msg(struct thread * td,const char * server,const char * msg,int error)1657 nfs_msg(struct thread *td, const char *server, const char *msg, int error)
1658 {
1659 struct proc *p;
1660
1661 p = td ? td->td_proc : NULL;
1662 if (error) {
1663 tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n",
1664 server, msg, error);
1665 } else {
1666 tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg);
1667 }
1668 return (0);
1669 }
1670
1671 static void
nfs_down(struct nfsmount * nmp,struct thread * td,const char * msg,int error,int flags)1672 nfs_down(struct nfsmount *nmp, struct thread *td, const char *msg,
1673 int error, int flags)
1674 {
1675 if (nmp == NULL)
1676 return;
1677 mtx_lock(&nmp->nm_mtx);
1678 if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
1679 nmp->nm_state |= NFSSTA_TIMEO;
1680 mtx_unlock(&nmp->nm_mtx);
1681 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1682 VQ_NOTRESP, 0);
1683 } else
1684 mtx_unlock(&nmp->nm_mtx);
1685 mtx_lock(&nmp->nm_mtx);
1686 if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1687 nmp->nm_state |= NFSSTA_LOCKTIMEO;
1688 mtx_unlock(&nmp->nm_mtx);
1689 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1690 VQ_NOTRESPLOCK, 0);
1691 } else
1692 mtx_unlock(&nmp->nm_mtx);
1693 nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error);
1694 }
1695
1696 static void
nfs_up(struct nfsmount * nmp,struct thread * td,const char * msg,int flags,int tprintfmsg)1697 nfs_up(struct nfsmount *nmp, struct thread *td, const char *msg,
1698 int flags, int tprintfmsg)
1699 {
1700 if (nmp == NULL)
1701 return;
1702 if (tprintfmsg) {
1703 nfs_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0);
1704 }
1705
1706 mtx_lock(&nmp->nm_mtx);
1707 if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
1708 nmp->nm_state &= ~NFSSTA_TIMEO;
1709 mtx_unlock(&nmp->nm_mtx);
1710 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1711 VQ_NOTRESP, 1);
1712 } else
1713 mtx_unlock(&nmp->nm_mtx);
1714
1715 mtx_lock(&nmp->nm_mtx);
1716 if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
1717 nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
1718 mtx_unlock(&nmp->nm_mtx);
1719 vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid,
1720 VQ_NOTRESPLOCK, 1);
1721 } else
1722 mtx_unlock(&nmp->nm_mtx);
1723 }
1724