xref: /freebsd/sys/netinet/tcp_usrreq.c (revision b52b9d56d4e96089873a75f9e29062eec19fabba)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34  * $FreeBSD$
35  */
36 
37 #include "opt_ipsec.h"
38 #include "opt_inet6.h"
39 #include "opt_tcpdebug.h"
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
45 #include <sys/sysctl.h>
46 #include <sys/mbuf.h>
47 #ifdef INET6
48 #include <sys/domain.h>
49 #endif /* INET6 */
50 #include <sys/socket.h>
51 #include <sys/socketvar.h>
52 #include <sys/protosw.h>
53 #include <sys/proc.h>
54 #include <sys/jail.h>
55 
56 #include <net/if.h>
57 #include <net/route.h>
58 
59 #include <netinet/in.h>
60 #include <netinet/in_systm.h>
61 #ifdef INET6
62 #include <netinet/ip6.h>
63 #endif
64 #include <netinet/in_pcb.h>
65 #ifdef INET6
66 #include <netinet6/in6_pcb.h>
67 #endif
68 #include <netinet/in_var.h>
69 #include <netinet/ip_var.h>
70 #ifdef INET6
71 #include <netinet6/ip6_var.h>
72 #endif
73 #include <netinet/tcp.h>
74 #include <netinet/tcp_fsm.h>
75 #include <netinet/tcp_seq.h>
76 #include <netinet/tcp_timer.h>
77 #include <netinet/tcp_var.h>
78 #include <netinet/tcpip.h>
79 #ifdef TCPDEBUG
80 #include <netinet/tcp_debug.h>
81 #endif
82 
83 #ifdef IPSEC
84 #include <netinet6/ipsec.h>
85 #endif /*IPSEC*/
86 
/*
 * TCP protocol interface to socket abstraction.
 */
extern	char *tcpstates[];	/* XXX ??? */

/*
 * Helpers private to this file.  They are defined after the pru_*()
 * entry points that call them, hence the forward declarations.
 */
static int	tcp_attach(struct socket *so, struct thread *td);
static int	tcp_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#ifdef INET6
static int	tcp6_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#endif /* INET6 */
static struct tcpcb *tcp_disconnect(struct tcpcb *tp);
static struct tcpcb *tcp_usrclosed(struct tcpcb *tp);
103 
/*
 * Tracing hooks for the pru_*() entry points.
 *
 *   TCPDEBUG0	   declares the local `ostate' (must sit with the other
 *		   declarations of the function).
 *   TCPDEBUG1()   records the current state of `tp' in `ostate', or 0
 *		   when `tp' is NULL.
 *   TCPDEBUG2(req) calls tcp_trace() for request `req' when `tp' is
 *		   non-NULL and the socket has SO_DEBUG set.
 *
 * The macros use the locals `tp', `so' and `ostate' of the expanding
 * function.  Without TCPDEBUG they all expand to nothing.
 *
 * TCPDEBUG2 is wrapped in do { } while (0) so that it behaves as a single
 * statement and cannot capture a following `else'.
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	do {						\
		if (tp && (so->so_options & SO_DEBUG))			\
			tcp_trace(TA_USER, ostate, tp, 0, 0, req);	\
	} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
114 
115 /*
116  * TCP attaches to socket via pru_attach(), reserving space,
117  * and an internet control block.
118  */
119 static int
120 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
121 {
122 	int s = splnet();
123 	int error;
124 	struct inpcb *inp;
125 	struct tcpcb *tp = 0;
126 	TCPDEBUG0;
127 
128 	INP_INFO_WLOCK(&tcbinfo);
129 	TCPDEBUG1();
130 	inp = sotoinpcb(so);
131 	if (inp) {
132 		error = EISCONN;
133 		goto out;
134 	}
135 
136 	error = tcp_attach(so, td);
137 	if (error)
138 		goto out;
139 
140 	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
141 		so->so_linger = TCP_LINGERTIME;
142 
143 	inp = sotoinpcb(so);
144 	tp = intotcpcb(inp);
145 out:
146 	TCPDEBUG2(PRU_ATTACH);
147 	INP_INFO_WUNLOCK(&tcbinfo);
148 	splx(s);
149 	return error;
150 }
151 
152 /*
153  * pru_detach() detaches the TCP protocol from the socket.
154  * If the protocol state is non-embryonic, then can't
155  * do this directly: have to initiate a pru_disconnect(),
156  * which may finish later; embryonic TCB's can just
157  * be discarded here.
158  */
159 static int
160 tcp_usr_detach(struct socket *so)
161 {
162 	int s = splnet();
163 	int error = 0;
164 	struct inpcb *inp;
165 	struct tcpcb *tp;
166 	TCPDEBUG0;
167 
168 	INP_INFO_WLOCK(&tcbinfo);
169 	inp = sotoinpcb(so);
170 	if (inp == 0) {
171 		INP_INFO_WUNLOCK(&tcbinfo);
172 		splx(s);
173 		return EINVAL;	/* XXX */
174 	}
175 	INP_LOCK(inp);
176 	tp = intotcpcb(inp);
177 	TCPDEBUG1();
178 	tp = tcp_disconnect(tp);
179 
180 	TCPDEBUG2(PRU_DETACH);
181 	if (tp)
182 		INP_UNLOCK(inp);
183 	INP_INFO_WUNLOCK(&tcbinfo);
184 	splx(s);
185 	return error;
186 }
187 
/*
 * Values for the per-function constant `inirw', which tells
 * COMMON_START() and COMMON_END() how to treat the tcbinfo lock.
 */
#define	INI_NOLOCK	0	/* tcbinfo is not locked at all */
#define	INI_READ	1	/* read lock, dropped once the inpcb is locked */
#define	INI_WRITE	2	/* write lock, held until COMMON_END() */

/*
 * Shared prologue of the pru_*() handlers.  It uses the locals `so', `s',
 * `inp', `tp' and `inirw' of the expanding function.
 *
 * Takes the tcbinfo lock as selected by `inirw', looks up the inpcb of
 * `so' and locks it, then loads `tp'.  When the socket has no inpcb the
 * macro undoes the tcbinfo lock, calls splx(s) and makes the enclosing
 * function return EINVAL.
 */
#define	COMMON_START()						\
	TCPDEBUG0;						\
	do {							\
		if (inirw == INI_READ)				\
			INP_INFO_RLOCK(&tcbinfo);		\
		else if (inirw == INI_WRITE)			\
			INP_INFO_WLOCK(&tcbinfo);		\
		inp = sotoinpcb(so);				\
		if (inp == NULL) {				\
			if (inirw == INI_READ)			\
				INP_INFO_RUNLOCK(&tcbinfo);	\
			else if (inirw == INI_WRITE)		\
				INP_INFO_WUNLOCK(&tcbinfo);	\
			splx(s);				\
			return EINVAL;				\
		}						\
		INP_LOCK(inp);					\
		if (inirw == INI_READ)				\
			INP_INFO_RUNLOCK(&tcbinfo);		\
		tp = intotcpcb(inp);				\
		TCPDEBUG1();					\
	} while (0)

/*
 * Shared epilogue of the pru_*() handlers; it also supplies the `out'
 * label that error paths jump to.  The inpcb is unlocked only while `tp'
 * is still non-NULL, the tcbinfo write lock is released for INI_WRITE
 * users, and `error' is returned after splx(s).
 *
 * The `goto out' after the return can never execute; presumably it exists
 * only so that the label counts as used in handlers that never jump to it.
 */
#define	COMMON_END(req)						\
out:	TCPDEBUG2(req);						\
	do {							\
		if (tp != NULL)					\
			INP_UNLOCK(inp);			\
		if (inirw == INI_WRITE)				\
			INP_INFO_WUNLOCK(&tcbinfo);		\
		splx(s);					\
		return error;					\
		goto out;					\
	} while (0)
226 
227 /*
228  * Give the socket an address.
229  */
230 static int
231 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
232 {
233 	int s = splnet();
234 	int error = 0;
235 	struct inpcb *inp;
236 	struct tcpcb *tp;
237 	struct sockaddr_in *sinp;
238 	const int inirw = INI_READ;
239 
240 	COMMON_START();
241 
242 	/*
243 	 * Must check for multicast addresses and disallow binding
244 	 * to them.
245 	 */
246 	sinp = (struct sockaddr_in *)nam;
247 	if (sinp->sin_family == AF_INET &&
248 	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
249 		error = EAFNOSUPPORT;
250 		goto out;
251 	}
252 	error = in_pcbbind(inp, nam, td);
253 	if (error)
254 		goto out;
255 	COMMON_END(PRU_BIND);
256 }
257 
#ifdef INET6
/*
 * IPv6 flavour of pru_bind().
 *
 * An AF_INET6 multicast address is refused with EAFNOSUPPORT.  The pcb is
 * first marked IPv6-only.  Unless IN6P_IPV6_V6ONLY is set, the wildcard
 * address additionally enables IPv4, and a v4-mapped address turns the
 * pcb into an IPv4 one that is bound through in_pcbbind().  Every other
 * address goes through in6_pcbbind().
 */
static int
tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int s = splnet();
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)nam;
	const int inirw = INI_READ;

	COMMON_START();

	/* Binding to a multicast address makes no sense for TCP. */
	if (sin6p->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	inp->inp_vflag = (inp->inp_vflag & ~INP_IPV4) | INP_IPV6;
	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) {
			inp->inp_vflag |= INP_IPV4;
		} else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
			struct sockaddr_in sin;

			/* Bind as a plain IPv4 socket. */
			in6_sin6_2_sin(&sin, sin6p);
			inp->inp_vflag = (inp->inp_vflag | INP_IPV4) &
			    ~INP_IPV6;
			error = in_pcbbind(inp, (struct sockaddr *)&sin, td);
			goto out;
		}
	}
	error = in6_pcbbind(inp, nam, td);

	COMMON_END(PRU_BIND);
}
#endif /* INET6 */
302 
303 /*
304  * Prepare to accept connections.
305  */
306 static int
307 tcp_usr_listen(struct socket *so, struct thread *td)
308 {
309 	int s = splnet();
310 	int error = 0;
311 	struct inpcb *inp;
312 	struct tcpcb *tp;
313 	const int inirw = INI_READ;
314 
315 	COMMON_START();
316 	if (inp->inp_lport == 0)
317 		error = in_pcbbind(inp, (struct sockaddr *)0, td);
318 	if (error == 0)
319 		tp->t_state = TCPS_LISTEN;
320 	COMMON_END(PRU_LISTEN);
321 }
322 
#ifdef INET6
/*
 * IPv6 flavour of pru_listen().
 *
 * A socket without a local port is bound through in6_pcbbind() with no
 * address; before that, INP_IPV4 is set in inp_vflag unless
 * IN6P_IPV6_V6ONLY is set, in which case it is cleared.  The TCB enters
 * LISTEN unless the bind failed.
 */
static int
tcp6_usr_listen(struct socket *so, struct thread *td)
{
	int s = splnet();
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	const int inirw = INI_READ;

	COMMON_START();
	if (inp->inp_lport == 0) {
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
			inp->inp_vflag &= ~INP_IPV4;
		else
			inp->inp_vflag |= INP_IPV4;
		error = in6_pcbbind(inp, (struct sockaddr *)0, td);
	}
	if (!error)
		tp->t_state = TCPS_LISTEN;
	COMMON_END(PRU_LISTEN);
}
#endif /* INET6 */
345 
346 /*
347  * Initiate connection to peer.
348  * Create a template for use in transmissions on this connection.
349  * Enter SYN_SENT state, and mark socket as connecting.
350  * Start keep-alive timer, and seed output sequence space.
351  * Send initial segment on connection.
352  */
353 static int
354 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
355 {
356 	int s = splnet();
357 	int error = 0;
358 	struct inpcb *inp;
359 	struct tcpcb *tp;
360 	struct sockaddr_in *sinp;
361 	const int inirw = INI_WRITE;
362 
363 	COMMON_START();
364 
365 	/*
366 	 * Must disallow TCP ``connections'' to multicast addresses.
367 	 */
368 	sinp = (struct sockaddr_in *)nam;
369 	if (sinp->sin_family == AF_INET
370 	    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
371 		error = EAFNOSUPPORT;
372 		goto out;
373 	}
374 
375 	if (td && jailed(td->td_ucred))
376 		prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
377 
378 	if ((error = tcp_connect(tp, nam, td)) != 0)
379 		goto out;
380 	error = tcp_output(tp);
381 	COMMON_END(PRU_CONNECT);
382 }
383 
#ifdef INET6
/*
 * IPv6 flavour of pru_connect().
 *
 * AF_INET6 multicast destinations are refused with EAFNOSUPPORT.  A
 * v4-mapped destination is converted to a sockaddr_in and connected
 * through tcp_connect() after the pcb has been switched to IPv4; this is
 * refused with EINVAL when the socket has IN6P_IPV6_V6ONLY set.  All
 * other destinations mark the pcb IPv6 and go through tcp6_connect().
 * After a successful connect, tcp_output() sends the initial segment.
 *
 * Every exit after COMMON_START() must pass through the `out' label of
 * COMMON_END(): that is where the inpcb lock, the tcbinfo write lock and
 * the spl level taken on entry are released.
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int s = splnet();
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6p;
	const int inirw = INI_WRITE;

	COMMON_START();

	/*
	 * Must disallow TCP ``connections'' to multicast addresses.
	 */
	sin6p = (struct sockaddr_in6 *)nam;
	if (sin6p->sin6_family == AF_INET6
	    && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
		struct sockaddr_in sin;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
			/*
			 * Do not return directly: the locks taken by
			 * COMMON_START() would never be dropped.
			 */
			error = EINVAL;
			goto out;
		}

		in6_sin6_2_sin(&sin, sin6p);
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
			goto out;
		error = tcp_output(tp);
		goto out;
	}
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	inp->inp_inc.inc_isipv6 = 1;
	if ((error = tcp6_connect(tp, nam, td)) != 0)
		goto out;
	error = tcp_output(tp);
	COMMON_END(PRU_CONNECT);
}
#endif /* INET6 */
430 
431 /*
432  * Initiate disconnect from peer.
433  * If connection never passed embryonic stage, just drop;
434  * else if don't need to let data drain, then can just drop anyways,
435  * else have to begin TCP shutdown process: mark socket disconnecting,
436  * drain unread data, state switch to reflect user close, and
437  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
438  * when peer sends FIN and acks ours.
439  *
440  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
441  */
442 static int
443 tcp_usr_disconnect(struct socket *so)
444 {
445 	int s = splnet();
446 	int error = 0;
447 	struct inpcb *inp;
448 	struct tcpcb *tp;
449 	const int inirw = INI_WRITE;
450 
451 	COMMON_START();
452 	tp = tcp_disconnect(tp);
453 	COMMON_END(PRU_DISCONNECT);
454 }
455 
456 /*
457  * Accept a connection.  Essentially all the work is
458  * done at higher levels; just return the address
459  * of the peer, storing through addr.
460  */
461 static int
462 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
463 {
464 	int s;
465 	int error = 0;
466 	struct inpcb *inp = NULL;
467 	struct tcpcb *tp = NULL;
468 	struct sockaddr_in *sin;
469 	const int inirw = INI_READ;
470 	TCPDEBUG0;
471 
472 	if (so->so_state & SS_ISDISCONNECTED) {
473 		error = ECONNABORTED;
474 		goto out;
475 	}
476 
477 	/*
478 	 * Do the malloc first in case it blocks.
479 	 */
480 	MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
481 		M_WAITOK | M_ZERO);
482 	sin->sin_family = AF_INET;
483 	sin->sin_len = sizeof(*sin);
484 
485 	s = splnet();
486 	INP_INFO_RLOCK(&tcbinfo);
487 	inp = sotoinpcb(so);
488 	if (!inp) {
489 		INP_INFO_RUNLOCK(&tcbinfo);
490 		splx(s);
491 		free(sin, M_SONAME);
492 		return (EINVAL);
493 	}
494 	INP_LOCK(inp);
495 	INP_INFO_RUNLOCK(&tcbinfo);
496 	tp = intotcpcb(inp);
497 	TCPDEBUG1();
498 
499 	/*
500 	 * We inline in_setpeeraddr here, because we have already done
501 	 * the locking and the malloc.
502 	 */
503 	sin->sin_port = inp->inp_fport;
504 	sin->sin_addr = inp->inp_faddr;
505 	*nam = (struct sockaddr *)sin;
506 
507 	COMMON_END(PRU_ACCEPT);
508 }
509 
#ifdef INET6
/*
 * IPv6 flavour of pru_accept(): the peer address is produced by
 * in6_mapped_peeraddr() while the inpcb is locked.
 *
 * Returns ECONNABORTED when the socket is already marked
 * SS_ISDISCONNECTED and EINVAL when the socket has no inpcb.  The return
 * value of in6_mapped_peeraddr() is not examined.
 */
static int
tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
{
	int s;
	struct inpcb *inp = NULL;
	int error = 0;
	struct tcpcb *tp = NULL;
	const int inirw = INI_READ;
	TCPDEBUG0;

	/*
	 * Nothing is locked yet and `s' has not been set, so return
	 * directly instead of running COMMON_END(), which would call
	 * splx() on an uninitialized value.
	 */
	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	inp = sotoinpcb(so);
	if (inp == 0) {
		INP_INFO_RUNLOCK(&tcbinfo);
		splx(s);
		return (EINVAL);
	}
	INP_LOCK(inp);
	INP_INFO_RUNLOCK(&tcbinfo);
	tp = intotcpcb(inp);
	TCPDEBUG1();
	in6_mapped_peeraddr(so, nam);
	COMMON_END(PRU_ACCEPT);
}
#endif /* INET6 */
542 
543 /*
544  * This is the wrapper function for in_setsockaddr. We just pass down
545  * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking
546  * here because in_setsockaddr will call malloc and can block.
547  */
548 static int
549 tcp_sockaddr(struct socket *so, struct sockaddr **nam)
550 {
551 	return (in_setsockaddr(so, nam, &tcbinfo));
552 }
553 
554 /*
555  * This is the wrapper function for in_setpeeraddr. We just pass down
556  * the pcbinfo for in_setpeeraddr to lock.
557  */
558 static int
559 tcp_peeraddr(struct socket *so, struct sockaddr **nam)
560 {
561 	return (in_setpeeraddr(so, nam, &tcbinfo));
562 }
563 
564 /*
565  * Mark the connection as being incapable of further output.
566  */
567 static int
568 tcp_usr_shutdown(struct socket *so)
569 {
570 	int s = splnet();
571 	int error = 0;
572 	struct inpcb *inp;
573 	struct tcpcb *tp;
574 	const int inirw = INI_WRITE;
575 
576 	COMMON_START();
577 	socantsendmore(so);
578 	tp = tcp_usrclosed(tp);
579 	if (tp)
580 		error = tcp_output(tp);
581 	COMMON_END(PRU_SHUTDOWN);
582 }
583 
584 /*
585  * After a receive, possibly send window update to peer.
586  */
587 static int
588 tcp_usr_rcvd(struct socket *so, int flags)
589 {
590 	int s = splnet();
591 	int error = 0;
592 	struct inpcb *inp;
593 	struct tcpcb *tp;
594 	const int inirw = INI_READ;
595 
596 	COMMON_START();
597 	tcp_output(tp);
598 	COMMON_END(PRU_RCVD);
599 }
600 
601 /*
602  * Do a send by putting data in output queue and updating urgent
603  * marker if URG set.  Possibly send more data.  Unlike the other
604  * pru_*() routines, the mbuf chains are our responsibility.  We
605  * must either enqueue them or free them.  The other pru_* routines
606  * generally are caller-frees.
607  */
608 static int
609 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
610 	     struct sockaddr *nam, struct mbuf *control, struct thread *td)
611 {
612 	int s = splnet();
613 	int error = 0;
614 	struct inpcb *inp;
615 	struct tcpcb *tp;
616 	const int inirw = INI_WRITE;
617 #ifdef INET6
618 	int isipv6;
619 #endif
620 	TCPDEBUG0;
621 
622 	/*
623 	 * Need write lock here because this function might call
624 	 * tcp_connect or tcp_usrclosed.
625 	 * We really want to have to this function upgrade from read lock
626 	 * to write lock.  XXX
627 	 */
628 	INP_INFO_WLOCK(&tcbinfo);
629 	inp = sotoinpcb(so);
630 	if (inp == NULL) {
631 		/*
632 		 * OOPS! we lost a race, the TCP session got reset after
633 		 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
634 		 * network interrupt in the non-splnet() section of sosend().
635 		 */
636 		if (m)
637 			m_freem(m);
638 		if (control)
639 			m_freem(control);
640 		error = ECONNRESET;	/* XXX EPIPE? */
641 		tp = NULL;
642 		TCPDEBUG1();
643 		goto out;
644 	}
645 	INP_LOCK(inp);
646 #ifdef INET6
647 	isipv6 = nam && nam->sa_family == AF_INET6;
648 #endif /* INET6 */
649 	tp = intotcpcb(inp);
650 	TCPDEBUG1();
651 	if (control) {
652 		/* TCP doesn't do control messages (rights, creds, etc) */
653 		if (control->m_len) {
654 			m_freem(control);
655 			if (m)
656 				m_freem(m);
657 			error = EINVAL;
658 			goto out;
659 		}
660 		m_freem(control);	/* empty control, just free it */
661 	}
662 	if (!(flags & PRUS_OOB)) {
663 		sbappend(&so->so_snd, m);
664 		if (nam && tp->t_state < TCPS_SYN_SENT) {
665 			/*
666 			 * Do implied connect if not yet connected,
667 			 * initialize window to default value, and
668 			 * initialize maxseg/maxopd using peer's cached
669 			 * MSS.
670 			 */
671 #ifdef INET6
672 			if (isipv6)
673 				error = tcp6_connect(tp, nam, td);
674 			else
675 #endif /* INET6 */
676 			error = tcp_connect(tp, nam, td);
677 			if (error)
678 				goto out;
679 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
680 			tcp_mss(tp, -1);
681 		}
682 
683 		if (flags & PRUS_EOF) {
684 			/*
685 			 * Close the send side of the connection after
686 			 * the data is sent.
687 			 */
688 			socantsendmore(so);
689 			tp = tcp_usrclosed(tp);
690 		}
691 		if (tp != NULL) {
692 			if (flags & PRUS_MORETOCOME)
693 				tp->t_flags |= TF_MORETOCOME;
694 			error = tcp_output(tp);
695 			if (flags & PRUS_MORETOCOME)
696 				tp->t_flags &= ~TF_MORETOCOME;
697 		}
698 	} else {
699 		if (sbspace(&so->so_snd) < -512) {
700 			m_freem(m);
701 			error = ENOBUFS;
702 			goto out;
703 		}
704 		/*
705 		 * According to RFC961 (Assigned Protocols),
706 		 * the urgent pointer points to the last octet
707 		 * of urgent data.  We continue, however,
708 		 * to consider it to indicate the first octet
709 		 * of data past the urgent section.
710 		 * Otherwise, snd_up should be one lower.
711 		 */
712 		sbappend(&so->so_snd, m);
713 		if (nam && tp->t_state < TCPS_SYN_SENT) {
714 			/*
715 			 * Do implied connect if not yet connected,
716 			 * initialize window to default value, and
717 			 * initialize maxseg/maxopd using peer's cached
718 			 * MSS.
719 			 */
720 #ifdef INET6
721 			if (isipv6)
722 				error = tcp6_connect(tp, nam, td);
723 			else
724 #endif /* INET6 */
725 			error = tcp_connect(tp, nam, td);
726 			if (error)
727 				goto out;
728 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
729 			tcp_mss(tp, -1);
730 		}
731 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
732 		tp->t_force = 1;
733 		error = tcp_output(tp);
734 		tp->t_force = 0;
735 	}
736 	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
737 		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
738 }
739 
740 /*
741  * Abort the TCP.
742  */
743 static int
744 tcp_usr_abort(struct socket *so)
745 {
746 	int s = splnet();
747 	int error = 0;
748 	struct inpcb *inp;
749 	struct tcpcb *tp;
750 	const int inirw = INI_WRITE;
751 
752 	COMMON_START();
753 	tp = tcp_drop(tp, ECONNABORTED);
754 	COMMON_END(PRU_ABORT);
755 }
756 
757 /*
758  * Receive out-of-band data.
759  */
760 static int
761 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
762 {
763 	int s = splnet();
764 	int error = 0;
765 	struct inpcb *inp;
766 	struct tcpcb *tp;
767 	const int inirw = INI_READ;
768 
769 	COMMON_START();
770 	if ((so->so_oobmark == 0 &&
771 	     (so->so_state & SS_RCVATMARK) == 0) ||
772 	    so->so_options & SO_OOBINLINE ||
773 	    tp->t_oobflags & TCPOOB_HADDATA) {
774 		error = EINVAL;
775 		goto out;
776 	}
777 	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
778 		error = EWOULDBLOCK;
779 		goto out;
780 	}
781 	m->m_len = 1;
782 	*mtod(m, caddr_t) = tp->t_iobc;
783 	if ((flags & MSG_PEEK) == 0)
784 		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
785 	COMMON_END(PRU_RCVOOB);
786 }
787 
788 /* xxx - should be const */
789 struct pr_usrreqs tcp_usrreqs = {
790 	tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
791 	tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
792 	tcp_usr_disconnect, tcp_usr_listen, tcp_peeraddr, tcp_usr_rcvd,
793 	tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
794 	tcp_sockaddr, sosend, soreceive, sopoll
795 };
796 
797 #ifdef INET6
798 struct pr_usrreqs tcp6_usrreqs = {
799 	tcp_usr_abort, tcp6_usr_accept, tcp_usr_attach, tcp6_usr_bind,
800 	tcp6_usr_connect, pru_connect2_notsupp, in6_control, tcp_usr_detach,
801 	tcp_usr_disconnect, tcp6_usr_listen, in6_mapped_peeraddr, tcp_usr_rcvd,
802 	tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
803 	in6_mapped_sockaddr, sosend, soreceive, sopoll
804 };
805 #endif /* INET6 */
806 
807 /*
808  * Common subroutine to open a TCP connection to remote host specified
809  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
810  * port number if needed.  Call in_pcbladdr to do the routing and to choose
811  * a local host address (interface).  If there is an existing incarnation
812  * of the same connection in TIME-WAIT state and if the remote host was
813  * sending CC options and if the connection duration was < MSL, then
814  * truncate the previous TIME-WAIT state and proceed.
815  * Initialize connection parameters and enter SYN-SENT state.
816  */
817 static int
818 tcp_connect(tp, nam, td)
819 	register struct tcpcb *tp;
820 	struct sockaddr *nam;
821 	struct thread *td;
822 {
823 	struct inpcb *inp = tp->t_inpcb, *oinp;
824 	struct socket *so = inp->inp_socket;
825 	struct tcpcb *otp;
826 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
827 	struct sockaddr_in *ifaddr;
828 	struct rmxp_tao *taop;
829 	struct rmxp_tao tao_noncached;
830 	int error;
831 
832 	if (inp->inp_lport == 0) {
833 		error = in_pcbbind(inp, (struct sockaddr *)0, td);
834 		if (error)
835 			return error;
836 	}
837 
838 	/*
839 	 * Cannot simply call in_pcbconnect, because there might be an
840 	 * earlier incarnation of this same connection still in
841 	 * TIME_WAIT state, creating an ADDRINUSE error.
842 	 */
843 	error = in_pcbladdr(inp, nam, &ifaddr);
844 	if (error)
845 		return error;
846 	oinp = in_pcblookup_hash(inp->inp_pcbinfo,
847 	    sin->sin_addr, sin->sin_port,
848 	    inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
849 						: ifaddr->sin_addr,
850 	    inp->inp_lport,  0, NULL);
851 	if (oinp) {
852 		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
853 		otp->t_state == TCPS_TIME_WAIT &&
854 		    (ticks - otp->t_starttime) < tcp_msl &&
855 		    (otp->t_flags & TF_RCVD_CC))
856 			otp = tcp_close(otp);
857 		else
858 			return EADDRINUSE;
859 	}
860 	if (inp->inp_laddr.s_addr == INADDR_ANY)
861 		inp->inp_laddr = ifaddr->sin_addr;
862 	inp->inp_faddr = sin->sin_addr;
863 	inp->inp_fport = sin->sin_port;
864 	in_pcbrehash(inp);
865 
866 	/* Compute window scaling to request.  */
867 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
868 	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
869 		tp->request_r_scale++;
870 
871 	soisconnecting(so);
872 	tcpstat.tcps_connattempt++;
873 	tp->t_state = TCPS_SYN_SENT;
874 	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
875 	tp->iss = tcp_new_isn(tp);
876 	tcp_sendseqinit(tp);
877 
878 	/*
879 	 * Generate a CC value for this connection and
880 	 * check whether CC or CCnew should be used.
881 	 */
882 	if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
883 		taop = &tao_noncached;
884 		bzero(taop, sizeof(*taop));
885 	}
886 
887 	tp->cc_send = CC_INC(tcp_ccgen);
888 	if (taop->tao_ccsent != 0 &&
889 	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
890 		taop->tao_ccsent = tp->cc_send;
891 	} else {
892 		taop->tao_ccsent = 0;
893 		tp->t_flags |= TF_SENDCCNEW;
894 	}
895 
896 	return 0;
897 }
898 
#ifdef INET6
/*
 * IPv6 counterpart of tcp_connect(): open a TCP connection to the remote
 * host given by the struct sockaddr_in6 in `nam'.
 *
 * in6_pcbbind() assigns a local port if none is set and in6_pcbladdr()
 * chooses the local address.  An earlier incarnation of the same
 * connection is closed and reused only when it is in TIME_WAIT, has
 * lasted less than tcp_msl and has received CC options; otherwise
 * EADDRINUSE is returned.  A non-zero flow label in `nam' is copied into
 * the pcb.  On success the connection enters SYN_SENT with the keep timer
 * armed and the CC value for the connection is generated.
 *
 * Returns 0 or the error from in6_pcbbind()/in6_pcbladdr().
 */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
	struct inpcb *inp = tp->t_inpcb, *oinp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *otp;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct in6_addr *addr6;
	struct rmxp_tao *taop;
	struct rmxp_tao tao_noncached;
	int error;

	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, (struct sockaddr *)0, td);
		if (error)
			return error;
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 */
	error = in6_pcbladdr(inp, nam, &addr6);
	if (error)
		return error;
	oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
				  &sin6->sin6_addr, sin6->sin6_port,
				  IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
				  ? addr6
				  : &inp->in6p_laddr,
				  inp->inp_lport,  0, NULL);
	if (oinp) {
		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
		    otp->t_state == TCPS_TIME_WAIT &&
		    (ticks - otp->t_starttime) < tcp_msl &&
		    (otp->t_flags & TF_RCVD_CC))
			otp = tcp_close(otp);
		else
			return EADDRINUSE;
	}
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		inp->in6p_laddr = *addr6;
	inp->in6p_faddr = sin6->sin6_addr;
	inp->inp_fport = sin6->sin6_port;
	/* The masked flow label is an integer: test it against 0, not NULL. */
	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
		inp->in6p_flowinfo = sin6->sin6_flowinfo;
	in_pcbrehash(inp);

	/* Compute window scaling to request.  */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
		tp->request_r_scale++;

	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
	tp->iss = tcp_new_isn(tp);
	tcp_sendseqinit(tp);

	/*
	 * Generate a CC value for this connection and
	 * check whether CC or CCnew should be used.
	 * Without a cache entry, a zeroed scratch entry stands in.
	 */
	if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
		taop = &tao_noncached;
		bzero(taop, sizeof(*taop));
	}

	tp->cc_send = CC_INC(tcp_ccgen);
	if (taop->tao_ccsent != 0 &&
	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
		taop->tao_ccsent = tp->cc_send;
	} else {
		taop->tao_ccsent = 0;
		tp->t_flags |= TF_SENDCCNEW;
	}

	return 0;
}
#endif /* INET6 */
985 
986 /*
987  * The new sockopt interface makes it possible for us to block in the
988  * copyin/out step (if we take a page fault).  Taking a page fault at
989  * splnet() is probably a Bad Thing.  (Since sockets and pcbs both now
990  * use TSM, there probably isn't any need for this function to run at
991  * splnet() any more.  This needs more examination.)
992  */
993 int
994 tcp_ctloutput(so, sopt)
995 	struct socket *so;
996 	struct sockopt *sopt;
997 {
998 	int	error, opt, optval, s;
999 	struct	inpcb *inp;
1000 	struct	tcpcb *tp;
1001 
1002 	error = 0;
1003 	s = splnet();		/* XXX */
1004 	INP_INFO_RLOCK(&tcbinfo);
1005 	inp = sotoinpcb(so);
1006 	if (inp == NULL) {
1007 		INP_INFO_RUNLOCK(&tcbinfo);
1008 		splx(s);
1009 		return (ECONNRESET);
1010 	}
1011 	INP_LOCK(inp);
1012 	INP_INFO_RUNLOCK(&tcbinfo);
1013 	if (sopt->sopt_level != IPPROTO_TCP) {
1014 #ifdef INET6
1015 		if (INP_CHECK_SOCKAF(so, AF_INET6))
1016 			error = ip6_ctloutput(so, sopt);
1017 		else
1018 #endif /* INET6 */
1019 		error = ip_ctloutput(so, sopt);
1020 		INP_UNLOCK(inp);
1021 		splx(s);
1022 		return (error);
1023 	}
1024 	tp = intotcpcb(inp);
1025 
1026 	switch (sopt->sopt_dir) {
1027 	case SOPT_SET:
1028 		switch (sopt->sopt_name) {
1029 		case TCP_NODELAY:
1030 		case TCP_NOOPT:
1031 			error = sooptcopyin(sopt, &optval, sizeof optval,
1032 					    sizeof optval);
1033 			if (error)
1034 				break;
1035 
1036 			switch (sopt->sopt_name) {
1037 			case TCP_NODELAY:
1038 				opt = TF_NODELAY;
1039 				break;
1040 			case TCP_NOOPT:
1041 				opt = TF_NOOPT;
1042 				break;
1043 			default:
1044 				opt = 0; /* dead code to fool gcc */
1045 				break;
1046 			}
1047 
1048 			if (optval)
1049 				tp->t_flags |= opt;
1050 			else
1051 				tp->t_flags &= ~opt;
1052 			break;
1053 
1054 		case TCP_NOPUSH:
1055 			error = sooptcopyin(sopt, &optval, sizeof optval,
1056 					    sizeof optval);
1057 			if (error)
1058 				break;
1059 
1060 			if (optval)
1061 				tp->t_flags |= TF_NOPUSH;
1062 			else {
1063 				tp->t_flags &= ~TF_NOPUSH;
1064 				error = tcp_output(tp);
1065 			}
1066 			break;
1067 
1068 		case TCP_MAXSEG:
1069 			error = sooptcopyin(sopt, &optval, sizeof optval,
1070 					    sizeof optval);
1071 			if (error)
1072 				break;
1073 
1074 			if (optval > 0 && optval <= tp->t_maxseg)
1075 				tp->t_maxseg = optval;
1076 			else
1077 				error = EINVAL;
1078 			break;
1079 
1080 		default:
1081 			error = ENOPROTOOPT;
1082 			break;
1083 		}
1084 		break;
1085 
1086 	case SOPT_GET:
1087 		switch (sopt->sopt_name) {
1088 		case TCP_NODELAY:
1089 			optval = tp->t_flags & TF_NODELAY;
1090 			break;
1091 		case TCP_MAXSEG:
1092 			optval = tp->t_maxseg;
1093 			break;
1094 		case TCP_NOOPT:
1095 			optval = tp->t_flags & TF_NOOPT;
1096 			break;
1097 		case TCP_NOPUSH:
1098 			optval = tp->t_flags & TF_NOPUSH;
1099 			break;
1100 		default:
1101 			error = ENOPROTOOPT;
1102 			break;
1103 		}
1104 		if (error == 0)
1105 			error = sooptcopyout(sopt, &optval, sizeof optval);
1106 		break;
1107 	}
1108 	INP_UNLOCK(inp);
1109 	splx(s);
1110 	return (error);
1111 }
1112 
1113 /*
1114  * tcp_sendspace and tcp_recvspace are the default send and receive window
1115  * sizes, respectively.  These are obsolescent (this information should
1116  * be set by the route).
1117  */
1118 u_long	tcp_sendspace = 1024*32;
1119 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
1120     &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
1121 u_long	tcp_recvspace = 1024*64;
1122 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
1123     &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
1124 
1125 /*
1126  * Attach TCP protocol to socket, allocating
1127  * internet protocol control block, tcp control block,
1128  * bufer space, and entering LISTEN state if to accept connections.
1129  */
1130 static int
1131 tcp_attach(so, td)
1132 	struct socket *so;
1133 	struct thread *td;
1134 {
1135 	register struct tcpcb *tp;
1136 	struct inpcb *inp;
1137 	int error;
1138 #ifdef INET6
1139 	int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != NULL;
1140 #endif
1141 
1142 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1143 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
1144 		if (error)
1145 			return (error);
1146 	}
1147 	error = in_pcballoc(so, &tcbinfo, td);
1148 	if (error)
1149 		return (error);
1150 	inp = sotoinpcb(so);
1151 #ifdef INET6
1152 	if (isipv6) {
1153 		inp->inp_vflag |= INP_IPV6;
1154 		inp->in6p_hops = -1;	/* use kernel default */
1155 	}
1156 	else
1157 #endif
1158 	inp->inp_vflag |= INP_IPV4;
1159 	tp = tcp_newtcpcb(inp);
1160 	if (tp == 0) {
1161 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
1162 
1163 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
1164 #ifdef INET6
1165 		if (isipv6)
1166 			in6_pcbdetach(inp);
1167 		else
1168 #endif
1169 		in_pcbdetach(inp);
1170 		so->so_state |= nofd;
1171 		return (ENOBUFS);
1172 	}
1173 	tp->t_state = TCPS_CLOSED;
1174 	return (0);
1175 }
1176 
1177 /*
1178  * Initiate (or continue) disconnect.
1179  * If embryonic state, just send reset (once).
1180  * If in ``let data drain'' option and linger null, just drop.
1181  * Otherwise (hard), mark socket disconnecting and drop
1182  * current input data; switch states based on user close, and
1183  * send segment to peer (with FIN).
1184  */
1185 static struct tcpcb *
1186 tcp_disconnect(tp)
1187 	register struct tcpcb *tp;
1188 {
1189 	struct socket *so = tp->t_inpcb->inp_socket;
1190 
1191 	if (tp->t_state < TCPS_ESTABLISHED)
1192 		tp = tcp_close(tp);
1193 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1194 		tp = tcp_drop(tp, 0);
1195 	else {
1196 		soisdisconnecting(so);
1197 		sbflush(&so->so_rcv);
1198 		tp = tcp_usrclosed(tp);
1199 		if (tp)
1200 			(void) tcp_output(tp);
1201 	}
1202 	return (tp);
1203 }
1204 
1205 /*
1206  * User issued close, and wish to trail through shutdown states:
1207  * if never received SYN, just forget it.  If got a SYN from peer,
1208  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1209  * If already got a FIN from peer, then almost done; go to LAST_ACK
1210  * state.  In all other cases, have already sent FIN to peer (e.g.
1211  * after PRU_SHUTDOWN), and just have to play tedious game waiting
1212  * for peer to send FIN or not respond to keep-alives, etc.
1213  * We can let the user exit from the close as soon as the FIN is acked.
1214  */
1215 static struct tcpcb *
1216 tcp_usrclosed(tp)
1217 	register struct tcpcb *tp;
1218 {
1219 
1220 	switch (tp->t_state) {
1221 
1222 	case TCPS_CLOSED:
1223 	case TCPS_LISTEN:
1224 		tp->t_state = TCPS_CLOSED;
1225 		tp = tcp_close(tp);
1226 		break;
1227 
1228 	case TCPS_SYN_SENT:
1229 	case TCPS_SYN_RECEIVED:
1230 		tp->t_flags |= TF_NEEDFIN;
1231 		break;
1232 
1233 	case TCPS_ESTABLISHED:
1234 		tp->t_state = TCPS_FIN_WAIT_1;
1235 		break;
1236 
1237 	case TCPS_CLOSE_WAIT:
1238 		tp->t_state = TCPS_LAST_ACK;
1239 		break;
1240 	}
1241 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1242 		soisdisconnected(tp->t_inpcb->inp_socket);
1243 		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
1244 		if (tp->t_state == TCPS_FIN_WAIT_2)
1245 			callout_reset(tp->tt_2msl, tcp_maxidle,
1246 				      tcp_timer_2msl, tp);
1247 	}
1248 	return (tp);
1249 }
1250 
1251