xref: /freebsd/sys/netinet/tcp_usrreq.c (revision 4b2eaea43fec8e8792be611dea204071a10b655a)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34  * $FreeBSD$
35  */
36 
37 #include "opt_ipsec.h"
38 #include "opt_inet6.h"
39 #include "opt_tcpdebug.h"
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
45 #include <sys/sysctl.h>
46 #include <sys/mbuf.h>
47 #ifdef INET6
48 #include <sys/domain.h>
49 #endif /* INET6 */
50 #include <sys/socket.h>
51 #include <sys/socketvar.h>
52 #include <sys/protosw.h>
53 #include <sys/proc.h>
54 #include <sys/jail.h>
55 
56 #include <net/if.h>
57 #include <net/route.h>
58 
59 #include <netinet/in.h>
60 #include <netinet/in_systm.h>
61 #ifdef INET6
62 #include <netinet/ip6.h>
63 #endif
64 #include <netinet/in_pcb.h>
65 #ifdef INET6
66 #include <netinet6/in6_pcb.h>
67 #endif
68 #include <netinet/in_var.h>
69 #include <netinet/ip_var.h>
70 #ifdef INET6
71 #include <netinet6/ip6_var.h>
72 #endif
73 #include <netinet/tcp.h>
74 #include <netinet/tcp_fsm.h>
75 #include <netinet/tcp_seq.h>
76 #include <netinet/tcp_timer.h>
77 #include <netinet/tcp_var.h>
78 #include <netinet/tcpip.h>
79 #ifdef TCPDEBUG
80 #include <netinet/tcp_debug.h>
81 #endif
82 
83 #ifdef IPSEC
84 #include <netinet6/ipsec.h>
85 #endif /*IPSEC*/
86 
/*
 * TCP protocol interface to socket abstraction.
 */
extern	char *tcpstates[];	/* XXX ??? */

/*
 * Prototypes for the file-local helpers, so that the pru_*() entry
 * points can call them ahead of their definitions further down.
 */
static int	tcp_attach(struct socket *so, struct thread *td);
static int	tcp_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#ifdef INET6
static int	tcp6_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#endif /* INET6 */
static struct tcpcb	*tcp_disconnect(struct tcpcb *tp);
static struct tcpcb	*tcp_usrclosed(struct tcpcb *tp);
103 
/*
 * Debug-trace helpers used by the pru_*() entry points.
 *
 * TCPDEBUG0   declares the local "ostate" that holds the pre-request state.
 * TCPDEBUG1() records tp->t_state (or 0 when tp is NULL) into ostate.
 * TCPDEBUG2() calls tcp_trace() for the request when tp is non-NULL and
 *             the socket has SO_DEBUG set.
 *
 * The callers must have "tp" and "so" in scope.  TCPDEBUG1() and
 * TCPDEBUG2() are wrapped in do { } while (0) so that each behaves as a
 * single statement; the bare "if" previously used by TCPDEBUG2() could
 * capture a following "else".  Without TCPDEBUG all three expand to
 * nothing.
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	do {						\
				ostate = tp ? tp->t_state : 0;		\
			} while (0)
#define	TCPDEBUG2(req)	do {						\
				if (tp && (so->so_options & SO_DEBUG))	\
					tcp_trace(TA_USER, ostate, tp,	\
					    0, 0, (req));		\
			} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
114 
115 /*
116  * TCP attaches to socket via pru_attach(), reserving space,
117  * and an internet control block.
118  */
119 static int
120 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
121 {
122 	int s = splnet();
123 	int error;
124 	struct inpcb *inp;
125 	struct tcpcb *tp = 0;
126 	TCPDEBUG0;
127 
128 	INP_INFO_WLOCK(&tcbinfo);
129 	TCPDEBUG1();
130 	inp = sotoinpcb(so);
131 	if (inp) {
132 		error = EISCONN;
133 		goto out;
134 	}
135 
136 	error = tcp_attach(so, td);
137 	if (error)
138 		goto out;
139 
140 	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
141 		so->so_linger = TCP_LINGERTIME;
142 
143 	inp = sotoinpcb(so);
144 	tp = intotcpcb(inp);
145 out:
146 	TCPDEBUG2(PRU_ATTACH);
147 	INP_INFO_WUNLOCK(&tcbinfo);
148 	splx(s);
149 	return error;
150 }
151 
152 /*
153  * pru_detach() detaches the TCP protocol from the socket.
154  * If the protocol state is non-embryonic, then can't
155  * do this directly: have to initiate a pru_disconnect(),
156  * which may finish later; embryonic TCB's can just
157  * be discarded here.
158  */
159 static int
160 tcp_usr_detach(struct socket *so)
161 {
162 	int s = splnet();
163 	int error = 0;
164 	struct inpcb *inp;
165 	struct tcpcb *tp;
166 	TCPDEBUG0;
167 
168 	INP_INFO_WLOCK(&tcbinfo);
169 	inp = sotoinpcb(so);
170 	if (inp == 0) {
171 		INP_INFO_WUNLOCK(&tcbinfo);
172 		splx(s);
173 		return EINVAL;	/* XXX */
174 	}
175 	INP_LOCK(inp);
176 	tp = intotcpcb(inp);
177 	TCPDEBUG1();
178 	tp = tcp_disconnect(tp);
179 
180 	TCPDEBUG2(PRU_DETACH);
181 	if (tp)
182 		INP_UNLOCK(inp);
183 	INP_INFO_WUNLOCK(&tcbinfo);
184 	splx(s);
185 	return error;
186 }
187 
/*
 * Values for the per-function "inirw" constant, selecting how the global
 * tcbinfo lock is used by COMMON_START()/COMMON_END():
 *	INI_NOLOCK	tcbinfo is not locked at all;
 *	INI_READ	read-locked only while the pcb is looked up and locked;
 *	INI_WRITE	write-locked from COMMON_START() until COMMON_END().
 */
#define	INI_NOLOCK	0
#define	INI_READ	1
#define	INI_WRITE	2

/*
 * Shared prologue of the pru_*() handlers.  Expects the locals "so", "s",
 * "inp", "tp" and "inirw" to be in scope.  Looks up and locks the socket's
 * pcb, loads "tp", and returns EINVAL from the enclosing function (after
 * releasing everything taken here and calling splx(s)) when the socket
 * has no pcb.
 */
#define	COMMON_START()							\
	TCPDEBUG0;							\
	do {								\
		if (inirw == INI_WRITE)					\
			INP_INFO_WLOCK(&tcbinfo);			\
		else if (inirw == INI_READ)				\
			INP_INFO_RLOCK(&tcbinfo);			\
		inp = sotoinpcb(so);					\
		if (inp != NULL)					\
			INP_LOCK(inp);					\
		if (inirw == INI_READ)					\
			INP_INFO_RUNLOCK(&tcbinfo);			\
		if (inp == NULL) {					\
			if (inirw == INI_WRITE)				\
				INP_INFO_WUNLOCK(&tcbinfo);		\
			splx(s);					\
			return EINVAL;					\
		}							\
		tp = intotcpcb(inp);					\
		TCPDEBUG1();						\
	} while (0)

/*
 * Shared epilogue of the pru_*() handlers.  Defines the "out" label,
 * traces the request, unlocks the pcb when "tp" is still non-NULL, drops
 * the tcbinfo write lock for INI_WRITE users, and returns "error".  The
 * trailing goto is never reached; it only references the label for
 * handlers that contain no "goto out" of their own.
 */
#define	COMMON_END(req)							\
out:									\
	TCPDEBUG2(req);							\
	do {								\
		if (tp != NULL)						\
			INP_UNLOCK(inp);				\
		if (inirw == INI_WRITE)					\
			INP_INFO_WUNLOCK(&tcbinfo);			\
		splx(s);						\
		return error;						\
		goto out;						\
	} while (0)
226 
227 /*
228  * Give the socket an address.
229  */
230 static int
231 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
232 {
233 	int s = splnet();
234 	int error = 0;
235 	struct inpcb *inp;
236 	struct tcpcb *tp;
237 	struct sockaddr_in *sinp;
238 	const int inirw = INI_READ;
239 
240 	COMMON_START();
241 
242 	/*
243 	 * Must check for multicast addresses and disallow binding
244 	 * to them.
245 	 */
246 	sinp = (struct sockaddr_in *)nam;
247 	if (sinp->sin_family == AF_INET &&
248 	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
249 		error = EAFNOSUPPORT;
250 		goto out;
251 	}
252 	error = in_pcbbind(inp, nam, td);
253 	if (error)
254 		goto out;
255 	COMMON_END(PRU_BIND);
256 }
257 
#ifdef INET6
/*
 * Give the socket an address (pru_bind() for IPv6).
 *
 * Returns EINVAL when the socket has no pcb or "nam" does not have the
 * length of a struct sockaddr_in6, and EAFNOSUPPORT for an AF_INET6
 * multicast address.  Unless the pcb is marked IN6P_IPV6_V6ONLY, the
 * unspecified address leaves the pcb usable for both families and a
 * V4-mapped address is converted and bound through in_pcbbind().
 * Everything else is bound through in6_pcbbind().
 *
 * The tcbinfo write lock is held across the bind calls, matching the
 * connect path in this file.  With INI_READ, COMMON_START() releases the
 * tcbinfo lock before they run.
 */
static int
tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int s = splnet();
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6p;
	const int inirw = INI_WRITE;

	COMMON_START();

	/*
	 * Check the length before looking at any sockaddr_in6 field
	 * through the cast below, and before touching inp_vflag.
	 */
	sin6p = (struct sockaddr_in6 *)nam;
	if (nam->sa_len != sizeof(*sin6p)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Must check for multicast addresses and disallow binding
	 * to them.
	 */
	if (sin6p->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
			inp->inp_vflag |= INP_IPV4;
		else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
			struct sockaddr_in sin;

			/* Bind as a plain IPv4 pcb. */
			in6_sin6_2_sin(&sin, sin6p);
			inp->inp_vflag |= INP_IPV4;
			inp->inp_vflag &= ~INP_IPV6;
			error = in_pcbbind(inp, (struct sockaddr *)&sin, td);
			goto out;
		}
	}
	error = in6_pcbbind(inp, nam, td);
	COMMON_END(PRU_BIND);
}
#endif /* INET6 */
302 
303 /*
304  * Prepare to accept connections.
305  */
306 static int
307 tcp_usr_listen(struct socket *so, struct thread *td)
308 {
309 	int s = splnet();
310 	int error = 0;
311 	struct inpcb *inp;
312 	struct tcpcb *tp;
313 	const int inirw = INI_READ;
314 
315 	COMMON_START();
316 	if (inp->inp_lport == 0)
317 		error = in_pcbbind(inp, (struct sockaddr *)0, td);
318 	if (error == 0)
319 		tp->t_state = TCPS_LISTEN;
320 	COMMON_END(PRU_LISTEN);
321 }
322 
#ifdef INET6
/*
 * Prepare to accept connections (pru_listen() for IPv6).
 *
 * A socket with no local port yet is bound first through in6_pcbbind()
 * with a null address; before that, INP_IPV4 is cleared and then set
 * again unless the pcb is marked IN6P_IPV6_V6ONLY.  On success the
 * connection enters TCPS_LISTEN.
 *
 * The tcbinfo write lock is held across in6_pcbbind(), matching the
 * connect path in this file.  With INI_READ, COMMON_START() releases the
 * tcbinfo lock before in6_pcbbind() runs.
 */
static int
tcp6_usr_listen(struct socket *so, struct thread *td)
{
	int s = splnet();
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	const int inirw = INI_WRITE;

	COMMON_START();
	if (inp->inp_lport == 0) {
		inp->inp_vflag &= ~INP_IPV4;
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
			inp->inp_vflag |= INP_IPV4;
		error = in6_pcbbind(inp, (struct sockaddr *)0, td);
	}
	if (error == 0)
		tp->t_state = TCPS_LISTEN;
	COMMON_END(PRU_LISTEN);
}
#endif /* INET6 */
345 
346 /*
347  * Initiate connection to peer.
348  * Create a template for use in transmissions on this connection.
349  * Enter SYN_SENT state, and mark socket as connecting.
350  * Start keep-alive timer, and seed output sequence space.
351  * Send initial segment on connection.
352  */
353 static int
354 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
355 {
356 	int s = splnet();
357 	int error = 0;
358 	struct inpcb *inp;
359 	struct tcpcb *tp;
360 	struct sockaddr_in *sinp;
361 	const int inirw = INI_WRITE;
362 
363 	COMMON_START();
364 
365 	/*
366 	 * Must disallow TCP ``connections'' to multicast addresses.
367 	 */
368 	sinp = (struct sockaddr_in *)nam;
369 	if (sinp->sin_family == AF_INET
370 	    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
371 		error = EAFNOSUPPORT;
372 		goto out;
373 	}
374 
375 	if (td && jailed(td->td_ucred))
376 		prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
377 
378 	if ((error = tcp_connect(tp, nam, td)) != 0)
379 		goto out;
380 	error = tcp_output(tp);
381 	COMMON_END(PRU_CONNECT);
382 }
383 
#ifdef INET6
/*
 * Initiate connection to peer (pru_connect() for IPv6).
 *
 * A V4-mapped destination is converted to a sockaddr_in and connected
 * through tcp_connect(), unless the pcb is marked IN6P_IPV6_V6ONLY, in
 * which case EINVAL is returned.  Any other destination goes through
 * tcp6_connect().  Returns EINVAL when the socket has no pcb or "nam"
 * does not have the length of a struct sockaddr_in6, and EAFNOSUPPORT for
 * an AF_INET6 multicast destination.
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int s = splnet();
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6p;
	const int inirw = INI_WRITE;

	COMMON_START();

	/*
	 * Check the length before looking at any sockaddr_in6 field
	 * through the cast below.
	 */
	sin6p = (struct sockaddr_in6 *)nam;
	if (nam->sa_len != sizeof(*sin6p)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Must disallow TCP ``connections'' to multicast addresses.
	 */
	if (sin6p->sin6_family == AF_INET6
	    && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
		struct sockaddr_in sin;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
			error = EINVAL;
			goto out;
		}

		/* Connect as a plain IPv4 pcb. */
		in6_sin6_2_sin(&sin, sin6p);
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
			goto out;
		error = tcp_output(tp);
		goto out;
	}
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	inp->inp_inc.inc_isipv6 = 1;
	if ((error = tcp6_connect(tp, nam, td)) != 0)
		goto out;
	error = tcp_output(tp);
	COMMON_END(PRU_CONNECT);
}
#endif /* INET6 */
432 
433 /*
434  * Initiate disconnect from peer.
435  * If connection never passed embryonic stage, just drop;
436  * else if don't need to let data drain, then can just drop anyways,
437  * else have to begin TCP shutdown process: mark socket disconnecting,
438  * drain unread data, state switch to reflect user close, and
439  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
440  * when peer sends FIN and acks ours.
441  *
442  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
443  */
444 static int
445 tcp_usr_disconnect(struct socket *so)
446 {
447 	int s = splnet();
448 	int error = 0;
449 	struct inpcb *inp;
450 	struct tcpcb *tp;
451 	const int inirw = INI_WRITE;
452 
453 	COMMON_START();
454 	tp = tcp_disconnect(tp);
455 	COMMON_END(PRU_DISCONNECT);
456 }
457 
458 /*
459  * Accept a connection.  Essentially all the work is
460  * done at higher levels; just return the address
461  * of the peer, storing through addr.
462  */
463 static int
464 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
465 {
466 	int s;
467 	int error = 0;
468 	struct inpcb *inp = NULL;
469 	struct tcpcb *tp = NULL;
470 	struct in_addr addr;
471 	in_port_t port = 0;
472 	TCPDEBUG0;
473 
474 	if (so->so_state & SS_ISDISCONNECTED) {
475 		error = ECONNABORTED;
476 		goto out;
477 	}
478 
479 	s = splnet();
480 	INP_INFO_RLOCK(&tcbinfo);
481 	inp = sotoinpcb(so);
482 	if (!inp) {
483 		INP_INFO_RUNLOCK(&tcbinfo);
484 		splx(s);
485 		return (EINVAL);
486 	}
487 	INP_LOCK(inp);
488 	INP_INFO_RUNLOCK(&tcbinfo);
489 	tp = intotcpcb(inp);
490 	TCPDEBUG1();
491 
492 	/*
493 	 * We inline in_setpeeraddr and COMMON_END here, so that we can
494 	 * copy the data of interest and defer the malloc until after we
495 	 * release the lock.
496 	 */
497 	port = inp->inp_fport;
498 	addr = inp->inp_faddr;
499 
500 out:	TCPDEBUG2(PRU_ACCEPT);
501 	if (tp)
502 		INP_UNLOCK(inp);
503 	splx(s);
504 	if (error == 0)
505 		*nam = in_sockaddr(port, &addr);
506 	return error;
507 }
508 
#ifdef INET6
/*
 * Accept a connection (pru_accept() for IPv6).  Returns the peer's
 * address by storing a newly allocated sockaddr through "nam": a
 * V4-mapped one when the pcb has INP_IPV4 set, a native IPv6 one
 * otherwise.
 *
 * Returns ECONNABORTED when the socket is already marked
 * SS_ISDISCONNECTED, EINVAL when it has no pcb, 0 otherwise.
 *
 * The ECONNABORTED case returns before splnet() is raised and before any
 * lock is taken, so there is nothing to undo on that path.  It used to
 * jump to the common exit and call splx() on a variable that had not
 * been set yet.
 */
static int
tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
{
	int s;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct in_addr addr;
	struct in6_addr addr6;
	in_port_t port;
	int v4 = 0;
	TCPDEBUG0;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	inp = sotoinpcb(so);
	if (inp == NULL) {
		INP_INFO_RUNLOCK(&tcbinfo);
		splx(s);
		return (EINVAL);
	}
	INP_LOCK(inp);
	INP_INFO_RUNLOCK(&tcbinfo);
	tp = intotcpcb(inp);
	TCPDEBUG1();

	/*
	 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
	 * copy the data of interest and defer the malloc until after we
	 * release the lock.
	 */
	port = inp->inp_fport;
	if (inp->inp_vflag & INP_IPV4) {
		v4 = 1;
		addr = inp->inp_faddr;
	} else {
		addr6 = inp->in6p_faddr;
	}

	TCPDEBUG2(PRU_ACCEPT);
	/* The pcb was locked unconditionally above, so always unlock it. */
	INP_UNLOCK(inp);
	splx(s);
	if (v4)
		*nam = in6_v4mapsin6_sockaddr(port, &addr);
	else
		*nam = in6_sockaddr(port, &addr6);
	return (0);
}
#endif /* INET6 */
567 
568 /*
569  * This is the wrapper function for in_setsockaddr. We just pass down
570  * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking
571  * here because in_setsockaddr will call malloc and can block.
572  */
573 static int
574 tcp_sockaddr(struct socket *so, struct sockaddr **nam)
575 {
576 	return (in_setsockaddr(so, nam, &tcbinfo));
577 }
578 
579 /*
580  * This is the wrapper function for in_setpeeraddr. We just pass down
581  * the pcbinfo for in_setpeeraddr to lock.
582  */
583 static int
584 tcp_peeraddr(struct socket *so, struct sockaddr **nam)
585 {
586 	return (in_setpeeraddr(so, nam, &tcbinfo));
587 }
588 
589 /*
590  * Mark the connection as being incapable of further output.
591  */
592 static int
593 tcp_usr_shutdown(struct socket *so)
594 {
595 	int s = splnet();
596 	int error = 0;
597 	struct inpcb *inp;
598 	struct tcpcb *tp;
599 	const int inirw = INI_WRITE;
600 
601 	COMMON_START();
602 	socantsendmore(so);
603 	tp = tcp_usrclosed(tp);
604 	if (tp)
605 		error = tcp_output(tp);
606 	COMMON_END(PRU_SHUTDOWN);
607 }
608 
609 /*
610  * After a receive, possibly send window update to peer.
611  */
612 static int
613 tcp_usr_rcvd(struct socket *so, int flags)
614 {
615 	int s = splnet();
616 	int error = 0;
617 	struct inpcb *inp;
618 	struct tcpcb *tp;
619 	const int inirw = INI_READ;
620 
621 	COMMON_START();
622 	tcp_output(tp);
623 	COMMON_END(PRU_RCVD);
624 }
625 
626 /*
627  * Do a send by putting data in output queue and updating urgent
628  * marker if URG set.  Possibly send more data.  Unlike the other
629  * pru_*() routines, the mbuf chains are our responsibility.  We
630  * must either enqueue them or free them.  The other pru_* routines
631  * generally are caller-frees.
632  */
633 static int
634 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
635 	     struct sockaddr *nam, struct mbuf *control, struct thread *td)
636 {
637 	int s = splnet();
638 	int error = 0;
639 	struct inpcb *inp;
640 	struct tcpcb *tp;
641 	const int inirw = INI_WRITE;
642 #ifdef INET6
643 	int isipv6;
644 #endif
645 	TCPDEBUG0;
646 
647 	/*
648 	 * Need write lock here because this function might call
649 	 * tcp_connect or tcp_usrclosed.
650 	 * We really want to have to this function upgrade from read lock
651 	 * to write lock.  XXX
652 	 */
653 	INP_INFO_WLOCK(&tcbinfo);
654 	inp = sotoinpcb(so);
655 	if (inp == NULL) {
656 		/*
657 		 * OOPS! we lost a race, the TCP session got reset after
658 		 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
659 		 * network interrupt in the non-splnet() section of sosend().
660 		 */
661 		if (m)
662 			m_freem(m);
663 		if (control)
664 			m_freem(control);
665 		error = ECONNRESET;	/* XXX EPIPE? */
666 		tp = NULL;
667 		TCPDEBUG1();
668 		goto out;
669 	}
670 	INP_LOCK(inp);
671 #ifdef INET6
672 	isipv6 = nam && nam->sa_family == AF_INET6;
673 #endif /* INET6 */
674 	tp = intotcpcb(inp);
675 	TCPDEBUG1();
676 	if (control) {
677 		/* TCP doesn't do control messages (rights, creds, etc) */
678 		if (control->m_len) {
679 			m_freem(control);
680 			if (m)
681 				m_freem(m);
682 			error = EINVAL;
683 			goto out;
684 		}
685 		m_freem(control);	/* empty control, just free it */
686 	}
687 	if (!(flags & PRUS_OOB)) {
688 		sbappend(&so->so_snd, m);
689 		if (nam && tp->t_state < TCPS_SYN_SENT) {
690 			/*
691 			 * Do implied connect if not yet connected,
692 			 * initialize window to default value, and
693 			 * initialize maxseg/maxopd using peer's cached
694 			 * MSS.
695 			 */
696 #ifdef INET6
697 			if (isipv6)
698 				error = tcp6_connect(tp, nam, td);
699 			else
700 #endif /* INET6 */
701 			error = tcp_connect(tp, nam, td);
702 			if (error)
703 				goto out;
704 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
705 			tcp_mss(tp, -1);
706 		}
707 
708 		if (flags & PRUS_EOF) {
709 			/*
710 			 * Close the send side of the connection after
711 			 * the data is sent.
712 			 */
713 			socantsendmore(so);
714 			tp = tcp_usrclosed(tp);
715 		}
716 		if (tp != NULL) {
717 			if (flags & PRUS_MORETOCOME)
718 				tp->t_flags |= TF_MORETOCOME;
719 			error = tcp_output(tp);
720 			if (flags & PRUS_MORETOCOME)
721 				tp->t_flags &= ~TF_MORETOCOME;
722 		}
723 	} else {
724 		if (sbspace(&so->so_snd) < -512) {
725 			m_freem(m);
726 			error = ENOBUFS;
727 			goto out;
728 		}
729 		/*
730 		 * According to RFC961 (Assigned Protocols),
731 		 * the urgent pointer points to the last octet
732 		 * of urgent data.  We continue, however,
733 		 * to consider it to indicate the first octet
734 		 * of data past the urgent section.
735 		 * Otherwise, snd_up should be one lower.
736 		 */
737 		sbappend(&so->so_snd, m);
738 		if (nam && tp->t_state < TCPS_SYN_SENT) {
739 			/*
740 			 * Do implied connect if not yet connected,
741 			 * initialize window to default value, and
742 			 * initialize maxseg/maxopd using peer's cached
743 			 * MSS.
744 			 */
745 #ifdef INET6
746 			if (isipv6)
747 				error = tcp6_connect(tp, nam, td);
748 			else
749 #endif /* INET6 */
750 			error = tcp_connect(tp, nam, td);
751 			if (error)
752 				goto out;
753 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
754 			tcp_mss(tp, -1);
755 		}
756 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
757 		tp->t_force = 1;
758 		error = tcp_output(tp);
759 		tp->t_force = 0;
760 	}
761 	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
762 		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
763 }
764 
765 /*
766  * Abort the TCP.
767  */
768 static int
769 tcp_usr_abort(struct socket *so)
770 {
771 	int s = splnet();
772 	int error = 0;
773 	struct inpcb *inp;
774 	struct tcpcb *tp;
775 	const int inirw = INI_WRITE;
776 
777 	COMMON_START();
778 	tp = tcp_drop(tp, ECONNABORTED);
779 	COMMON_END(PRU_ABORT);
780 }
781 
782 /*
783  * Receive out-of-band data.
784  */
785 static int
786 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
787 {
788 	int s = splnet();
789 	int error = 0;
790 	struct inpcb *inp;
791 	struct tcpcb *tp;
792 	const int inirw = INI_READ;
793 
794 	COMMON_START();
795 	if ((so->so_oobmark == 0 &&
796 	     (so->so_state & SS_RCVATMARK) == 0) ||
797 	    so->so_options & SO_OOBINLINE ||
798 	    tp->t_oobflags & TCPOOB_HADDATA) {
799 		error = EINVAL;
800 		goto out;
801 	}
802 	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
803 		error = EWOULDBLOCK;
804 		goto out;
805 	}
806 	m->m_len = 1;
807 	*mtod(m, caddr_t) = tp->t_iobc;
808 	if ((flags & MSG_PEEK) == 0)
809 		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
810 	COMMON_END(PRU_RCVOOB);
811 }
812 
813 /* xxx - should be const */
814 struct pr_usrreqs tcp_usrreqs = {
815 	tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
816 	tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
817 	tcp_usr_disconnect, tcp_usr_listen, tcp_peeraddr, tcp_usr_rcvd,
818 	tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
819 	tcp_sockaddr, sosend, soreceive, sopoll
820 };
821 
#ifdef INET6
/*
 * User-request switch for TCP over IPv6.  The initializers are
 * positional, so their order must not change.  The labels on the right
 * name the request each handler serves (presumably the matching slot of
 * struct pr_usrreqs; verify against its declaration).
 */
struct pr_usrreqs tcp6_usrreqs = {
	tcp_usr_abort,		/* abort */
	tcp6_usr_accept,	/* accept */
	tcp_usr_attach,		/* attach */
	tcp6_usr_bind,		/* bind */
	tcp6_usr_connect,	/* connect */
	pru_connect2_notsupp,	/* connect2 */
	in6_control,		/* control */
	tcp_usr_detach,		/* detach */
	tcp_usr_disconnect,	/* disconnect */
	tcp6_usr_listen,	/* listen */
	in6_mapped_peeraddr,	/* peeraddr */
	tcp_usr_rcvd,		/* rcvd */
	tcp_usr_rcvoob,		/* rcvoob */
	tcp_usr_send,		/* send */
	pru_sense_null,		/* sense */
	tcp_usr_shutdown,	/* shutdown */
	in6_mapped_sockaddr,	/* sockaddr */
	sosend,			/* sosend */
	soreceive,		/* soreceive */
	sopoll			/* sopoll */
};
#endif /* INET6 */
831 
832 /*
833  * Common subroutine to open a TCP connection to remote host specified
834  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
835  * port number if needed.  Call in_pcbconnect_setup to do the routing and
836  * to choose a local host address (interface).  If there is an existing
837  * incarnation of the same connection in TIME-WAIT state and if the remote
838  * host was sending CC options and if the connection duration was < MSL, then
839  * truncate the previous TIME-WAIT state and proceed.
840  * Initialize connection parameters and enter SYN-SENT state.
841  */
842 static int
843 tcp_connect(tp, nam, td)
844 	register struct tcpcb *tp;
845 	struct sockaddr *nam;
846 	struct thread *td;
847 {
848 	struct inpcb *inp = tp->t_inpcb, *oinp;
849 	struct socket *so = inp->inp_socket;
850 	struct tcpcb *otp;
851 	struct rmxp_tao *taop;
852 	struct rmxp_tao tao_noncached;
853 	struct in_addr laddr;
854 	u_short lport;
855 	int error;
856 
857 	if (inp->inp_lport == 0) {
858 		error = in_pcbbind(inp, (struct sockaddr *)0, td);
859 		if (error)
860 			return error;
861 	}
862 
863 	/*
864 	 * Cannot simply call in_pcbconnect, because there might be an
865 	 * earlier incarnation of this same connection still in
866 	 * TIME_WAIT state, creating an ADDRINUSE error.
867 	 */
868 	laddr = inp->inp_laddr;
869 	lport = inp->inp_lport;
870 	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
871 	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td);
872 	if (error && oinp == NULL)
873 		return error;
874 	if (oinp) {
875 		if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
876 		otp->t_state == TCPS_TIME_WAIT &&
877 		    (ticks - otp->t_starttime) < tcp_msl &&
878 		    (otp->t_flags & TF_RCVD_CC)) {
879 			inp->inp_faddr = oinp->inp_faddr;
880 			inp->inp_fport = oinp->inp_fport;
881 			otp = tcp_close(otp);
882 		} else
883 			return EADDRINUSE;
884 	}
885 	inp->inp_laddr = laddr;
886 	in_pcbrehash(inp);
887 
888 	/* Compute window scaling to request.  */
889 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
890 	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
891 		tp->request_r_scale++;
892 
893 	soisconnecting(so);
894 	tcpstat.tcps_connattempt++;
895 	tp->t_state = TCPS_SYN_SENT;
896 	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
897 	tp->iss = tcp_new_isn(tp);
898 	tp->t_bw_rtseq = tp->iss;
899 	tcp_sendseqinit(tp);
900 
901 	/*
902 	 * Generate a CC value for this connection and
903 	 * check whether CC or CCnew should be used.
904 	 */
905 	if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
906 		taop = &tao_noncached;
907 		bzero(taop, sizeof(*taop));
908 	}
909 
910 	tp->cc_send = CC_INC(tcp_ccgen);
911 	if (taop->tao_ccsent != 0 &&
912 	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
913 		taop->tao_ccsent = tp->cc_send;
914 	} else {
915 		taop->tao_ccsent = 0;
916 		tp->t_flags |= TF_SENDCCNEW;
917 	}
918 
919 	return 0;
920 }
921 
#ifdef INET6
/*
 * IPv6 counterpart of tcp_connect(): open a TCP connection to the remote
 * host given by the struct sockaddr_in6 that "nam" points to.
 *
 * in6_pcbbind() assigns a local port when the pcb has none and
 * in6_pcbladdr() chooses the local address.  An existing pcb for the
 * same connection makes the attempt fail with EADDRINUSE unless it is a
 * different pcb in TIME-WAIT that lasted less than tcp_msl and had
 * received CC options, in which case it is closed.  The pcb addresses are
 * then filled in and the connection enters SYN-SENT exactly as in the
 * IPv4 case.
 *
 * Returns 0 on success or an errno value.
 */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct inpcb *oinp;
	struct tcpcb *otp;
	struct in6_addr *addr6;
	struct in6_addr *laddr6;
	struct rmxp_tao *taop;
	struct rmxp_tao tao_noncached;
	int error;

	if (inp->inp_lport == 0 &&
	    (error = in6_pcbbind(inp, (struct sockaddr *)0, td)) != 0)
		return error;

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 */
	error = in6_pcbladdr(inp, nam, &addr6);
	if (error)
		return error;

	/* Look up with the address the connection would actually use. */
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		laddr6 = addr6;
	else
		laddr6 = &inp->in6p_laddr;
	oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
	    &sin6->sin6_addr, sin6->sin6_port,
	    laddr6, inp->inp_lport, 0, NULL);
	if (oinp != NULL) {
		/* Only a short-lived TIME_WAIT incarnation may be recycled. */
		if (oinp == inp || (otp = intotcpcb(oinp)) == NULL ||
		    otp->t_state != TCPS_TIME_WAIT ||
		    (ticks - otp->t_starttime) >= tcp_msl ||
		    (otp->t_flags & TF_RCVD_CC) == 0)
			return EADDRINUSE;
		(void)tcp_close(otp);
	}
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		inp->in6p_laddr = *addr6;
	inp->in6p_faddr = sin6->sin6_addr;
	inp->inp_fport = sin6->sin6_port;
	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
		inp->in6p_flowinfo = sin6->sin6_flowinfo;
	in_pcbrehash(inp);

	/* Compute window scaling to request.  */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
		tp->request_r_scale++;

	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
	tp->iss = tcp_new_isn(tp);
	tp->t_bw_rtseq = tp->iss;
	tcp_sendseqinit(tp);

	/*
	 * Generate a CC value for this connection and
	 * check whether CC or CCnew should be used.
	 */
	taop = tcp_gettaocache(&tp->t_inpcb->inp_inc);
	if (taop == NULL) {
		/* No cache entry: work on a zeroed scratch copy instead. */
		taop = &tao_noncached;
		bzero(taop, sizeof(*taop));
	}

	tp->cc_send = CC_INC(tcp_ccgen);
	if (taop->tao_ccsent != 0 &&
	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
		taop->tao_ccsent = tp->cc_send;
	} else {
		taop->tao_ccsent = 0;
		tp->t_flags |= TF_SENDCCNEW;
	}

	return 0;
}
#endif /* INET6 */
1009 
1010 /*
1011  * The new sockopt interface makes it possible for us to block in the
1012  * copyin/out step (if we take a page fault).  Taking a page fault at
1013  * splnet() is probably a Bad Thing.  (Since sockets and pcbs both now
1014  * use TSM, there probably isn't any need for this function to run at
1015  * splnet() any more.  This needs more examination.)
1016  */
1017 int
1018 tcp_ctloutput(so, sopt)
1019 	struct socket *so;
1020 	struct sockopt *sopt;
1021 {
1022 	int	error, opt, optval, s;
1023 	struct	inpcb *inp;
1024 	struct	tcpcb *tp;
1025 
1026 	error = 0;
1027 	s = splnet();		/* XXX */
1028 	INP_INFO_RLOCK(&tcbinfo);
1029 	inp = sotoinpcb(so);
1030 	if (inp == NULL) {
1031 		INP_INFO_RUNLOCK(&tcbinfo);
1032 		splx(s);
1033 		return (ECONNRESET);
1034 	}
1035 	INP_LOCK(inp);
1036 	INP_INFO_RUNLOCK(&tcbinfo);
1037 	if (sopt->sopt_level != IPPROTO_TCP) {
1038 #ifdef INET6
1039 		if (INP_CHECK_SOCKAF(so, AF_INET6))
1040 			error = ip6_ctloutput(so, sopt);
1041 		else
1042 #endif /* INET6 */
1043 		error = ip_ctloutput(so, sopt);
1044 		INP_UNLOCK(inp);
1045 		splx(s);
1046 		return (error);
1047 	}
1048 	tp = intotcpcb(inp);
1049 
1050 	switch (sopt->sopt_dir) {
1051 	case SOPT_SET:
1052 		switch (sopt->sopt_name) {
1053 		case TCP_NODELAY:
1054 		case TCP_NOOPT:
1055 			error = sooptcopyin(sopt, &optval, sizeof optval,
1056 					    sizeof optval);
1057 			if (error)
1058 				break;
1059 
1060 			switch (sopt->sopt_name) {
1061 			case TCP_NODELAY:
1062 				opt = TF_NODELAY;
1063 				break;
1064 			case TCP_NOOPT:
1065 				opt = TF_NOOPT;
1066 				break;
1067 			default:
1068 				opt = 0; /* dead code to fool gcc */
1069 				break;
1070 			}
1071 
1072 			if (optval)
1073 				tp->t_flags |= opt;
1074 			else
1075 				tp->t_flags &= ~opt;
1076 			break;
1077 
1078 		case TCP_NOPUSH:
1079 			error = sooptcopyin(sopt, &optval, sizeof optval,
1080 					    sizeof optval);
1081 			if (error)
1082 				break;
1083 
1084 			if (optval)
1085 				tp->t_flags |= TF_NOPUSH;
1086 			else {
1087 				tp->t_flags &= ~TF_NOPUSH;
1088 				error = tcp_output(tp);
1089 			}
1090 			break;
1091 
1092 		case TCP_MAXSEG:
1093 			error = sooptcopyin(sopt, &optval, sizeof optval,
1094 					    sizeof optval);
1095 			if (error)
1096 				break;
1097 
1098 			if (optval > 0 && optval <= tp->t_maxseg)
1099 				tp->t_maxseg = optval;
1100 			else
1101 				error = EINVAL;
1102 			break;
1103 
1104 		default:
1105 			error = ENOPROTOOPT;
1106 			break;
1107 		}
1108 		break;
1109 
1110 	case SOPT_GET:
1111 		switch (sopt->sopt_name) {
1112 		case TCP_NODELAY:
1113 			optval = tp->t_flags & TF_NODELAY;
1114 			break;
1115 		case TCP_MAXSEG:
1116 			optval = tp->t_maxseg;
1117 			break;
1118 		case TCP_NOOPT:
1119 			optval = tp->t_flags & TF_NOOPT;
1120 			break;
1121 		case TCP_NOPUSH:
1122 			optval = tp->t_flags & TF_NOPUSH;
1123 			break;
1124 		default:
1125 			error = ENOPROTOOPT;
1126 			break;
1127 		}
1128 		if (error == 0)
1129 			error = sooptcopyout(sopt, &optval, sizeof optval);
1130 		break;
1131 	}
1132 	INP_UNLOCK(inp);
1133 	splx(s);
1134 	return (error);
1135 }
1136 
1137 /*
1138  * tcp_sendspace and tcp_recvspace are the default send and receive window
1139  * sizes, respectively.  These are obsolescent (this information should
1140  * be set by the route).
1141  */
1142 u_long	tcp_sendspace = 1024*32;
1143 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
1144     &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
1145 u_long	tcp_recvspace = 1024*64;
1146 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
1147     &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
1148 
1149 /*
1150  * Attach TCP protocol to socket, allocating
1151  * internet protocol control block, tcp control block,
1152  * bufer space, and entering LISTEN state if to accept connections.
1153  */
1154 static int
1155 tcp_attach(so, td)
1156 	struct socket *so;
1157 	struct thread *td;
1158 {
1159 	register struct tcpcb *tp;
1160 	struct inpcb *inp;
1161 	int error;
1162 #ifdef INET6
1163 	int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
1164 #endif
1165 
1166 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1167 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
1168 		if (error)
1169 			return (error);
1170 	}
1171 	error = in_pcballoc(so, &tcbinfo, td);
1172 	if (error)
1173 		return (error);
1174 	inp = sotoinpcb(so);
1175 #ifdef INET6
1176 	if (isipv6) {
1177 		inp->inp_vflag |= INP_IPV6;
1178 		inp->in6p_hops = -1;	/* use kernel default */
1179 	}
1180 	else
1181 #endif
1182 	inp->inp_vflag |= INP_IPV4;
1183 	tp = tcp_newtcpcb(inp);
1184 	if (tp == 0) {
1185 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
1186 
1187 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
1188 #ifdef INET6
1189 		if (isipv6)
1190 			in6_pcbdetach(inp);
1191 		else
1192 #endif
1193 		in_pcbdetach(inp);
1194 		so->so_state |= nofd;
1195 		return (ENOBUFS);
1196 	}
1197 	tp->t_state = TCPS_CLOSED;
1198 	return (0);
1199 }
1200 
1201 /*
1202  * Initiate (or continue) disconnect.
1203  * If embryonic state, just send reset (once).
1204  * If in ``let data drain'' option and linger null, just drop.
1205  * Otherwise (hard), mark socket disconnecting and drop
1206  * current input data; switch states based on user close, and
1207  * send segment to peer (with FIN).
1208  */
1209 static struct tcpcb *
1210 tcp_disconnect(tp)
1211 	register struct tcpcb *tp;
1212 {
1213 	struct socket *so = tp->t_inpcb->inp_socket;
1214 
1215 	if (tp->t_state < TCPS_ESTABLISHED)
1216 		tp = tcp_close(tp);
1217 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1218 		tp = tcp_drop(tp, 0);
1219 	else {
1220 		soisdisconnecting(so);
1221 		sbflush(&so->so_rcv);
1222 		tp = tcp_usrclosed(tp);
1223 		if (tp)
1224 			(void) tcp_output(tp);
1225 	}
1226 	return (tp);
1227 }
1228 
1229 /*
1230  * User issued close, and wish to trail through shutdown states:
1231  * if never received SYN, just forget it.  If got a SYN from peer,
1232  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1233  * If already got a FIN from peer, then almost done; go to LAST_ACK
1234  * state.  In all other cases, have already sent FIN to peer (e.g.
1235  * after PRU_SHUTDOWN), and just have to play tedious game waiting
1236  * for peer to send FIN or not respond to keep-alives, etc.
1237  * We can let the user exit from the close as soon as the FIN is acked.
1238  */
1239 static struct tcpcb *
1240 tcp_usrclosed(tp)
1241 	register struct tcpcb *tp;
1242 {
1243 
1244 	switch (tp->t_state) {
1245 
1246 	case TCPS_CLOSED:
1247 	case TCPS_LISTEN:
1248 		tp->t_state = TCPS_CLOSED;
1249 		tp = tcp_close(tp);
1250 		break;
1251 
1252 	case TCPS_SYN_SENT:
1253 	case TCPS_SYN_RECEIVED:
1254 		tp->t_flags |= TF_NEEDFIN;
1255 		break;
1256 
1257 	case TCPS_ESTABLISHED:
1258 		tp->t_state = TCPS_FIN_WAIT_1;
1259 		break;
1260 
1261 	case TCPS_CLOSE_WAIT:
1262 		tp->t_state = TCPS_LAST_ACK;
1263 		break;
1264 	}
1265 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1266 		soisdisconnected(tp->t_inpcb->inp_socket);
1267 		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
1268 		if (tp->t_state == TCPS_FIN_WAIT_2)
1269 			callout_reset(tp->tt_2msl, tcp_maxidle,
1270 				      tcp_timer_2msl, tp);
1271 	}
1272 	return (tp);
1273 }
1274 
1275