xref: /freebsd/sys/netinet/tcp_usrreq.c (revision 729362425c09cf6b362366aabc6fb547eee8035a)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
34  * $FreeBSD$
35  */
36 
37 #include "opt_ipsec.h"
38 #include "opt_inet6.h"
39 #include "opt_tcpdebug.h"
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
45 #include <sys/sysctl.h>
46 #include <sys/mbuf.h>
47 #ifdef INET6
48 #include <sys/domain.h>
49 #endif /* INET6 */
50 #include <sys/socket.h>
51 #include <sys/socketvar.h>
52 #include <sys/protosw.h>
53 #include <sys/proc.h>
54 #include <sys/jail.h>
55 
56 #include <net/if.h>
57 #include <net/route.h>
58 
59 #include <netinet/in.h>
60 #include <netinet/in_systm.h>
61 #ifdef INET6
62 #include <netinet/ip6.h>
63 #endif
64 #include <netinet/in_pcb.h>
65 #ifdef INET6
66 #include <netinet6/in6_pcb.h>
67 #endif
68 #include <netinet/in_var.h>
69 #include <netinet/ip_var.h>
70 #ifdef INET6
71 #include <netinet6/ip6_var.h>
72 #endif
73 #include <netinet/tcp.h>
74 #include <netinet/tcp_fsm.h>
75 #include <netinet/tcp_seq.h>
76 #include <netinet/tcp_timer.h>
77 #include <netinet/tcp_var.h>
78 #include <netinet/tcpip.h>
79 #ifdef TCPDEBUG
80 #include <netinet/tcp_debug.h>
81 #endif
82 
83 #ifdef IPSEC
84 #include <netinet6/ipsec.h>
85 #endif /*IPSEC*/
86 
87 /*
88  * TCP protocol interface to socket abstraction.
89  */
extern	char *tcpstates[];	/* XXX ??? */

/*
 * Forward declarations for file-local routines that the pru_*() entry
 * points reference before their definitions.
 */
static int	tcp_attach(struct socket *so, struct thread *td);
static int	tcp_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#ifdef INET6
static int	tcp6_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#endif /* INET6 */
static struct tcpcb *tcp_disconnect(struct tcpcb *tp);
static struct tcpcb *tcp_usrclosed(struct tcpcb *tp);
103 
/*
 * User-request tracing hooks.
 *
 * TCPDEBUG0	declares the local `ostate' that remembers the previous state.
 * TCPDEBUG1()	records tp->t_state (or 0 when tp is NULL) into `ostate'.
 * TCPDEBUG2(r)	calls tcp_trace() for request `r' when tp is non-NULL and the
 *		socket has SO_DEBUG set.
 *
 * The macros rely on locals named `tp' and `so' in the calling function.
 * TCPDEBUG2() is wrapped in do { } while (0) so that it behaves as a single
 * statement and cannot capture a following `else'.  Without TCPDEBUG all
 * three expand to nothing.
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = (tp ? tp->t_state : 0)
#define	TCPDEBUG2(req)	do {						\
	if (tp && (so->so_options & SO_DEBUG))				\
		tcp_trace(TA_USER, ostate, tp, 0, 0, (req));		\
} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
114 
115 /*
116  * TCP attaches to socket via pru_attach(), reserving space,
117  * and an internet control block.
118  */
119 static int
120 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
121 {
122 	int s = splnet();
123 	int error;
124 	struct inpcb *inp;
125 	struct tcpcb *tp = 0;
126 	TCPDEBUG0;
127 
128 	INP_INFO_WLOCK(&tcbinfo);
129 	TCPDEBUG1();
130 	inp = sotoinpcb(so);
131 	if (inp) {
132 		error = EISCONN;
133 		goto out;
134 	}
135 
136 	error = tcp_attach(so, td);
137 	if (error)
138 		goto out;
139 
140 	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
141 		so->so_linger = TCP_LINGERTIME;
142 
143 	inp = sotoinpcb(so);
144 	tp = intotcpcb(inp);
145 out:
146 	TCPDEBUG2(PRU_ATTACH);
147 	INP_INFO_WUNLOCK(&tcbinfo);
148 	splx(s);
149 	return error;
150 }
151 
152 /*
153  * pru_detach() detaches the TCP protocol from the socket.
154  * If the protocol state is non-embryonic, then can't
155  * do this directly: have to initiate a pru_disconnect(),
156  * which may finish later; embryonic TCB's can just
157  * be discarded here.
158  */
159 static int
160 tcp_usr_detach(struct socket *so)
161 {
162 	int s = splnet();
163 	int error = 0;
164 	struct inpcb *inp;
165 	struct tcpcb *tp;
166 	TCPDEBUG0;
167 
168 	INP_INFO_WLOCK(&tcbinfo);
169 	inp = sotoinpcb(so);
170 	if (inp == 0) {
171 		INP_INFO_WUNLOCK(&tcbinfo);
172 		splx(s);
173 		return EINVAL;	/* XXX */
174 	}
175 	INP_LOCK(inp);
176 	tp = intotcpcb(inp);
177 	TCPDEBUG1();
178 	tp = tcp_disconnect(tp);
179 
180 	TCPDEBUG2(PRU_DETACH);
181 	if (tp)
182 		INP_UNLOCK(inp);
183 	INP_INFO_WUNLOCK(&tcbinfo);
184 	splx(s);
185 	return error;
186 }
187 
/*
 * Values for the function-local constant `inirw', which selects how
 * COMMON_START() and COMMON_END() treat the tcbinfo lock:
 *   INI_NOLOCK	tcbinfo is not locked at all;
 *   INI_READ	tcbinfo is read-locked only while the inpcb is looked up
 *		and locked;
 *   INI_WRITE	tcbinfo is write-locked by COMMON_START() and released by
 *		COMMON_END().
 */
#define	INI_NOLOCK	0
#define	INI_READ	1
#define	INI_WRITE	2

/*
 * Shared prologue for the pru_*() handlers.  The enclosing function must
 * declare `so', `s', `inp', `tp' and `inirw'.  Looks up and locks the
 * socket's inpcb and loads `tp'; if the socket has no inpcb the enclosing
 * function returns EINVAL after dropping tcbinfo and calling splx(s).
 */
#define	COMMON_START()							\
	TCPDEBUG0;							\
	do {								\
		switch (inirw) {					\
		case INI_READ:						\
			INP_INFO_RLOCK(&tcbinfo);			\
			break;						\
		case INI_WRITE:						\
			INP_INFO_WLOCK(&tcbinfo);			\
			break;						\
		default:						\
			break;						\
		}							\
		inp = sotoinpcb(so);					\
		if (inp == NULL) {					\
			switch (inirw) {				\
			case INI_READ:					\
				INP_INFO_RUNLOCK(&tcbinfo);		\
				break;					\
			case INI_WRITE:					\
				INP_INFO_WUNLOCK(&tcbinfo);		\
				break;					\
			default:					\
				break;					\
			}						\
			splx(s);					\
			return EINVAL;					\
		}							\
		INP_LOCK(inp);						\
		if (inirw == INI_READ)					\
			INP_INFO_RUNLOCK(&tcbinfo);			\
		tp = intotcpcb(inp);					\
		TCPDEBUG1();						\
	} while (0)

/*
 * Shared epilogue for the pru_*() handlers.  Defines the `out' label,
 * emits the debug trace for `req', unlocks the inpcb when `tp' is still
 * non-NULL, drops the tcbinfo write lock if COMMON_START() took it, and
 * returns `error' from the enclosing function.  The trailing `goto out'
 * can never execute; presumably it is there so that the label counts as
 * used in handlers that have no jump of their own.
 */
#define	COMMON_END(req)							\
out:	TCPDEBUG2(req);							\
	do {								\
		if (tp != NULL)						\
			INP_UNLOCK(inp);				\
		if (inirw == INI_WRITE)					\
			INP_INFO_WUNLOCK(&tcbinfo);			\
		splx(s);						\
		return error;						\
		goto out;						\
	} while (0)
226 
227 /*
228  * Give the socket an address.
229  */
230 static int
231 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
232 {
233 	int s = splnet();
234 	int error = 0;
235 	struct inpcb *inp;
236 	struct tcpcb *tp;
237 	struct sockaddr_in *sinp;
238 	const int inirw = INI_WRITE;
239 
240 	COMMON_START();
241 
242 	/*
243 	 * Must check for multicast addresses and disallow binding
244 	 * to them.
245 	 */
246 	sinp = (struct sockaddr_in *)nam;
247 	if (sinp->sin_family == AF_INET &&
248 	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
249 		error = EAFNOSUPPORT;
250 		goto out;
251 	}
252 	error = in_pcbbind(inp, nam, td);
253 	if (error)
254 		goto out;
255 	COMMON_END(PRU_BIND);
256 }
257 
#ifdef INET6
/*
 * pru_bind() entry point for IPv6 sockets.
 *
 * An AF_INET6 multicast address is rejected with EAFNOSUPPORT.  The pcb
 * is first marked IPv6-only.  Unless IN6P_IPV6_V6ONLY is set, binding to
 * the unspecified address also enables IPv4, and binding to a v4-mapped
 * address converts the pcb to IPv4 and binds it through in_pcbbind().
 * Every other case is bound with in6_pcbbind().
 */
static int
tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	const int inirw = INI_WRITE;
	struct sockaddr_in6 *sin6;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error = 0;
	int dualstack;
	int s = splnet();

	COMMON_START();

	/* TCP must never be bound to a multicast address. */
	sin6 = (struct sockaddr_in6 *)nam;
	if (sin6->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	dualstack = (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0;
	if (dualstack && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
		/* Wildcard bind: accept IPv4 as well. */
		inp->inp_vflag |= INP_IPV4;
	} else if (dualstack && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
		struct sockaddr_in sin4;

		/* v4-mapped address: bind as a plain IPv4 pcb. */
		in6_sin6_2_sin(&sin4, sin6);
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		error = in_pcbbind(inp, (struct sockaddr *)&sin4, td);
		goto out;
	}
	error = in6_pcbbind(inp, nam, td);
	COMMON_END(PRU_BIND);
}
#endif /* INET6 */
302 
303 /*
304  * Prepare to accept connections.
305  */
306 static int
307 tcp_usr_listen(struct socket *so, struct thread *td)
308 {
309 	int s = splnet();
310 	int error = 0;
311 	struct inpcb *inp;
312 	struct tcpcb *tp;
313 	const int inirw = INI_WRITE;
314 
315 	COMMON_START();
316 	if (inp->inp_lport == 0)
317 		error = in_pcbbind(inp, (struct sockaddr *)0, td);
318 	if (error == 0)
319 		tp->t_state = TCPS_LISTEN;
320 	COMMON_END(PRU_LISTEN);
321 }
322 
#ifdef INET6
/*
 * pru_listen() entry point for IPv6 sockets.
 *
 * If no local port has been assigned yet, the INP_IPV4 flag is set or
 * cleared according to IN6P_IPV6_V6ONLY and in6_pcbbind() is asked to
 * pick a port.  On success the tcpcb moves to TCPS_LISTEN.
 */
static int
tcp6_usr_listen(struct socket *so, struct thread *td)
{
	const int inirw = INI_WRITE;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error = 0;
	int s = splnet();

	COMMON_START();
	if (inp->inp_lport == 0) {
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
			inp->inp_vflag &= ~INP_IPV4;
		else
			inp->inp_vflag |= INP_IPV4;
		error = in6_pcbbind(inp, (struct sockaddr *)0, td);
	}
	if (error == 0)
		tp->t_state = TCPS_LISTEN;
	COMMON_END(PRU_LISTEN);
}
#endif /* INET6 */
345 
346 /*
347  * Initiate connection to peer.
348  * Create a template for use in transmissions on this connection.
349  * Enter SYN_SENT state, and mark socket as connecting.
350  * Start keep-alive timer, and seed output sequence space.
351  * Send initial segment on connection.
352  */
353 static int
354 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
355 {
356 	int s = splnet();
357 	int error = 0;
358 	struct inpcb *inp;
359 	struct tcpcb *tp;
360 	struct sockaddr_in *sinp;
361 	const int inirw = INI_WRITE;
362 
363 	COMMON_START();
364 
365 	/*
366 	 * Must disallow TCP ``connections'' to multicast addresses.
367 	 */
368 	sinp = (struct sockaddr_in *)nam;
369 	if (sinp->sin_family == AF_INET
370 	    && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
371 		error = EAFNOSUPPORT;
372 		goto out;
373 	}
374 
375 	if (td && jailed(td->td_ucred))
376 		prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
377 
378 	if ((error = tcp_connect(tp, nam, td)) != 0)
379 		goto out;
380 	error = tcp_output(tp);
381 	COMMON_END(PRU_CONNECT);
382 }
383 
#ifdef INET6
/*
 * pru_connect() entry point for IPv6 sockets.
 *
 * AF_INET6 multicast destinations are refused with EAFNOSUPPORT.  A
 * v4-mapped destination is refused with EINVAL on an IN6P_IPV6_V6ONLY
 * pcb; otherwise the pcb is switched to IPv4 and connected through
 * tcp_connect().  Any other destination marks the pcb IPv6 and goes
 * through tcp6_connect().  A successful connect is followed by
 * tcp_output().
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	const int inirw = INI_WRITE;
	struct sockaddr_in6 *sin6;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error = 0;
	int s = splnet();

	COMMON_START();

	/* TCP "connections" to multicast addresses are not allowed. */
	sin6 = (struct sockaddr_in6 *)nam;
	if (sin6->sin6_family == AF_INET6 &&
	    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
		struct sockaddr_in sin4;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
			error = EINVAL;
			goto out;
		}

		in6_sin6_2_sin(&sin4, sin6);
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		error = tcp_connect(tp, (struct sockaddr *)&sin4, td);
	} else {
		inp->inp_vflag &= ~INP_IPV4;
		inp->inp_vflag |= INP_IPV6;
		inp->inp_inc.inc_isipv6 = 1;
		error = tcp6_connect(tp, nam, td);
	}
	if (error == 0)
		error = tcp_output(tp);
	COMMON_END(PRU_CONNECT);
}
#endif /* INET6 */
432 
433 /*
434  * Initiate disconnect from peer.
435  * If connection never passed embryonic stage, just drop;
436  * else if don't need to let data drain, then can just drop anyways,
437  * else have to begin TCP shutdown process: mark socket disconnecting,
438  * drain unread data, state switch to reflect user close, and
439  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
440  * when peer sends FIN and acks ours.
441  *
442  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
443  */
444 static int
445 tcp_usr_disconnect(struct socket *so)
446 {
447 	int s = splnet();
448 	int error = 0;
449 	struct inpcb *inp;
450 	struct tcpcb *tp;
451 	const int inirw = INI_WRITE;
452 
453 	COMMON_START();
454 	tp = tcp_disconnect(tp);
455 	COMMON_END(PRU_DISCONNECT);
456 }
457 
458 /*
459  * Accept a connection.  Essentially all the work is
460  * done at higher levels; just return the address
461  * of the peer, storing through addr.
462  */
463 static int
464 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
465 {
466 	int s;
467 	int error = 0;
468 	struct inpcb *inp = NULL;
469 	struct tcpcb *tp = NULL;
470 	struct in_addr addr;
471 	in_port_t port = 0;
472 	TCPDEBUG0;
473 
474 	if (so->so_state & SS_ISDISCONNECTED) {
475 		error = ECONNABORTED;
476 		goto out;
477 	}
478 
479 	s = splnet();
480 	INP_INFO_RLOCK(&tcbinfo);
481 	inp = sotoinpcb(so);
482 	if (!inp) {
483 		INP_INFO_RUNLOCK(&tcbinfo);
484 		splx(s);
485 		return (EINVAL);
486 	}
487 	INP_LOCK(inp);
488 	INP_INFO_RUNLOCK(&tcbinfo);
489 	tp = intotcpcb(inp);
490 	TCPDEBUG1();
491 
492 	/*
493 	 * We inline in_setpeeraddr and COMMON_END here, so that we can
494 	 * copy the data of interest and defer the malloc until after we
495 	 * release the lock.
496 	 */
497 	port = inp->inp_fport;
498 	addr = inp->inp_faddr;
499 
500 out:	TCPDEBUG2(PRU_ACCEPT);
501 	if (tp)
502 		INP_UNLOCK(inp);
503 	splx(s);
504 	if (error == 0)
505 		*nam = in_sockaddr(port, &addr);
506 	return error;
507 }
508 
#ifdef INET6
/*
 * pru_accept() entry point for IPv6 sockets: return the peer address
 * through nam.
 *
 * Returns ECONNABORTED if the socket is already marked
 * SS_ISDISCONNECTED, EINVAL if it has no inpcb, and 0 otherwise.  On
 * success *nam comes from in6_v4mapsin6_sockaddr() when the pcb has
 * INP_IPV4 set and from in6_sockaddr() when it does not.
 *
 * The disconnected case returns before the interrupt priority level is
 * raised.  It used to jump to the common exit and hand an uninitialized
 * `s' to splx(); nothing else on that exit path had any effect because
 * no pcb had been looked up or locked yet.
 */
static int
tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
{
	int s;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct in_addr addr;
	struct in6_addr addr6;
	in_port_t port;
	int v4;
	TCPDEBUG0;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	inp = sotoinpcb(so);
	if (inp == NULL) {
		INP_INFO_RUNLOCK(&tcbinfo);
		splx(s);
		return (EINVAL);
	}
	INP_LOCK(inp);
	INP_INFO_RUNLOCK(&tcbinfo);
	tp = intotcpcb(inp);
	TCPDEBUG1();

	/*
	 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
	 * copy the data of interest and defer the malloc until after we
	 * release the lock.
	 */
	port = inp->inp_fport;
	v4 = (inp->inp_vflag & INP_IPV4) != 0;
	if (v4)
		addr = inp->inp_faddr;
	else
		addr6 = inp->in6p_faddr;

	TCPDEBUG2(PRU_ACCEPT);
	/*
	 * NOTE(review): the inpcb was locked unconditionally above but is
	 * only unlocked when it has a tcpcb, as before -- confirm that a
	 * NULL tcpcb cannot occur here.
	 */
	if (tp != NULL)
		INP_UNLOCK(inp);
	splx(s);
	if (v4)
		*nam = in6_v4mapsin6_sockaddr(port, &addr);
	else
		*nam = in6_sockaddr(port, &addr6);
	return (0);
}
#endif /* INET6 */
567 
568 /*
569  * This is the wrapper function for in_setsockaddr. We just pass down
570  * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking
571  * here because in_setsockaddr will call malloc and can block.
572  */
573 static int
574 tcp_sockaddr(struct socket *so, struct sockaddr **nam)
575 {
576 	return (in_setsockaddr(so, nam, &tcbinfo));
577 }
578 
579 /*
580  * This is the wrapper function for in_setpeeraddr. We just pass down
581  * the pcbinfo for in_setpeeraddr to lock.
582  */
583 static int
584 tcp_peeraddr(struct socket *so, struct sockaddr **nam)
585 {
586 	return (in_setpeeraddr(so, nam, &tcbinfo));
587 }
588 
589 /*
590  * Mark the connection as being incapable of further output.
591  */
592 static int
593 tcp_usr_shutdown(struct socket *so)
594 {
595 	int s = splnet();
596 	int error = 0;
597 	struct inpcb *inp;
598 	struct tcpcb *tp;
599 	const int inirw = INI_WRITE;
600 
601 	COMMON_START();
602 	socantsendmore(so);
603 	tp = tcp_usrclosed(tp);
604 	if (tp)
605 		error = tcp_output(tp);
606 	COMMON_END(PRU_SHUTDOWN);
607 }
608 
609 /*
610  * After a receive, possibly send window update to peer.
611  */
612 static int
613 tcp_usr_rcvd(struct socket *so, int flags)
614 {
615 	int s = splnet();
616 	int error = 0;
617 	struct inpcb *inp;
618 	struct tcpcb *tp;
619 	const int inirw = INI_READ;
620 
621 	COMMON_START();
622 	tcp_output(tp);
623 	COMMON_END(PRU_RCVD);
624 }
625 
626 /*
627  * Do a send by putting data in output queue and updating urgent
628  * marker if URG set.  Possibly send more data.  Unlike the other
629  * pru_*() routines, the mbuf chains are our responsibility.  We
630  * must either enqueue them or free them.  The other pru_* routines
631  * generally are caller-frees.
632  */
633 static int
634 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
635 	     struct sockaddr *nam, struct mbuf *control, struct thread *td)
636 {
637 	int s = splnet();
638 	int error = 0;
639 	struct inpcb *inp;
640 	struct tcpcb *tp;
641 	const int inirw = INI_WRITE;
642 #ifdef INET6
643 	int isipv6;
644 #endif
645 	TCPDEBUG0;
646 
647 	/*
648 	 * Need write lock here because this function might call
649 	 * tcp_connect or tcp_usrclosed.
650 	 * We really want to have to this function upgrade from read lock
651 	 * to write lock.  XXX
652 	 */
653 	INP_INFO_WLOCK(&tcbinfo);
654 	inp = sotoinpcb(so);
655 	if (inp == NULL) {
656 		/*
657 		 * OOPS! we lost a race, the TCP session got reset after
658 		 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
659 		 * network interrupt in the non-splnet() section of sosend().
660 		 */
661 		if (m)
662 			m_freem(m);
663 		if (control)
664 			m_freem(control);
665 		error = ECONNRESET;	/* XXX EPIPE? */
666 		tp = NULL;
667 		TCPDEBUG1();
668 		goto out;
669 	}
670 	INP_LOCK(inp);
671 #ifdef INET6
672 	isipv6 = nam && nam->sa_family == AF_INET6;
673 #endif /* INET6 */
674 	tp = intotcpcb(inp);
675 	TCPDEBUG1();
676 	if (control) {
677 		/* TCP doesn't do control messages (rights, creds, etc) */
678 		if (control->m_len) {
679 			m_freem(control);
680 			if (m)
681 				m_freem(m);
682 			error = EINVAL;
683 			goto out;
684 		}
685 		m_freem(control);	/* empty control, just free it */
686 	}
687 	if (!(flags & PRUS_OOB)) {
688 		sbappend(&so->so_snd, m);
689 		if (nam && tp->t_state < TCPS_SYN_SENT) {
690 			/*
691 			 * Do implied connect if not yet connected,
692 			 * initialize window to default value, and
693 			 * initialize maxseg/maxopd using peer's cached
694 			 * MSS.
695 			 */
696 #ifdef INET6
697 			if (isipv6)
698 				error = tcp6_connect(tp, nam, td);
699 			else
700 #endif /* INET6 */
701 			error = tcp_connect(tp, nam, td);
702 			if (error)
703 				goto out;
704 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
705 			tcp_mss(tp, -1);
706 		}
707 
708 		if (flags & PRUS_EOF) {
709 			/*
710 			 * Close the send side of the connection after
711 			 * the data is sent.
712 			 */
713 			socantsendmore(so);
714 			tp = tcp_usrclosed(tp);
715 		}
716 		if (tp != NULL) {
717 			if (flags & PRUS_MORETOCOME)
718 				tp->t_flags |= TF_MORETOCOME;
719 			error = tcp_output(tp);
720 			if (flags & PRUS_MORETOCOME)
721 				tp->t_flags &= ~TF_MORETOCOME;
722 		}
723 	} else {
724 		if (sbspace(&so->so_snd) < -512) {
725 			m_freem(m);
726 			error = ENOBUFS;
727 			goto out;
728 		}
729 		/*
730 		 * According to RFC961 (Assigned Protocols),
731 		 * the urgent pointer points to the last octet
732 		 * of urgent data.  We continue, however,
733 		 * to consider it to indicate the first octet
734 		 * of data past the urgent section.
735 		 * Otherwise, snd_up should be one lower.
736 		 */
737 		sbappend(&so->so_snd, m);
738 		if (nam && tp->t_state < TCPS_SYN_SENT) {
739 			/*
740 			 * Do implied connect if not yet connected,
741 			 * initialize window to default value, and
742 			 * initialize maxseg/maxopd using peer's cached
743 			 * MSS.
744 			 */
745 #ifdef INET6
746 			if (isipv6)
747 				error = tcp6_connect(tp, nam, td);
748 			else
749 #endif /* INET6 */
750 			error = tcp_connect(tp, nam, td);
751 			if (error)
752 				goto out;
753 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
754 			tcp_mss(tp, -1);
755 		}
756 		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
757 		tp->t_force = 1;
758 		error = tcp_output(tp);
759 		tp->t_force = 0;
760 	}
761 	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
762 		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
763 }
764 
765 /*
766  * Abort the TCP.
767  */
768 static int
769 tcp_usr_abort(struct socket *so)
770 {
771 	int s = splnet();
772 	int error = 0;
773 	struct inpcb *inp;
774 	struct tcpcb *tp;
775 	const int inirw = INI_WRITE;
776 
777 	COMMON_START();
778 	tp = tcp_drop(tp, ECONNABORTED);
779 	COMMON_END(PRU_ABORT);
780 }
781 
782 /*
783  * Receive out-of-band data.
784  */
785 static int
786 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
787 {
788 	int s = splnet();
789 	int error = 0;
790 	struct inpcb *inp;
791 	struct tcpcb *tp;
792 	const int inirw = INI_READ;
793 
794 	COMMON_START();
795 	if ((so->so_oobmark == 0 &&
796 	     (so->so_state & SS_RCVATMARK) == 0) ||
797 	    so->so_options & SO_OOBINLINE ||
798 	    tp->t_oobflags & TCPOOB_HADDATA) {
799 		error = EINVAL;
800 		goto out;
801 	}
802 	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
803 		error = EWOULDBLOCK;
804 		goto out;
805 	}
806 	m->m_len = 1;
807 	*mtod(m, caddr_t) = tp->t_iobc;
808 	if ((flags & MSG_PEEK) == 0)
809 		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
810 	COMMON_END(PRU_RCVOOB);
811 }
812 
813 /* xxx - should be const */
814 struct pr_usrreqs tcp_usrreqs = {
815 	tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
816 	tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
817 	tcp_usr_disconnect, tcp_usr_listen, tcp_peeraddr, tcp_usr_rcvd,
818 	tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
819 	tcp_sockaddr, sosend, soreceive, sopoll
820 };
821 
#ifdef INET6
/*
 * User-request switch for TCP over IPv6.  It reuses the IPv4 handlers
 * except for accept, bind, connect, control, listen, peeraddr and
 * sockaddr.  The initializers are positional, so their order must match
 * the member order of struct pr_usrreqs.
 */
struct pr_usrreqs tcp6_usrreqs = {
	tcp_usr_abort,
	tcp6_usr_accept,
	tcp_usr_attach,
	tcp6_usr_bind,
	tcp6_usr_connect,
	pru_connect2_notsupp,
	in6_control,
	tcp_usr_detach,
	tcp_usr_disconnect,
	tcp6_usr_listen,
	in6_mapped_peeraddr,
	tcp_usr_rcvd,
	tcp_usr_rcvoob,
	tcp_usr_send,
	pru_sense_null,
	tcp_usr_shutdown,
	in6_mapped_sockaddr,
	sosend,
	soreceive,
	sopoll
};
#endif /* INET6 */
831 
832 /*
833  * Common subroutine to open a TCP connection to remote host specified
834  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
835  * port number if needed.  Call in_pcbconnect_setup to do the routing and
836  * to choose a local host address (interface).  If there is an existing
837  * incarnation of the same connection in TIME-WAIT state and if the remote
838  * host was sending CC options and if the connection duration was < MSL, then
839  * truncate the previous TIME-WAIT state and proceed.
840  * Initialize connection parameters and enter SYN-SENT state.
841  */
842 static int
843 tcp_connect(tp, nam, td)
844 	register struct tcpcb *tp;
845 	struct sockaddr *nam;
846 	struct thread *td;
847 {
848 	struct inpcb *inp = tp->t_inpcb, *oinp;
849 	struct socket *so = inp->inp_socket;
850 	struct tcptw *otw;
851 	struct rmxp_tao *taop;
852 	struct rmxp_tao tao_noncached;
853 	struct in_addr laddr;
854 	u_short lport;
855 	int error;
856 
857 	if (inp->inp_lport == 0) {
858 		error = in_pcbbind(inp, (struct sockaddr *)0, td);
859 		if (error)
860 			return error;
861 	}
862 
863 	/*
864 	 * Cannot simply call in_pcbconnect, because there might be an
865 	 * earlier incarnation of this same connection still in
866 	 * TIME_WAIT state, creating an ADDRINUSE error.
867 	 */
868 	laddr = inp->inp_laddr;
869 	lport = inp->inp_lport;
870 	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
871 	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td);
872 	if (error && oinp == NULL)
873 		return error;
874 	if (oinp) {
875 		if (oinp != inp &&
876 		    (oinp->inp_vflag & INP_TIMEWAIT) &&
877 		    (ticks - (otw = intotw(oinp))->t_starttime) < tcp_msl &&
878 		    otw->cc_recv != 0) {
879 			inp->inp_faddr = oinp->inp_faddr;
880 			inp->inp_fport = oinp->inp_fport;
881 			(void) tcp_twclose(otw, 0);
882 		} else
883 			return EADDRINUSE;
884 	}
885 	inp->inp_laddr = laddr;
886 	in_pcbrehash(inp);
887 
888 	/* Compute window scaling to request.  */
889 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
890 	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
891 		tp->request_r_scale++;
892 
893 	soisconnecting(so);
894 	tcpstat.tcps_connattempt++;
895 	tp->t_state = TCPS_SYN_SENT;
896 	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
897 	tp->iss = tcp_new_isn(tp);
898 	tp->t_bw_rtseq = tp->iss;
899 	tcp_sendseqinit(tp);
900 
901 	/*
902 	 * Generate a CC value for this connection and
903 	 * check whether CC or CCnew should be used.
904 	 */
905 	if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
906 		taop = &tao_noncached;
907 		bzero(taop, sizeof(*taop));
908 	}
909 
910 	tp->cc_send = CC_INC(tcp_ccgen);
911 	if (taop->tao_ccsent != 0 &&
912 	    CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
913 		taop->tao_ccsent = tp->cc_send;
914 	} else {
915 		taop->tao_ccsent = 0;
916 		tp->t_flags |= TF_SENDCCNEW;
917 	}
918 
919 	return 0;
920 }
921 
#ifdef INET6
/*
 * IPv6 counterpart of tcp_connect(): open a TCP connection to the remote
 * host given by the struct sockaddr_in6 in nam.  Calls in6_pcbbind() to
 * assign a local port if needed and in6_pcbladdr() to choose a local
 * address, recycles a matching TIME-WAIT incarnation under the same
 * conditions as tcp_connect(), fills in the foreign address, port and
 * (if present) flow info, then initializes the connection parameters
 * and enters SYN-SENT state.
 *
 * Returns 0 on success, the error from in6_pcbbind() or in6_pcbladdr(),
 * or EADDRINUSE when a conflicting pcb cannot be recycled.
 */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct inpcb *oinp;
	struct tcptw *otw;
	struct in6_addr *addr6;
	struct in6_addr *lookup_laddr;
	struct rmxp_tao tao_noncached;
	struct rmxp_tao *taop;
	int error;

	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, (struct sockaddr *)0, td);
		if (error != 0)
			return (error);
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 */
	error = in6_pcbladdr(inp, nam, &addr6);
	if (error != 0)
		return (error);

	/* Look up with the bound local address if set, else the chosen one. */
	lookup_laddr = IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ?
	    addr6 : &inp->in6p_laddr;
	oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
	    &sin6->sin6_addr, sin6->sin6_port,
	    lookup_laddr, inp->inp_lport, 0, NULL);
	if (oinp != NULL) {
		/* Only a different pcb in TIME_WAIT may be recycled. */
		if (oinp == inp || (oinp->inp_vflag & INP_TIMEWAIT) == 0)
			return (EADDRINUSE);
		otw = intotw(oinp);
		if ((ticks - otw->t_starttime) >= tcp_msl ||
		    otw->cc_recv == 0)
			return (EADDRINUSE);
		inp->inp_faddr = oinp->inp_faddr;
		inp->inp_fport = oinp->inp_fport;
		(void)tcp_twclose(otw, 0);
	}
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		inp->in6p_laddr = *addr6;
	inp->in6p_faddr = sin6->sin6_addr;
	inp->inp_fport = sin6->sin6_port;
	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
		inp->in6p_flowinfo = sin6->sin6_flowinfo;
	in_pcbrehash(inp);

	/* Compute window scaling to request.  */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
		tp->request_r_scale++;

	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
	tp->iss = tcp_new_isn(tp);
	tp->t_bw_rtseq = tp->iss;
	tcp_sendseqinit(tp);

	/*
	 * Generate a CC value for this connection and
	 * check whether CC or CCnew should be used.  Without a cache
	 * entry a zeroed scratch entry on the stack stands in for it.
	 */
	taop = tcp_gettaocache(&inp->inp_inc);
	if (taop == NULL) {
		bzero(&tao_noncached, sizeof(tao_noncached));
		taop = &tao_noncached;
	}

	tp->cc_send = CC_INC(tcp_ccgen);
	if (taop->tao_ccsent == 0 ||
	    !CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
		taop->tao_ccsent = 0;
		tp->t_flags |= TF_SENDCCNEW;
	} else {
		taop->tao_ccsent = tp->cc_send;
	}

	return (0);
}
#endif /* INET6 */
1011 
1012 /*
1013  * The new sockopt interface makes it possible for us to block in the
1014  * copyin/out step (if we take a page fault).  Taking a page fault at
1015  * splnet() is probably a Bad Thing.  (Since sockets and pcbs both now
1016  * use TSM, there probably isn't any need for this function to run at
1017  * splnet() any more.  This needs more examination.)
1018  */
1019 int
1020 tcp_ctloutput(so, sopt)
1021 	struct socket *so;
1022 	struct sockopt *sopt;
1023 {
1024 	int	error, opt, optval, s;
1025 	struct	inpcb *inp;
1026 	struct	tcpcb *tp;
1027 
1028 	error = 0;
1029 	s = splnet();		/* XXX */
1030 	INP_INFO_RLOCK(&tcbinfo);
1031 	inp = sotoinpcb(so);
1032 	if (inp == NULL) {
1033 		INP_INFO_RUNLOCK(&tcbinfo);
1034 		splx(s);
1035 		return (ECONNRESET);
1036 	}
1037 	INP_LOCK(inp);
1038 	INP_INFO_RUNLOCK(&tcbinfo);
1039 	if (sopt->sopt_level != IPPROTO_TCP) {
1040 #ifdef INET6
1041 		if (INP_CHECK_SOCKAF(so, AF_INET6))
1042 			error = ip6_ctloutput(so, sopt);
1043 		else
1044 #endif /* INET6 */
1045 		error = ip_ctloutput(so, sopt);
1046 		INP_UNLOCK(inp);
1047 		splx(s);
1048 		return (error);
1049 	}
1050 	tp = intotcpcb(inp);
1051 
1052 	switch (sopt->sopt_dir) {
1053 	case SOPT_SET:
1054 		switch (sopt->sopt_name) {
1055 		case TCP_NODELAY:
1056 		case TCP_NOOPT:
1057 			error = sooptcopyin(sopt, &optval, sizeof optval,
1058 					    sizeof optval);
1059 			if (error)
1060 				break;
1061 
1062 			switch (sopt->sopt_name) {
1063 			case TCP_NODELAY:
1064 				opt = TF_NODELAY;
1065 				break;
1066 			case TCP_NOOPT:
1067 				opt = TF_NOOPT;
1068 				break;
1069 			default:
1070 				opt = 0; /* dead code to fool gcc */
1071 				break;
1072 			}
1073 
1074 			if (optval)
1075 				tp->t_flags |= opt;
1076 			else
1077 				tp->t_flags &= ~opt;
1078 			break;
1079 
1080 		case TCP_NOPUSH:
1081 			error = sooptcopyin(sopt, &optval, sizeof optval,
1082 					    sizeof optval);
1083 			if (error)
1084 				break;
1085 
1086 			if (optval)
1087 				tp->t_flags |= TF_NOPUSH;
1088 			else {
1089 				tp->t_flags &= ~TF_NOPUSH;
1090 				error = tcp_output(tp);
1091 			}
1092 			break;
1093 
1094 		case TCP_MAXSEG:
1095 			error = sooptcopyin(sopt, &optval, sizeof optval,
1096 					    sizeof optval);
1097 			if (error)
1098 				break;
1099 
1100 			if (optval > 0 && optval <= tp->t_maxseg)
1101 				tp->t_maxseg = optval;
1102 			else
1103 				error = EINVAL;
1104 			break;
1105 
1106 		default:
1107 			error = ENOPROTOOPT;
1108 			break;
1109 		}
1110 		break;
1111 
1112 	case SOPT_GET:
1113 		switch (sopt->sopt_name) {
1114 		case TCP_NODELAY:
1115 			optval = tp->t_flags & TF_NODELAY;
1116 			break;
1117 		case TCP_MAXSEG:
1118 			optval = tp->t_maxseg;
1119 			break;
1120 		case TCP_NOOPT:
1121 			optval = tp->t_flags & TF_NOOPT;
1122 			break;
1123 		case TCP_NOPUSH:
1124 			optval = tp->t_flags & TF_NOPUSH;
1125 			break;
1126 		default:
1127 			error = ENOPROTOOPT;
1128 			break;
1129 		}
1130 		if (error == 0)
1131 			error = sooptcopyout(sopt, &optval, sizeof optval);
1132 		break;
1133 	}
1134 	INP_UNLOCK(inp);
1135 	splx(s);
1136 	return (error);
1137 }
1138 
1139 /*
1140  * tcp_sendspace and tcp_recvspace are the default send and receive window
1141  * sizes, respectively.  These are obsolescent (this information should
1142  * be set by the route).
1143  */
1144 u_long	tcp_sendspace = 1024*32;
1145 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
1146     &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
1147 u_long	tcp_recvspace = 1024*64;
1148 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
1149     &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
1150 
1151 /*
1152  * Attach TCP protocol to socket, allocating
1153  * internet protocol control block, tcp control block,
1154  * bufer space, and entering LISTEN state if to accept connections.
1155  */
1156 static int
1157 tcp_attach(so, td)
1158 	struct socket *so;
1159 	struct thread *td;
1160 {
1161 	register struct tcpcb *tp;
1162 	struct inpcb *inp;
1163 	int error;
1164 #ifdef INET6
1165 	int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
1166 #endif
1167 
1168 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1169 		error = soreserve(so, tcp_sendspace, tcp_recvspace);
1170 		if (error)
1171 			return (error);
1172 	}
1173 	error = in_pcballoc(so, &tcbinfo, td);
1174 	if (error)
1175 		return (error);
1176 	inp = sotoinpcb(so);
1177 #ifdef INET6
1178 	if (isipv6) {
1179 		inp->inp_vflag |= INP_IPV6;
1180 		inp->in6p_hops = -1;	/* use kernel default */
1181 	}
1182 	else
1183 #endif
1184 	inp->inp_vflag |= INP_IPV4;
1185 	tp = tcp_newtcpcb(inp);
1186 	if (tp == 0) {
1187 		int nofd = so->so_state & SS_NOFDREF;	/* XXX */
1188 
1189 		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
1190 #ifdef INET6
1191 		if (isipv6)
1192 			in6_pcbdetach(inp);
1193 		else
1194 #endif
1195 		in_pcbdetach(inp);
1196 		so->so_state |= nofd;
1197 		return (ENOBUFS);
1198 	}
1199 	tp->t_state = TCPS_CLOSED;
1200 	return (0);
1201 }
1202 
1203 /*
1204  * Initiate (or continue) disconnect.
1205  * If embryonic state, just send reset (once).
1206  * If in ``let data drain'' option and linger null, just drop.
1207  * Otherwise (hard), mark socket disconnecting and drop
1208  * current input data; switch states based on user close, and
1209  * send segment to peer (with FIN).
1210  */
1211 static struct tcpcb *
1212 tcp_disconnect(tp)
1213 	register struct tcpcb *tp;
1214 {
1215 	struct socket *so = tp->t_inpcb->inp_socket;
1216 
1217 	if (tp->t_state < TCPS_ESTABLISHED)
1218 		tp = tcp_close(tp);
1219 	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1220 		tp = tcp_drop(tp, 0);
1221 	else {
1222 		soisdisconnecting(so);
1223 		sbflush(&so->so_rcv);
1224 		tp = tcp_usrclosed(tp);
1225 		if (tp)
1226 			(void) tcp_output(tp);
1227 	}
1228 	return (tp);
1229 }
1230 
1231 /*
1232  * User issued close, and wish to trail through shutdown states:
1233  * if never received SYN, just forget it.  If got a SYN from peer,
1234  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1235  * If already got a FIN from peer, then almost done; go to LAST_ACK
1236  * state.  In all other cases, have already sent FIN to peer (e.g.
1237  * after PRU_SHUTDOWN), and just have to play tedious game waiting
1238  * for peer to send FIN or not respond to keep-alives, etc.
1239  * We can let the user exit from the close as soon as the FIN is acked.
1240  */
1241 static struct tcpcb *
1242 tcp_usrclosed(tp)
1243 	register struct tcpcb *tp;
1244 {
1245 
1246 	switch (tp->t_state) {
1247 
1248 	case TCPS_CLOSED:
1249 	case TCPS_LISTEN:
1250 		tp->t_state = TCPS_CLOSED;
1251 		tp = tcp_close(tp);
1252 		break;
1253 
1254 	case TCPS_SYN_SENT:
1255 	case TCPS_SYN_RECEIVED:
1256 		tp->t_flags |= TF_NEEDFIN;
1257 		break;
1258 
1259 	case TCPS_ESTABLISHED:
1260 		tp->t_state = TCPS_FIN_WAIT_1;
1261 		break;
1262 
1263 	case TCPS_CLOSE_WAIT:
1264 		tp->t_state = TCPS_LAST_ACK;
1265 		break;
1266 	}
1267 	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1268 		soisdisconnected(tp->t_inpcb->inp_socket);
1269 		/* To prevent the connection hanging in FIN_WAIT_2 forever. */
1270 		if (tp->t_state == TCPS_FIN_WAIT_2)
1271 			callout_reset(tp->tt_2msl, tcp_maxidle,
1272 				      tcp_timer_2msl, tp);
1273 	}
1274 	return (tp);
1275 }
1276 
1277