xref: /freebsd/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c (revision 23833df4831a6f41aa39e952fba524edfb8cec6d)
1 
2 /*-
3  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
4  *      The Regents of the University of California.  All rights reserved.
5  * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
6  * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
33  */
34 
35 /*
36  *
37  * Copyright (c) 2010 Isilon Systems, Inc.
38  * Copyright (c) 2010 iX Systems, Inc.
39  * Copyright (c) 2010 Panasas, Inc.
40  * All rights reserved.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice unmodified, this list of conditions, and the following
47  *    disclaimer.
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
53  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
54  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
55  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
56  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
57  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
61  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62  *
63  */
64 #include <sys/cdefs.h>
65 __FBSDID("$FreeBSD$");
66 
67 #include <sys/param.h>
68 #include <sys/kernel.h>
69 #include <sys/malloc.h>
70 
71 #include "sdp.h"
72 
73 #include <net/if.h>
74 #include <net/route.h>
75 #include <net/vnet.h>
76 #include <sys/sysctl.h>
77 
/* UMA zone that struct sdp_sock is allocated from and freed to. */
uma_zone_t	sdp_zone;
/* Read/write lock guarding sdp_list and sdp_count. */
struct rwlock	sdp_lock;
/* Every allocated sdp_sock is linked here until sdp_pcbfree() removes it. */
LIST_HEAD(, sdp_sock) sdp_list;

/* Not referenced in this part of the file; presumably the receive
 * completion workqueue used by the rx path -- TODO confirm. */
struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
/* Shorthand for acquiring, releasing and asserting sdp_lock. */
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

/* malloc(9) type tag for SDP allocations. */
MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

/* Defined later in the file; needed by sdp_start_disconnect(). */
static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

/* Number of pcbs on sdp_list; modified with sdp_lock write-held. */
static int sdp_count;
108 
109 /*
110  * Disable async. CMA events for sockets which are being torn down.
111  */
112 static void
113 sdp_destroy_cma(struct sdp_sock *ssk)
114 {
115 
116 	if (ssk->id == NULL)
117 		return;
118 	rdma_destroy_id(ssk->id);
119 	ssk->id = NULL;
120 }
121 
122 static int
123 sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
124 {
125 	struct sockaddr_in *sin;
126 	struct sockaddr_in null;
127 	int error;
128 
129 	SDP_WLOCK_ASSERT(ssk);
130 
131 	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
132 		return (EINVAL);
133 	/* rdma_bind_addr handles bind races.  */
134 	SDP_WUNLOCK(ssk);
135 	if (ssk->id == NULL)
136 		ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC);
137 	if (ssk->id == NULL) {
138 		SDP_WLOCK(ssk);
139 		return (ENOMEM);
140 	}
141 	if (nam == NULL) {
142 		null.sin_family = AF_INET;
143 		null.sin_len = sizeof(null);
144 		null.sin_addr.s_addr = INADDR_ANY;
145 		null.sin_port = 0;
146 		bzero(&null.sin_zero, sizeof(null.sin_zero));
147 		nam = (struct sockaddr *)&null;
148 	}
149 	error = -rdma_bind_addr(ssk->id, nam);
150 	SDP_WLOCK(ssk);
151 	if (error == 0) {
152 		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
153 		ssk->laddr = sin->sin_addr.s_addr;
154 		ssk->lport = sin->sin_port;
155 	} else
156 		sdp_destroy_cma(ssk);
157 	return (error);
158 }
159 
160 static void
161 sdp_pcbfree(struct sdp_sock *ssk)
162 {
163 
164 	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
165 	KASSERT((ssk->flags & SDP_DESTROY) == 0,
166 	    ("ssk %p already destroyed", ssk));
167 
168 	sdp_dbg(ssk->socket, "Freeing pcb");
169 	SDP_WLOCK_ASSERT(ssk);
170 	ssk->flags |= SDP_DESTROY;
171 	SDP_WUNLOCK(ssk);
172 	SDP_LIST_WLOCK();
173 	sdp_count--;
174 	LIST_REMOVE(ssk, list);
175 	SDP_LIST_WUNLOCK();
176 	crfree(ssk->cred);
177 	ssk->qp_active = 0;
178 	if (ssk->qp) {
179 		ib_destroy_qp(ssk->qp);
180 		ssk->qp = NULL;
181 	}
182 	sdp_tx_ring_destroy(ssk);
183 	sdp_rx_ring_destroy(ssk);
184 	sdp_destroy_cma(ssk);
185 	rw_destroy(&ssk->rx_ring.destroyed_lock);
186 	rw_destroy(&ssk->lock);
187 	uma_zfree(sdp_zone, ssk);
188 }
189 
190 /*
191  * Common routines to return a socket address.
192  */
193 static struct sockaddr *
194 sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
195 {
196 	struct sockaddr_in *sin;
197 
198 	sin = malloc(sizeof *sin, M_SONAME,
199 		M_WAITOK | M_ZERO);
200 	sin->sin_family = AF_INET;
201 	sin->sin_len = sizeof(*sin);
202 	sin->sin_addr = *addr_p;
203 	sin->sin_port = port;
204 
205 	return (struct sockaddr *)sin;
206 }
207 
208 static int
209 sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
210 {
211 	struct sdp_sock *ssk;
212 	struct in_addr addr;
213 	in_port_t port;
214 
215 	ssk = sdp_sk(so);
216 	SDP_RLOCK(ssk);
217 	port = ssk->lport;
218 	addr.s_addr = ssk->laddr;
219 	SDP_RUNLOCK(ssk);
220 
221 	*nam = sdp_sockaddr(port, &addr);
222 	return 0;
223 }
224 
225 static int
226 sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
227 {
228 	struct sdp_sock *ssk;
229 	struct in_addr addr;
230 	in_port_t port;
231 
232 	ssk = sdp_sk(so);
233 	SDP_RLOCK(ssk);
234 	port = ssk->fport;
235 	addr.s_addr = ssk->faddr;
236 	SDP_RUNLOCK(ssk);
237 
238 	*nam = sdp_sockaddr(port, &addr);
239 	return 0;
240 }
241 
242 static void
243 sdp_pcbnotifyall(struct in_addr faddr, int errno,
244     struct sdp_sock *(*notify)(struct sdp_sock *, int))
245 {
246 	struct sdp_sock *ssk, *ssk_temp;
247 
248 	SDP_LIST_WLOCK();
249 	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
250 		SDP_WLOCK(ssk);
251 		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
252 			SDP_WUNLOCK(ssk);
253 			continue;
254 		}
255 		if ((ssk->flags & SDP_DESTROY) == 0)
256 			if ((*notify)(ssk, errno))
257 				SDP_WUNLOCK(ssk);
258 	}
259 	SDP_LIST_WUNLOCK();
260 }
261 
#if 0
/*
 * Call func(ssk, arg) on every pcb in the global list.  The list is held
 * read-locked for the whole walk and each pcb is write-locked around its
 * callback.  Currently compiled out.
 */
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *cur;

	SDP_LIST_RLOCK();
	LIST_FOREACH(cur, &sdp_list, list) {
		SDP_WLOCK(cur);
		(*func)(cur, arg);
		SDP_WUNLOCK(cur);
	}
	SDP_LIST_RUNLOCK();
}
#endif
277 
278 static void
279 sdp_output_reset(struct sdp_sock *ssk)
280 {
281 	struct rdma_cm_id *id;
282 
283 	SDP_WLOCK_ASSERT(ssk);
284 	if (ssk->id) {
285 		id = ssk->id;
286 		ssk->qp_active = 0;
287 		SDP_WUNLOCK(ssk);
288 		rdma_disconnect(id);
289 		SDP_WLOCK(ssk);
290 	}
291 	ssk->state = TCPS_CLOSED;
292 }
293 
294 /*
295  * Attempt to close a SDP socket, marking it as dropped, and freeing
296  * the socket if we hold the only reference.
297  */
298 static struct sdp_sock *
299 sdp_closed(struct sdp_sock *ssk)
300 {
301 	struct socket *so;
302 
303 	SDP_WLOCK_ASSERT(ssk);
304 
305 	ssk->flags |= SDP_DROPPED;
306 	so = ssk->socket;
307 	soisdisconnected(so);
308 	if (ssk->flags & SDP_SOCKREF) {
309 		KASSERT(so->so_state & SS_PROTOREF,
310 		    ("sdp_closed: !SS_PROTOREF"));
311 		ssk->flags &= ~SDP_SOCKREF;
312 		SDP_WUNLOCK(ssk);
313 		SOCK_LOCK(so);
314 		so->so_state &= ~SS_PROTOREF;
315 		sofree(so);
316 		return (NULL);
317 	}
318 	return (ssk);
319 }
320 
321 /*
322  * Perform timer based shutdowns which can not operate in
323  * callout context.
324  */
325 static void
326 sdp_shutdown_task(void *data, int pending)
327 {
328 	struct sdp_sock *ssk;
329 
330 	ssk = data;
331 	SDP_WLOCK(ssk);
332 	/*
333 	 * I don't think this can race with another call to pcbfree()
334 	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
335 	 */
336 	if (ssk->flags & SDP_DESTROY)
337 		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
338 		    ssk);
339 	if (ssk->flags & SDP_DISCON)
340 		sdp_output_reset(ssk);
341 	/* We have to clear this so sdp_detach() will call pcbfree(). */
342 	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
343 	if ((ssk->flags & SDP_DROPPED) == 0 &&
344 	    sdp_closed(ssk) == NULL)
345 		return;
346 	if (ssk->socket == NULL) {
347 		sdp_pcbfree(ssk);
348 		return;
349 	}
350 	SDP_WUNLOCK(ssk);
351 }
352 
353 /*
354  * 2msl has expired, schedule the shutdown task.
355  */
356 static void
357 sdp_2msl_timeout(void *data)
358 {
359 	struct sdp_sock *ssk;
360 
361 	ssk = data;
362 	/* Callout canceled. */
363         if (!callout_active(&ssk->keep2msl))
364 		goto out;
365         callout_deactivate(&ssk->keep2msl);
366 	/* Should be impossible, defensive programming. */
367 	if ((ssk->flags & SDP_TIMEWAIT) == 0)
368 		goto out;
369 	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
370 out:
371 	SDP_WUNLOCK(ssk);
372 	return;
373 }
374 
375 /*
376  * Schedule the 2msl wait timer.
377  */
378 static void
379 sdp_2msl_wait(struct sdp_sock *ssk)
380 {
381 
382 	SDP_WLOCK_ASSERT(ssk);
383 	ssk->flags |= SDP_TIMEWAIT;
384 	ssk->state = TCPS_TIME_WAIT;
385 	soisdisconnected(ssk->socket);
386 	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
387 }
388 
389 /*
390  * Timed out waiting for the final fin/ack from rdma_disconnect().
391  */
392 static void
393 sdp_dreq_timeout(void *data)
394 {
395 	struct sdp_sock *ssk;
396 
397 	ssk = data;
398 	/* Callout canceled. */
399         if (!callout_active(&ssk->keep2msl))
400 		goto out;
401 	/* Callout rescheduled, probably as a different timer. */
402 	if (callout_pending(&ssk->keep2msl))
403 		goto out;
404         callout_deactivate(&ssk->keep2msl);
405 	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
406 		goto out;
407 	if ((ssk->flags & SDP_DREQWAIT) == 0)
408 		goto out;
409 	ssk->flags &= ~SDP_DREQWAIT;
410 	ssk->flags |= SDP_DISCON;
411 	sdp_2msl_wait(ssk);
412 	ssk->qp_active = 0;
413 out:
414 	SDP_WUNLOCK(ssk);
415 }
416 
417 /*
418  * Received the final fin/ack.  Cancel the 2msl.
419  */
420 void
421 sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
422 {
423 	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
424 	ssk->flags &= ~SDP_DREQWAIT;
425 	sdp_2msl_wait(ssk);
426 }
427 
428 static int
429 sdp_init_sock(struct socket *sk)
430 {
431 	struct sdp_sock *ssk = sdp_sk(sk);
432 
433 	sdp_dbg(sk, "%s\n", __func__);
434 
435 	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
436 	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
437 #ifdef SDP_ZCOPY
438 	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
439 	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
440 	ssk->tx_ring.rdma_inflight = NULL;
441 #endif
442 	atomic_set(&ssk->mseq_ack, 0);
443 	sdp_rx_ring_init(ssk);
444 	ssk->tx_ring.buffer = NULL;
445 
446 	return 0;
447 }
448 
449 /*
450  * Allocate an sdp_sock for the socket and reserve socket buffer space.
451  */
452 static int
453 sdp_attach(struct socket *so, int proto, struct thread *td)
454 {
455 	struct sdp_sock *ssk;
456 	int error;
457 
458 	ssk = sdp_sk(so);
459 	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
460 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
461 		error = soreserve(so, sdp_sendspace, sdp_recvspace);
462 		if (error)
463 			return (error);
464 	}
465 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
466 	so->so_snd.sb_flags |= SB_AUTOSIZE;
467 	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
468 	if (ssk == NULL)
469 		return (ENOBUFS);
470 	rw_init(&ssk->lock, "sdpsock");
471 	ssk->socket = so;
472 	ssk->cred = crhold(so->so_cred);
473 	so->so_pcb = (caddr_t)ssk;
474 	sdp_init_sock(so);
475 	ssk->flags = 0;
476 	ssk->qp_active = 0;
477 	ssk->state = TCPS_CLOSED;
478 	mbufq_init(&ssk->rxctlq, INT_MAX);
479 	SDP_LIST_WLOCK();
480 	LIST_INSERT_HEAD(&sdp_list, ssk, list);
481 	sdp_count++;
482 	SDP_LIST_WUNLOCK();
483 	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
484 		so->so_linger = TCP_LINGERTIME;
485 
486 	return (0);
487 }
488 
489 /*
490  * Detach SDP from the socket, potentially leaving it around for the
491  * timewait to expire.
492  */
493 static void
494 sdp_detach(struct socket *so)
495 {
496 	struct sdp_sock *ssk;
497 
498 	ssk = sdp_sk(so);
499 	SDP_WLOCK(ssk);
500 	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
501 	ssk->socket->so_pcb = NULL;
502 	ssk->socket = NULL;
503 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
504 		SDP_WUNLOCK(ssk);
505 	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
506 		sdp_pcbfree(ssk);
507 	else
508 		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
509 }
510 
511 /*
512  * Allocate a local address for the socket.
513  */
514 static int
515 sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
516 {
517 	int error = 0;
518 	struct sdp_sock *ssk;
519 	struct sockaddr_in *sin;
520 
521 	sin = (struct sockaddr_in *)nam;
522 	if (nam->sa_len != sizeof (*sin))
523 		return (EINVAL);
524 	if (sin->sin_family != AF_INET)
525 		return (EINVAL);
526 	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
527 		return (EAFNOSUPPORT);
528 
529 	ssk = sdp_sk(so);
530 	SDP_WLOCK(ssk);
531 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
532 		error = EINVAL;
533 		goto out;
534 	}
535 	error = sdp_pcbbind(ssk, nam, td->td_ucred);
536 out:
537 	SDP_WUNLOCK(ssk);
538 
539 	return (error);
540 }
541 
542 /*
543  * Prepare to accept connections.
544  */
545 static int
546 sdp_listen(struct socket *so, int backlog, struct thread *td)
547 {
548 	int error = 0;
549 	struct sdp_sock *ssk;
550 
551 	ssk = sdp_sk(so);
552 	SDP_WLOCK(ssk);
553 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
554 		error = EINVAL;
555 		goto out;
556 	}
557 	if (error == 0 && ssk->lport == 0)
558 		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
559 	SOCK_LOCK(so);
560 	if (error == 0)
561 		error = solisten_proto_check(so);
562 	if (error == 0) {
563 		solisten_proto(so, backlog);
564 		ssk->state = TCPS_LISTEN;
565 	}
566 	SOCK_UNLOCK(so);
567 
568 out:
569 	SDP_WUNLOCK(ssk);
570 	if (error == 0)
571 		error = -rdma_listen(ssk->id, backlog);
572 	return (error);
573 }
574 
575 /*
576  * Initiate a SDP connection to nam.
577  */
578 static int
579 sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
580 {
581 	struct sockaddr_in src;
582 	struct socket *so;
583 	int error;
584 
585 	so = ssk->socket;
586 
587 	SDP_WLOCK_ASSERT(ssk);
588 	if (ssk->lport == 0) {
589 		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
590 		if (error)
591 			return error;
592 	}
593 	src.sin_family = AF_INET;
594 	src.sin_len = sizeof(src);
595 	bzero(&src.sin_zero, sizeof(src.sin_zero));
596 	src.sin_port = ssk->lport;
597 	src.sin_addr.s_addr = ssk->laddr;
598 	soisconnecting(so);
599 	SDP_WUNLOCK(ssk);
600 	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
601 	    SDP_RESOLVE_TIMEOUT);
602 	SDP_WLOCK(ssk);
603 	if (error == 0)
604 		ssk->state = TCPS_SYN_SENT;
605 
606 	return 0;
607 }
608 
609 /*
610  * Initiate SDP connection.
611  */
612 static int
613 sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
614 {
615 	int error = 0;
616 	struct sdp_sock *ssk;
617 	struct sockaddr_in *sin;
618 
619 	sin = (struct sockaddr_in *)nam;
620 	if (nam->sa_len != sizeof (*sin))
621 		return (EINVAL);
622 	if (sin->sin_family != AF_INET)
623 		return (EINVAL);
624 	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
625 		return (EAFNOSUPPORT);
626 	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
627 		return (error);
628 	ssk = sdp_sk(so);
629 	SDP_WLOCK(ssk);
630 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
631 		error = EINVAL;
632 	else
633 		error = sdp_start_connect(ssk, nam, td);
634 	SDP_WUNLOCK(ssk);
635 	return (error);
636 }
637 
638 /*
639  * Drop a SDP socket, reporting
640  * the specified error.  If connection is synchronized,
641  * then send a RST to peer.
642  */
643 static struct sdp_sock *
644 sdp_drop(struct sdp_sock *ssk, int errno)
645 {
646 	struct socket *so;
647 
648 	SDP_WLOCK_ASSERT(ssk);
649 	so = ssk->socket;
650 	if (TCPS_HAVERCVDSYN(ssk->state))
651 		sdp_output_reset(ssk);
652 	if (errno == ETIMEDOUT && ssk->softerror)
653 		errno = ssk->softerror;
654 	so->so_error = errno;
655 	return (sdp_closed(ssk));
656 }
657 
658 /*
659  * User issued close, and wish to trail through shutdown states:
660  * if never received SYN, just forget it.  If got a SYN from peer,
661  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
662  * If already got a FIN from peer, then almost done; go to LAST_ACK
663  * state.  In all other cases, have already sent FIN to peer (e.g.
664  * after PRU_SHUTDOWN), and just have to play tedious game waiting
665  * for peer to send FIN or not respond to keep-alives, etc.
666  * We can let the user exit from the close as soon as the FIN is acked.
667  */
668 static void
669 sdp_usrclosed(struct sdp_sock *ssk)
670 {
671 
672 	SDP_WLOCK_ASSERT(ssk);
673 
674 	switch (ssk->state) {
675 	case TCPS_LISTEN:
676 		ssk->state = TCPS_CLOSED;
677 		SDP_WUNLOCK(ssk);
678 		sdp_destroy_cma(ssk);
679 		SDP_WLOCK(ssk);
680 		/* FALLTHROUGH */
681 	case TCPS_CLOSED:
682 		ssk = sdp_closed(ssk);
683 		/*
684 		 * sdp_closed() should never return NULL here as the socket is
685 		 * still open.
686 		 */
687 		KASSERT(ssk != NULL,
688 		    ("sdp_usrclosed: sdp_closed() returned NULL"));
689 		break;
690 
691 	case TCPS_SYN_SENT:
692 		/* FALLTHROUGH */
693 	case TCPS_SYN_RECEIVED:
694 		ssk->flags |= SDP_NEEDFIN;
695 		break;
696 
697 	case TCPS_ESTABLISHED:
698 		ssk->flags |= SDP_NEEDFIN;
699 		ssk->state = TCPS_FIN_WAIT_1;
700 		break;
701 
702 	case TCPS_CLOSE_WAIT:
703 		ssk->state = TCPS_LAST_ACK;
704 		break;
705 	}
706 	if (ssk->state >= TCPS_FIN_WAIT_2) {
707 		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
708 		if (ssk->state == TCPS_FIN_WAIT_2)
709 			sdp_2msl_wait(ssk);
710 		else
711 			soisdisconnected(ssk->socket);
712 	}
713 }
714 
715 static void
716 sdp_output_disconnect(struct sdp_sock *ssk)
717 {
718 
719 	SDP_WLOCK_ASSERT(ssk);
720 	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
721 	    sdp_dreq_timeout, ssk);
722 	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
723 	sdp_post_sends(ssk, M_NOWAIT);
724 }
725 
726 /*
727  * Initiate or continue a disconnect.
728  * If embryonic state, just send reset (once).
729  * If in ``let data drain'' option and linger null, just drop.
730  * Otherwise (hard), mark socket disconnecting and drop
731  * current input data; switch states based on user close, and
732  * send segment to peer (with FIN).
733  */
734 static void
735 sdp_start_disconnect(struct sdp_sock *ssk)
736 {
737 	struct socket *so;
738 	int unread;
739 
740 	so = ssk->socket;
741 	SDP_WLOCK_ASSERT(ssk);
742 	sdp_stop_keepalive_timer(so);
743 	/*
744 	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
745 	 * socket is still open.
746 	 */
747 	if (ssk->state < TCPS_ESTABLISHED) {
748 		ssk = sdp_closed(ssk);
749 		KASSERT(ssk != NULL,
750 		    ("sdp_start_disconnect: sdp_close() returned NULL"));
751 	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
752 		ssk = sdp_drop(ssk, 0);
753 		KASSERT(ssk != NULL,
754 		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
755 	} else {
756 		soisdisconnecting(so);
757 		unread = sbused(&so->so_rcv);
758 		sbflush(&so->so_rcv);
759 		sdp_usrclosed(ssk);
760 		if (!(ssk->flags & SDP_DROPPED)) {
761 			if (unread)
762 				sdp_output_reset(ssk);
763 			else
764 				sdp_output_disconnect(ssk);
765 		}
766 	}
767 }
768 
769 /*
770  * User initiated disconnect.
771  */
772 static int
773 sdp_disconnect(struct socket *so)
774 {
775 	struct sdp_sock *ssk;
776 	int error = 0;
777 
778 	ssk = sdp_sk(so);
779 	SDP_WLOCK(ssk);
780 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
781 		error = ECONNRESET;
782 		goto out;
783 	}
784 	sdp_start_disconnect(ssk);
785 out:
786 	SDP_WUNLOCK(ssk);
787 	return (error);
788 }
789 
790 /*
791  * Accept a connection.  Essentially all the work is done at higher levels;
792  * just return the address of the peer, storing through addr.
793  *
794  *
795  * XXX This is broken XXX
796  *
797  * The rationale for acquiring the sdp lock here is somewhat complicated,
798  * and is described in detail in the commit log entry for r175612.  Acquiring
799  * it delays an accept(2) racing with sonewconn(), which inserts the socket
800  * before the address/port fields are initialized.  A better fix would
801  * prevent the socket from being placed in the listen queue until all fields
802  * are fully initialized.
803  */
804 static int
805 sdp_accept(struct socket *so, struct sockaddr **nam)
806 {
807 	struct sdp_sock *ssk = NULL;
808 	struct in_addr addr;
809 	in_port_t port;
810 	int error;
811 
812 	if (so->so_state & SS_ISDISCONNECTED)
813 		return (ECONNABORTED);
814 
815 	port = 0;
816 	addr.s_addr = 0;
817 	error = 0;
818 	ssk = sdp_sk(so);
819 	SDP_WLOCK(ssk);
820 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
821 		error = ECONNABORTED;
822 		goto out;
823 	}
824 	port = ssk->fport;
825 	addr.s_addr = ssk->faddr;
826 out:
827 	SDP_WUNLOCK(ssk);
828 	if (error == 0)
829 		*nam = sdp_sockaddr(port, &addr);
830 	return error;
831 }
832 
833 /*
834  * Mark the connection as being incapable of further output.
835  */
836 static int
837 sdp_shutdown(struct socket *so)
838 {
839 	int error = 0;
840 	struct sdp_sock *ssk;
841 
842 	ssk = sdp_sk(so);
843 	SDP_WLOCK(ssk);
844 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
845 		error = ECONNRESET;
846 		goto out;
847 	}
848 	socantsendmore(so);
849 	sdp_usrclosed(ssk);
850 	if (!(ssk->flags & SDP_DROPPED))
851 		sdp_output_disconnect(ssk);
852 
853 out:
854 	SDP_WUNLOCK(ssk);
855 
856 	return (error);
857 }
858 
859 static void
860 sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
861 {
862 	struct mbuf *n;
863 	int ncnt;
864 
865 	SOCKBUF_LOCK_ASSERT(sb);
866 	SBLASTRECORDCHK(sb);
867 	KASSERT(mb->m_flags & M_PKTHDR,
868 		("sdp_append: %p Missing packet header.\n", mb));
869 	n = sb->sb_lastrecord;
870 	/*
871 	 * If the queue is empty just set all pointers and proceed.
872 	 */
873 	if (n == NULL) {
874 		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
875 		for (; mb; mb = mb->m_next) {
876 	                sb->sb_mbtail = mb;
877 			sballoc(sb, mb);
878 		}
879 		return;
880 	}
881 	/*
882 	 * Count the number of mbufs in the current tail.
883 	 */
884 	for (ncnt = 0; n->m_next; n = n->m_next)
885 		ncnt++;
886 	n = sb->sb_lastrecord;
887 	/*
888 	 * If the two chains can fit in a single sdp packet and
889 	 * the last record has not been sent yet (WRITABLE) coalesce
890 	 * them.  The lastrecord remains the same but we must strip the
891 	 * packet header and then let sbcompress do the hard part.
892 	 */
893 	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
894 	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
895 	    ssk->xmit_size_goal) {
896 		m_adj(mb, SDP_HEAD_SIZE);
897 		n->m_pkthdr.len += mb->m_pkthdr.len;
898 		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
899 		m_demote(mb, 1, 0);
900 		sbcompress(sb, mb, sb->sb_mbtail);
901 		return;
902 	}
903 	/*
904 	 * Not compressible, just append to the end and adjust counters.
905 	 */
906 	sb->sb_lastrecord->m_flags |= M_PUSH;
907 	sb->sb_lastrecord->m_nextpkt = mb;
908 	sb->sb_lastrecord = mb;
909 	if (sb->sb_sndptr == NULL)
910 		sb->sb_sndptr = mb;
911 	for (; mb; mb = mb->m_next) {
912 		sb->sb_mbtail = mb;
913 		sballoc(sb, mb);
914 	}
915 }
916 
917 /*
918  * Do a send by putting data in output queue and updating urgent
919  * marker if URG set.  Possibly send more data.  Unlike the other
920  * pru_*() routines, the mbuf chains are our responsibility.  We
921  * must either enqueue them or free them.  The other pru_* routines
922  * generally are caller-frees.
923  *
924  * This comes from sendfile, normal sends will come from sdp_sosend().
925  */
926 static int
927 sdp_send(struct socket *so, int flags, struct mbuf *m,
928     struct sockaddr *nam, struct mbuf *control, struct thread *td)
929 {
930 	struct sdp_sock *ssk;
931 	struct mbuf *n;
932 	int error;
933 	int cnt;
934 
935 	error = 0;
936 	ssk = sdp_sk(so);
937 	KASSERT(m->m_flags & M_PKTHDR,
938 	    ("sdp_send: %p no packet header", m));
939 	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
940 	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
941 	for (n = m, cnt = 0; n->m_next; n = n->m_next)
942 		cnt++;
943 	if (cnt > SDP_MAX_SEND_SGES) {
944 		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
945 		if (n == NULL) {
946 			m_freem(m);
947 			return (EMSGSIZE);
948 		}
949 		m = n;
950 		for (cnt = 0; n->m_next; n = n->m_next)
951 			cnt++;
952 	}
953 	SDP_WLOCK(ssk);
954 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
955 		if (control)
956 			m_freem(control);
957 		if (m)
958 			m_freem(m);
959 		error = ECONNRESET;
960 		goto out;
961 	}
962 	if (control) {
963 		/* SDP doesn't support control messages. */
964 		if (control->m_len) {
965 			m_freem(control);
966 			if (m)
967 				m_freem(m);
968 			error = EINVAL;
969 			goto out;
970 		}
971 		m_freem(control);	/* empty control, just free it */
972 	}
973 	if (!(flags & PRUS_OOB)) {
974 		SOCKBUF_LOCK(&so->so_snd);
975 		sdp_append(ssk, &so->so_snd, m, cnt);
976 		SOCKBUF_UNLOCK(&so->so_snd);
977 		if (nam && ssk->state < TCPS_SYN_SENT) {
978 			/*
979 			 * Do implied connect if not yet connected.
980 			 */
981 			error = sdp_start_connect(ssk, nam, td);
982 			if (error)
983 				goto out;
984 		}
985 		if (flags & PRUS_EOF) {
986 			/*
987 			 * Close the send side of the connection after
988 			 * the data is sent.
989 			 */
990 			socantsendmore(so);
991 			sdp_usrclosed(ssk);
992 			if (!(ssk->flags & SDP_DROPPED))
993 				sdp_output_disconnect(ssk);
994 		} else if (!(ssk->flags & SDP_DROPPED) &&
995 		    !(flags & PRUS_MORETOCOME))
996 			sdp_post_sends(ssk, M_NOWAIT);
997 		SDP_WUNLOCK(ssk);
998 		return (0);
999 	} else {
1000 		SOCKBUF_LOCK(&so->so_snd);
1001 		if (sbspace(&so->so_snd) < -512) {
1002 			SOCKBUF_UNLOCK(&so->so_snd);
1003 			m_freem(m);
1004 			error = ENOBUFS;
1005 			goto out;
1006 		}
1007 		/*
1008 		 * According to RFC961 (Assigned Protocols),
1009 		 * the urgent pointer points to the last octet
1010 		 * of urgent data.  We continue, however,
1011 		 * to consider it to indicate the first octet
1012 		 * of data past the urgent section.
1013 		 * Otherwise, snd_up should be one lower.
1014 		 */
1015 		m->m_flags |= M_URG | M_PUSH;
1016 		sdp_append(ssk, &so->so_snd, m, cnt);
1017 		SOCKBUF_UNLOCK(&so->so_snd);
1018 		if (nam && ssk->state < TCPS_SYN_SENT) {
1019 			/*
1020 			 * Do implied connect if not yet connected.
1021 			 */
1022 			error = sdp_start_connect(ssk, nam, td);
1023 			if (error)
1024 				goto out;
1025 		}
1026 		sdp_post_sends(ssk, M_NOWAIT);
1027 		SDP_WUNLOCK(ssk);
1028 		return (0);
1029 	}
1030 out:
1031 	SDP_WUNLOCK(ssk);
1032 	return (error);
1033 }
1034 
/* sblock() flags for a send: wait for the lock unless MSG_DONTWAIT is set. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) != 0 ? 0 : SBL_WAIT)
1036 
1037 /*
1038  * Send on a socket.  If send must go all at once and message is larger than
1039  * send buffering, then hard error.  Lock against other senders.  If must go
1040  * all at once and not enough room now, then inform user that this would
1041  * block and do nothing.  Otherwise, if nonblocking, send as much as
1042  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1043  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1044  * in mbuf chain must be small enough to send all at once.
1045  *
1046  * Returns nonzero on error, timeout or signal; callers must check for short
1047  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1048  * on return.
1049  */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	/* Total number of bytes to send, from whichever source was given. */
	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* A caller-supplied mbuf chain has to go out in a single piece. */
	atomic = top != NULL;
	/*
	 * Control data is not supported: a non-empty control mbuf is an
	 * error, an empty one is simply discarded.
	 */
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	/* Per-thread resource usage accounting. */
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	/* Serialize senders; released again at the "release" label. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		/*
		 * The socket state is re-examined under the send buffer
		 * lock before every burst of sends.
		 */
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		/* A pending socket error is reported once and then cleared. */
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		/* Out-of-band sends are granted 1024 bytes of extra room. */
		if (flags & MSG_OOB)
			space += 1024;
		/* An atomic send must fit into one SDP payload. */
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		/*
		 * Not enough room: fail for non-blocking callers, otherwise
		 * sleep in sbwait() and re-run all checks from the top.
		 */
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				/* The whole chain is handed over at once. */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				/* Charge the bytes just copied in. */
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			/*
			 * Forget the chain so the exit path below does not
			 * free it; presumably sdp_send() takes ownership of
			 * it even on failure -- confirm against sdp_send().
			 */
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	/* Free a chain that was never handed to sdp_send(). */
	if (top != NULL)
		m_freem(top);
	return (error);
}
1189 
1190 /*
1191  * The part of soreceive() that implements reading non-inline out-of-band
1192  * data from a socket.  For more complete comments, see soreceive(), from
1193  * which this code originated.
1194  *
1195  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1196  * unable to return an mbuf chain to the caller.
1197  */
1198 static int
1199 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1200 {
1201 	struct protosw *pr = so->so_proto;
1202 	struct mbuf *m;
1203 	int error;
1204 
1205 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1206 
1207 	m = m_get(M_WAITOK, MT_DATA);
1208 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1209 	if (error)
1210 		goto bad;
1211 	do {
1212 		error = uiomove(mtod(m, void *),
1213 		    (int) min(uio->uio_resid, m->m_len), uio);
1214 		m = m_free(m);
1215 	} while (uio->uio_resid && error == 0 && m);
1216 bad:
1217 	if (m != NULL)
1218 		m_freem(m);
1219 	return (error);
1220 }
1221 
1222 /*
1223  * Optimized version of soreceive() for stream (TCP) sockets.
1224  */
1225 static int
1226 sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
1227     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1228 {
1229 	int len = 0, error = 0, flags, oresid;
1230 	struct sockbuf *sb;
1231 	struct mbuf *m, *n = NULL;
1232 	struct sdp_sock *ssk;
1233 
1234 	/* We only do stream sockets. */
1235 	if (so->so_type != SOCK_STREAM)
1236 		return (EINVAL);
1237 	if (psa != NULL)
1238 		*psa = NULL;
1239 	if (controlp != NULL)
1240 		return (EINVAL);
1241 	if (flagsp != NULL)
1242 		flags = *flagsp &~ MSG_EOR;
1243 	else
1244 		flags = 0;
1245 	if (flags & MSG_OOB)
1246 		return (soreceive_rcvoob(so, uio, flags));
1247 	if (mp0 != NULL)
1248 		*mp0 = NULL;
1249 
1250 	sb = &so->so_rcv;
1251 	ssk = sdp_sk(so);
1252 
1253 	/* Prevent other readers from entering the socket. */
1254 	error = sblock(sb, SBLOCKWAIT(flags));
1255 	if (error)
1256 		goto out;
1257 	SOCKBUF_LOCK(sb);
1258 
1259 	/* Easy one, no space to copyout anything. */
1260 	if (uio->uio_resid == 0) {
1261 		error = EINVAL;
1262 		goto out;
1263 	}
1264 	oresid = uio->uio_resid;
1265 
1266 	/* We will never ever get anything unless we are connected. */
1267 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1268 		/* When disconnecting there may be still some data left. */
1269 		if (sbavail(sb))
1270 			goto deliver;
1271 		if (!(so->so_state & SS_ISDISCONNECTED))
1272 			error = ENOTCONN;
1273 		goto out;
1274 	}
1275 
1276 	/* Socket buffer is empty and we shall not block. */
1277 	if (sbavail(sb) == 0 &&
1278 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1279 		error = EAGAIN;
1280 		goto out;
1281 	}
1282 
1283 restart:
1284 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1285 
1286 	/* Abort if socket has reported problems. */
1287 	if (so->so_error) {
1288 		if (sbavail(sb))
1289 			goto deliver;
1290 		if (oresid > uio->uio_resid)
1291 			goto out;
1292 		error = so->so_error;
1293 		if (!(flags & MSG_PEEK))
1294 			so->so_error = 0;
1295 		goto out;
1296 	}
1297 
1298 	/* Door is closed.  Deliver what is left, if any. */
1299 	if (sb->sb_state & SBS_CANTRCVMORE) {
1300 		if (sbavail(sb))
1301 			goto deliver;
1302 		else
1303 			goto out;
1304 	}
1305 
1306 	/* Socket buffer got some data that we shall deliver now. */
1307 	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
1308 	    ((so->so_state & SS_NBIO) ||
1309 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1310 	     sbavail(sb) >= sb->sb_lowat ||
1311 	     sbavail(sb) >= uio->uio_resid ||
1312 	     sbavail(sb) >= sb->sb_hiwat) ) {
1313 		goto deliver;
1314 	}
1315 
1316 	/* On MSG_WAITALL we must wait until all data or error arrives. */
1317 	if ((flags & MSG_WAITALL) &&
1318 	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
1319 		goto deliver;
1320 
1321 	/*
1322 	 * Wait and block until (more) data comes in.
1323 	 * NB: Drops the sockbuf lock during wait.
1324 	 */
1325 	error = sbwait(sb);
1326 	if (error)
1327 		goto out;
1328 	goto restart;
1329 
1330 deliver:
1331 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1332 	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
1333 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1334 
1335 	/* Statistics. */
1336 	if (uio->uio_td)
1337 		uio->uio_td->td_ru.ru_msgrcv++;
1338 
1339 	/* Fill uio until full or current end of socket buffer is reached. */
1340 	len = min(uio->uio_resid, sbavail(sb));
1341 	if (mp0 != NULL) {
1342 		/* Dequeue as many mbufs as possible. */
1343 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1344 			for (*mp0 = m = sb->sb_mb;
1345 			     m != NULL && m->m_len <= len;
1346 			     m = m->m_next) {
1347 				len -= m->m_len;
1348 				uio->uio_resid -= m->m_len;
1349 				sbfree(sb, m);
1350 				n = m;
1351 			}
1352 			sb->sb_mb = m;
1353 			if (sb->sb_mb == NULL)
1354 				SB_EMPTY_FIXUP(sb);
1355 			n->m_next = NULL;
1356 		}
1357 		/* Copy the remainder. */
1358 		if (len > 0) {
1359 			KASSERT(sb->sb_mb != NULL,
1360 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
1361 
1362 			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
1363 			if (m == NULL)
1364 				len = 0;	/* Don't flush data from sockbuf. */
1365 			else
1366 				uio->uio_resid -= m->m_len;
1367 			if (*mp0 != NULL)
1368 				n->m_next = m;
1369 			else
1370 				*mp0 = m;
1371 			if (*mp0 == NULL) {
1372 				error = ENOBUFS;
1373 				goto out;
1374 			}
1375 		}
1376 	} else {
1377 		/* NB: Must unlock socket buffer as uiomove may sleep. */
1378 		SOCKBUF_UNLOCK(sb);
1379 		error = m_mbuftouio(uio, sb->sb_mb, len);
1380 		SOCKBUF_LOCK(sb);
1381 		if (error)
1382 			goto out;
1383 	}
1384 	SBLASTRECORDCHK(sb);
1385 	SBLASTMBUFCHK(sb);
1386 
1387 	/*
1388 	 * Remove the delivered data from the socket buffer unless we
1389 	 * were only peeking.
1390 	 */
1391 	if (!(flags & MSG_PEEK)) {
1392 		if (len > 0)
1393 			sbdrop_locked(sb, len);
1394 
1395 		/* Notify protocol that we drained some data. */
1396 		SOCKBUF_UNLOCK(sb);
1397 		SDP_WLOCK(ssk);
1398 		sdp_do_posts(ssk);
1399 		SDP_WUNLOCK(ssk);
1400 		SOCKBUF_LOCK(sb);
1401 	}
1402 
1403 	/*
1404 	 * For MSG_WAITALL we may have to loop again and wait for
1405 	 * more data to come in.
1406 	 */
1407 	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
1408 		goto restart;
1409 out:
1410 	SOCKBUF_LOCK_ASSERT(sb);
1411 	SBLASTRECORDCHK(sb);
1412 	SBLASTMBUFCHK(sb);
1413 	SOCKBUF_UNLOCK(sb);
1414 	sbunlock(sb);
1415 	return (error);
1416 }
1417 
1418 /*
1419  * Abort is used to teardown a connection typically while sitting in
1420  * the accept queue.
1421  */
1422 void
1423 sdp_abort(struct socket *so)
1424 {
1425 	struct sdp_sock *ssk;
1426 
1427 	ssk = sdp_sk(so);
1428 	SDP_WLOCK(ssk);
1429 	/*
1430 	 * If we have not yet dropped, do it now.
1431 	 */
1432 	if (!(ssk->flags & SDP_TIMEWAIT) &&
1433 	    !(ssk->flags & SDP_DROPPED))
1434 		sdp_drop(ssk, ECONNABORTED);
1435 	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
1436 	    ssk, ssk->flags));
1437 	SDP_WUNLOCK(ssk);
1438 }
1439 
1440 /*
1441  * Close a SDP socket and initiate a friendly disconnect.
1442  */
1443 static void
1444 sdp_close(struct socket *so)
1445 {
1446 	struct sdp_sock *ssk;
1447 
1448 	ssk = sdp_sk(so);
1449 	SDP_WLOCK(ssk);
1450 	/*
1451 	 * If we have not yet dropped, do it now.
1452 	 */
1453 	if (!(ssk->flags & SDP_TIMEWAIT) &&
1454 	    !(ssk->flags & SDP_DROPPED))
1455 		sdp_start_disconnect(ssk);
1456 
1457 	/*
1458 	 * If we've still not dropped let the socket layer know we're
1459 	 * holding on to the socket and pcb for a while.
1460 	 */
1461 	if (!(ssk->flags & SDP_DROPPED)) {
1462 		SOCK_LOCK(so);
1463 		so->so_state |= SS_PROTOREF;
1464 		SOCK_UNLOCK(so);
1465 		ssk->flags |= SDP_SOCKREF;
1466 	}
1467 	SDP_WUNLOCK(ssk);
1468 }
1469 
1470 /*
1471  * User requests out-of-band data.
1472  */
1473 static int
1474 sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
1475 {
1476 	int error = 0;
1477 	struct sdp_sock *ssk;
1478 
1479 	ssk = sdp_sk(so);
1480 	SDP_WLOCK(ssk);
1481 	if (!rx_ring_trylock(&ssk->rx_ring)) {
1482 		SDP_WUNLOCK(ssk);
1483 		return (ECONNRESET);
1484 	}
1485 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
1486 		error = ECONNRESET;
1487 		goto out;
1488 	}
1489 	if ((so->so_oobmark == 0 &&
1490 	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
1491 	    so->so_options & SO_OOBINLINE ||
1492 	    ssk->oobflags & SDP_HADOOB) {
1493 		error = EINVAL;
1494 		goto out;
1495 	}
1496 	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
1497 		error = EWOULDBLOCK;
1498 		goto out;
1499 	}
1500 	m->m_len = 1;
1501 	*mtod(m, caddr_t) = ssk->iobc;
1502 	if ((flags & MSG_PEEK) == 0)
1503 		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
1504 out:
1505 	rx_ring_unlock(&ssk->rx_ring);
1506 	SDP_WUNLOCK(ssk);
1507 	return (error);
1508 }
1509 
1510 void
1511 sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
1512 {
1513 	struct mbuf *m;
1514 	struct socket *so;
1515 
1516 	so = ssk->socket;
1517 	if (so == NULL)
1518 		return;
1519 
1520 	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
1521 	sohasoutofband(so);
1522 	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
1523 	if (!(so->so_options & SO_OOBINLINE)) {
1524 		for (m = mb; m->m_next != NULL; m = m->m_next);
1525 		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
1526 		ssk->oobflags |= SDP_HAVEOOB;
1527 		m->m_len--;
1528 		mb->m_pkthdr.len--;
1529 	}
1530 }
1531 
1532 /*
1533  * Notify a sdp socket of an asynchronous error.
1534  *
1535  * Do not wake up user since there currently is no mechanism for
1536  * reporting soft errors (yet - a kqueue filter may be added).
1537  */
1538 struct sdp_sock *
1539 sdp_notify(struct sdp_sock *ssk, int error)
1540 {
1541 
1542 	SDP_WLOCK_ASSERT(ssk);
1543 
1544 	if ((ssk->flags & SDP_TIMEWAIT) ||
1545 	    (ssk->flags & SDP_DROPPED))
1546 		return (ssk);
1547 
1548 	/*
1549 	 * Ignore some errors if we are hooked up.
1550 	 */
1551 	if (ssk->state == TCPS_ESTABLISHED &&
1552 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
1553 	     error == EHOSTDOWN))
1554 		return (ssk);
1555 	ssk->softerror = error;
1556 	return sdp_drop(ssk, error);
1557 }
1558 
1559 static void
1560 sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
1561 {
1562 	struct in_addr faddr;
1563 
1564 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
1565 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1566 		return;
1567 
1568 	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
1569 }
1570 
1571 static int
1572 sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
1573     struct thread *td)
1574 {
1575 	return (EOPNOTSUPP);
1576 }
1577 
1578 static void
1579 sdp_keepalive_timeout(void *data)
1580 {
1581 	struct sdp_sock *ssk;
1582 
1583 	ssk = data;
1584 	/* Callout canceled. */
1585         if (!callout_active(&ssk->keep2msl))
1586                 return;
1587 	/* Callout rescheduled as a different kind of timer. */
1588 	if (callout_pending(&ssk->keep2msl))
1589 		goto out;
1590         callout_deactivate(&ssk->keep2msl);
1591 	if (ssk->flags & SDP_DROPPED ||
1592 	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
1593 		goto out;
1594 	sdp_post_keepalive(ssk);
1595 	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
1596 	    sdp_keepalive_timeout, ssk);
1597 out:
1598 	SDP_WUNLOCK(ssk);
1599 }
1600 
1601 
1602 void
1603 sdp_start_keepalive_timer(struct socket *so)
1604 {
1605 	struct sdp_sock *ssk;
1606 
1607 	ssk = sdp_sk(so);
1608 	if (!callout_pending(&ssk->keep2msl))
1609                 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
1610                     sdp_keepalive_timeout, ssk);
1611 }
1612 
1613 static void
1614 sdp_stop_keepalive_timer(struct socket *so)
1615 {
1616 	struct sdp_sock *ssk;
1617 
1618 	ssk = sdp_sk(so);
1619 	callout_stop(&ssk->keep2msl);
1620 }
1621 
1622 /*
1623  * sdp_ctloutput() must drop the inpcb lock before performing copyin on
1624  * socket option arguments.  When it re-acquires the lock after the copy, it
1625  * has to revalidate that the connection is still valid for the socket
1626  * option.
1627  */
1628 #define SDP_WLOCK_RECHECK(inp) do {					\
1629 	SDP_WLOCK(ssk);							\
1630 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
1631 		SDP_WUNLOCK(ssk);					\
1632 		return (ECONNRESET);					\
1633 	}								\
1634 } while(0)
1635 
1636 static int
1637 sdp_ctloutput(struct socket *so, struct sockopt *sopt)
1638 {
1639 	int	error, opt, optval;
1640 	struct sdp_sock *ssk;
1641 
1642 	error = 0;
1643 	ssk = sdp_sk(so);
1644 	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
1645 		SDP_WLOCK(ssk);
1646 		if (so->so_options & SO_KEEPALIVE)
1647 			sdp_start_keepalive_timer(so);
1648 		else
1649 			sdp_stop_keepalive_timer(so);
1650 		SDP_WUNLOCK(ssk);
1651 	}
1652 	if (sopt->sopt_level != IPPROTO_TCP)
1653 		return (error);
1654 
1655 	SDP_WLOCK(ssk);
1656 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
1657 		SDP_WUNLOCK(ssk);
1658 		return (ECONNRESET);
1659 	}
1660 
1661 	switch (sopt->sopt_dir) {
1662 	case SOPT_SET:
1663 		switch (sopt->sopt_name) {
1664 		case TCP_NODELAY:
1665 			SDP_WUNLOCK(ssk);
1666 			error = sooptcopyin(sopt, &optval, sizeof optval,
1667 			    sizeof optval);
1668 			if (error)
1669 				return (error);
1670 
1671 			SDP_WLOCK_RECHECK(ssk);
1672 			opt = SDP_NODELAY;
1673 			if (optval)
1674 				ssk->flags |= opt;
1675 			else
1676 				ssk->flags &= ~opt;
1677 			sdp_do_posts(ssk);
1678 			SDP_WUNLOCK(ssk);
1679 			break;
1680 
1681 		default:
1682 			SDP_WUNLOCK(ssk);
1683 			error = ENOPROTOOPT;
1684 			break;
1685 		}
1686 		break;
1687 
1688 	case SOPT_GET:
1689 		switch (sopt->sopt_name) {
1690 		case TCP_NODELAY:
1691 			optval = ssk->flags & SDP_NODELAY;
1692 			SDP_WUNLOCK(ssk);
1693 			error = sooptcopyout(sopt, &optval, sizeof optval);
1694 			break;
1695 		default:
1696 			SDP_WUNLOCK(ssk);
1697 			error = ENOPROTOOPT;
1698 			break;
1699 		}
1700 		break;
1701 	}
1702 	return (error);
1703 }
1704 #undef SDP_WLOCK_RECHECK
1705 
1706 int sdp_mod_count = 0;
1707 int sdp_mod_usec = 0;
1708 
1709 void
1710 sdp_set_default_moderation(struct sdp_sock *ssk)
1711 {
1712 	struct ib_cq_attr attr;
1713 	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
1714 		return;
1715 	memset(&attr, 0, sizeof(attr));
1716 	attr.moderation.cq_count = sdp_mod_count;
1717 	attr.moderation.cq_period = sdp_mod_usec;
1718 
1719 	ib_modify_cq(ssk->rx_ring.cq, &attr, IB_CQ_MODERATION);
1720 }
1721 
1722 static void
1723 sdp_dev_add(struct ib_device *device)
1724 {
1725 	struct ib_fmr_pool_param param;
1726 	struct sdp_device *sdp_dev;
1727 
1728 	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
1729 	sdp_dev->pd = ib_alloc_pd(device);
1730 	if (IS_ERR(sdp_dev->pd))
1731 		goto out_pd;
1732         sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
1733         if (IS_ERR(sdp_dev->mr))
1734 		goto out_mr;
1735 	memset(&param, 0, sizeof param);
1736 	param.max_pages_per_fmr = SDP_FMR_SIZE;
1737 	param.page_shift = PAGE_SHIFT;
1738 	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
1739 	param.pool_size = SDP_FMR_POOL_SIZE;
1740 	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
1741 	param.cache = 1;
1742 	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
1743 	if (IS_ERR(sdp_dev->fmr_pool))
1744 		goto out_fmr;
1745 	ib_set_client_data(device, &sdp_client, sdp_dev);
1746 	return;
1747 
1748 out_fmr:
1749 	ib_dereg_mr(sdp_dev->mr);
1750 out_mr:
1751 	ib_dealloc_pd(sdp_dev->pd);
1752 out_pd:
1753 	free(sdp_dev, M_SDP);
1754 }
1755 
1756 static void
1757 sdp_dev_rem(struct ib_device *device)
1758 {
1759 	struct sdp_device *sdp_dev;
1760 	struct sdp_sock *ssk;
1761 
1762 	SDP_LIST_WLOCK();
1763 	LIST_FOREACH(ssk, &sdp_list, list) {
1764 		if (ssk->ib_device != device)
1765 			continue;
1766 		SDP_WLOCK(ssk);
1767 		if ((ssk->flags & SDP_DESTROY) == 0)
1768 			ssk = sdp_notify(ssk, ECONNRESET);
1769 		if (ssk)
1770 			SDP_WUNLOCK(ssk);
1771 	}
1772 	SDP_LIST_WUNLOCK();
1773 	/*
1774 	 * XXX Do I need to wait between these two?
1775 	 */
1776 	sdp_dev = ib_get_client_data(device, &sdp_client);
1777 	if (!sdp_dev)
1778 		return;
1779 	ib_flush_fmr_pool(sdp_dev->fmr_pool);
1780 	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
1781 	ib_dereg_mr(sdp_dev->mr);
1782 	ib_dealloc_pd(sdp_dev->pd);
1783 	free(sdp_dev, M_SDP);
1784 }
1785 
1786 struct ib_client sdp_client =
1787     { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };
1788 
1789 
1790 static int
1791 sdp_pcblist(SYSCTL_HANDLER_ARGS)
1792 {
1793 	int error, n, i;
1794 	struct sdp_sock *ssk;
1795 	struct xinpgen xig;
1796 
1797 	/*
1798 	 * The process of preparing the TCB list is too time-consuming and
1799 	 * resource-intensive to repeat twice on every request.
1800 	 */
1801 	if (req->oldptr == NULL) {
1802 		n = sdp_count;
1803 		n += imax(n / 8, 10);
1804 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
1805 		return (0);
1806 	}
1807 
1808 	if (req->newptr != NULL)
1809 		return (EPERM);
1810 
1811 	/*
1812 	 * OK, now we're committed to doing something.
1813 	 */
1814 	SDP_LIST_RLOCK();
1815 	n = sdp_count;
1816 	SDP_LIST_RUNLOCK();
1817 
1818 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
1819 		+ n * sizeof(struct xtcpcb));
1820 	if (error != 0)
1821 		return (error);
1822 
1823 	xig.xig_len = sizeof xig;
1824 	xig.xig_count = n;
1825 	xig.xig_gen = 0;
1826 	xig.xig_sogen = so_gencnt;
1827 	error = SYSCTL_OUT(req, &xig, sizeof xig);
1828 	if (error)
1829 		return (error);
1830 
1831 	SDP_LIST_RLOCK();
1832 	for (ssk = LIST_FIRST(&sdp_list), i = 0;
1833 	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
1834 		struct xtcpcb xt;
1835 
1836 		SDP_RLOCK(ssk);
1837 		if (ssk->flags & SDP_TIMEWAIT) {
1838 			if (ssk->cred != NULL)
1839 				error = cr_cansee(req->td->td_ucred,
1840 				    ssk->cred);
1841 			else
1842 				error = EINVAL;	/* Skip this inp. */
1843 		} else if (ssk->socket)
1844 			error = cr_canseesocket(req->td->td_ucred,
1845 			    ssk->socket);
1846 		else
1847 			error = EINVAL;
1848 		if (error) {
1849 			error = 0;
1850 			goto next;
1851 		}
1852 
1853 		bzero(&xt, sizeof(xt));
1854 		xt.xt_len = sizeof xt;
1855 		xt.xt_inp.inp_gencnt = 0;
1856 		xt.xt_inp.inp_vflag = INP_IPV4;
1857 		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
1858 		xt.xt_inp.inp_lport = ssk->lport;
1859 		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
1860 		xt.xt_inp.inp_fport = ssk->fport;
1861 		xt.t_state = ssk->state;
1862 		if (ssk->socket != NULL)
1863 			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
1864 		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
1865 		SDP_RUNLOCK(ssk);
1866 		error = SYSCTL_OUT(req, &xt, sizeof xt);
1867 		if (error)
1868 			break;
1869 		i++;
1870 		continue;
1871 next:
1872 		SDP_RUNLOCK(ssk);
1873 	}
1874 	if (!error) {
1875 		/*
1876 		 * Give the user an updated idea of our state.
1877 		 * If the generation differs from what we told
1878 		 * her before, she knows that something happened
1879 		 * while we were processing this request, and it
1880 		 * might be necessary to retry.
1881 		 */
1882 		xig.xig_gen = 0;
1883 		xig.xig_sogen = so_gencnt;
1884 		xig.xig_count = sdp_count;
1885 		error = SYSCTL_OUT(req, &xig, sizeof xig);
1886 	}
1887 	SDP_LIST_RUNLOCK();
1888 	return (error);
1889 }
1890 
/*
 * Sysctl node "sdp" below _net_inet.  NOTE(review): the -1 node number is
 * presumably OID_AUTO (automatic numbering) -- confirm.
 */
static SYSCTL_NODE(_net_inet, -1,  sdp,    CTLFLAG_RW, 0,  "SDP");

/* Read-only "pcblist" entry below that node, served by sdp_pcblist(). */
SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");
1896 
1897 static void
1898 sdp_zone_change(void *tag)
1899 {
1900 
1901 	uma_zone_set_max(sdp_zone, maxsockets);
1902 }
1903 
1904 static void
1905 sdp_init(void)
1906 {
1907 
1908 	LIST_INIT(&sdp_list);
1909 	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
1910 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1911 	uma_zone_set_max(sdp_zone, maxsockets);
1912 	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
1913 		EVENTHANDLER_PRI_ANY);
1914 	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
1915 	ib_register_client(&sdp_client);
1916 }
1917 
/* Forward declaration: sdpsw[] refers to sdpdomain, defined further down. */
extern struct domain sdpdomain;

/*
 * Socket-layer entry points for SDP sockets.  Both sdpsw[] entries point
 * at this table.  Sending and receiving go through the SDP-specific
 * sdp_sosend() and sdp_sorecv().
 */
struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};
1939 
/*
 * Protocol switch entries of the SDP domain.  The two entries differ only
 * in pr_protocol (IPPROTO_IP and IPPROTO_TCP); both describe a
 * SOCK_STREAM protocol with the same flags and handlers.
 */
struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};
1960 
/*
 * The SDP protocol domain (address family AF_INET_SDP), initialized by
 * sdp_init() and served by the sdpsw[] protocol switch entries.
 */
struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	/* One past the last element of sdpsw[]. */
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);
1970 
/*
 * Debug level knobs.  NOTE(review): their consumers are not visible in
 * this part of the file; presumably they gate the SDP debug macros --
 * confirm.
 */
int sdp_debug_level = 1;
int sdp_data_debug_level = 0;
1973