/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)
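
/*
 * sdp_lock protects sdp_list and sdp_count; insertion and removal take the
 * write lock, traversal may take either.  The canonical caller pattern
 * (see sdp_attach() below) is:
 *
 *	SDP_LIST_WLOCK();
 *	LIST_INSERT_HEAD(&sdp_list, ssk, list);
 *	sdp_count++;
 *	SDP_LIST_WUNLOCK();
 */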

MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;
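
/*
 * Both socket buffers are marked SB_AUTOSIZE in sdp_attach(), so these
 * defaults (32KB send, 64KB receive) only seed the initial reservation
 * and may be grown automatically at run time.
 */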

static int sdp_count;

/*
 * Disable asynchronous CMA events for sockets which are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk,
		    RDMA_PS_SDP, IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

static void
sdp_pcbfree(struct sdp_sock *ssk)
{

	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
	KASSERT((ssk->flags & SDP_DESTROY) == 0,
	    ("ssk %p already destroyed", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	sdp_destroy_cma(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return ((struct sockaddr *)sin);
}

static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

static void
sdp_pcbnotifyall(struct in_addr faddr, int errno,
    struct sdp_sock *(*notify)(struct sdp_sock *, int))
{
	struct sdp_sock *ssk, *ssk_temp;

	SDP_LIST_WLOCK();
	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
		SDP_WLOCK(ssk);
		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
			SDP_WUNLOCK(ssk);
			continue;
		}
		if ((ssk->flags & SDP_DESTROY) == 0)
			if ((*notify)(ssk, errno))
				SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close an SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		sorele(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer-based shutdowns which cannot operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * 2msl has expired; schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the 2msl.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
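	/*
	 * CALLOUT_RETURNUNLOCKED: the timeout handlers (sdp_2msl_timeout(),
	 * sdp_dreq_timeout() and sdp_keepalive_timeout()) are entered with
	 * ssk->lock held and are themselves responsible for dropping it
	 * before they return.
	 */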
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return (0);
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	mbufq_init(&ssk->rxctlq, INT_MAX);
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate an SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return (error);
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return (error);
}

/*
 * Initiate an SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop an SDP socket, reporting the specified error.  If the connection
 * is synchronized, then send a RST to the peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_closed() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through *nam.
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return (error);
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
		("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile; normal sends will come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	if (nam != NULL) {
		if (nam->sa_family != AF_INET) {
			if (control)
				m_freem(control);
			m_freem(m);
			return (EAFNOSUPPORT);
		}
		if (nam->sa_len != sizeof(struct sockaddr_in)) {
			if (control)
				m_freem(control);
			m_freem(m);
			return (EINVAL);
		}
	}

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(so, SO_SND);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	SOCK_IO_SEND_UNLOCK(so);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int)min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(so, SO_RCV);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);
	return (error);
}

/*
 * Abort is used to tear down a connection, typically while sitting in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close an SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		ssk->flags |= SDP_SOCKREF;
		soref(so);
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next)
			;
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify an SDP socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return (sdp_drop(ssk, error));
}

static void
sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct in_addr faddr;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
}

static int
sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
    struct thread *td)
{
	return (EOPNOTSUPP);
}

static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled; we still must drop the lock. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}

void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the sdp socket lock before performing copyin
 * on socket option arguments.  When it re-acquires the lock after the
 * copy, it has to revalidate that the connection is still valid for the
 * socket option.
 */
#define	SDP_WLOCK_RECHECK(ssk) do {					\
	SDP_WLOCK(ssk);							\
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while (0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK
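
/*
 * Only TCP_NODELAY is handled at the IPPROTO_TCP level; a userland sketch
 * (not part of this file) of toggling it on an SDP socket descriptor s:
 *
 *	int one = 1;
 *	setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 */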

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
}

static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device, 0);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	memset(&param, 0, sizeof(param));
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

static void
sdp_dev_rem(struct ib_device *device, void *client_data)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT | CTLFLAG_MPSAFE,
    0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");
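
/*
 * The pcblist vector is consumed the same way as TCP's; a userland sketch
 * (not part of this file) sizing the buffer before a second read:
 *
 *	size_t len;
 *	sysctlbyname("net.inet.sdp.pcblist", NULL, &len, NULL, 0);
 */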

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

static void
sdp_init(void *arg __unused)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}
SYSINIT(sdp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, sdp_init, NULL);

extern struct domain sdpdomain;

struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};

struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};

struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);
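
/*
 * With the domain registered above, userland reaches SDP by opening a
 * stream socket in the AF_INET_SDP family and then using it like a TCP
 * socket (bind/listen/connect/send).  A sketch, not part of this file:
 *
 *	int s = socket(AF_INET_SDP, SOCK_STREAM, 0);
 */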

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;