
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>

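/*
 * Global SDP state: a UMA zone for per-connection PCBs, the list of all
 * SDP sockets in the system, and the rwlock that protects that list.
 */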
uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int sdp_count;

/*
 * Disable async. CMA events for sockets which are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

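/*
 * Bind the local endpoint.  The SDP lock is dropped around rdma_bind_addr()
 * because the CMA resolves concurrent bind attempts itself; on success the
 * chosen local address and port are copied back into the PCB.
 */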
static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

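/*
 * Tear down and free a PCB once the socket has been detached: remove it
 * from the global list, release the CMA id, QP and rings, and return the
 * memory to the zone.
 */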
static void
sdp_pcbfree(struct sdp_sock *ssk)
{
	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	sdp_destroy_cma(ssk);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	/* Destroy the PCB lock before, not after, freeing the PCB itself. */
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return ((struct sockaddr *)sin);
}

static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

static void
sdp_pcbnotifyall(struct in_addr faddr, int errno,
    struct sdp_sock *(*notify)(struct sdp_sock *, int))
{
	struct sdp_sock *ssk, *ssk_temp;

	SDP_LIST_WLOCK();
	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
		SDP_WLOCK(ssk);
		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
			SDP_WUNLOCK(ssk);
			continue;
		}
		/*
		 * The notify routine unlocks the PCB itself only when it
		 * frees it; in every other case, including sockets already
		 * marked SDP_DESTROY, the lock must be released here.
		 */
		if ((ssk->flags & SDP_DESTROY) != 0 ||
		    (*notify)(ssk, errno) != NULL)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close an SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		KASSERT(so->so_state & SS_PROTOREF,
		    ("sdp_closed: !SS_PROTOREF"));
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		so->so_state &= ~SS_PROTOREF;
		sofree(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer-based shutdowns which cannot operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * 2msl has expired, schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the 2msl.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

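/*
 * Initialize the protocol-private state for a newly attached socket:
 * the 2msl/keepalive callout, the shutdown task, and the rx/tx rings.
 */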
static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return (0);
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, NULL, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate an SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, NULL, td->td_ucred);
		if (error)
			return (error);
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return (error);
}

/*
 * Initiate an SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop an SDP socket, reporting the specified error.  If the connection
 * is synchronized, then send a RST to the peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = so->so_rcv.sb_cc;
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return (error);
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

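/*
 * Enqueue an mbuf chain on the send socket buffer, coalescing it with the
 * previous record when both fit in a single SDP packet that has not yet
 * been handed to the hardware.
 */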
static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
	    ("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile, normal sends will come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAIT);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAIT, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

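/*
 * Translate MSG_DONTWAIT into the sockbuf locking flags: a non-blocking
 * request must not sleep waiting for the sockbuf lock.
 */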
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAIT, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int)min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (SDP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp & ~MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		goto out;
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sb->sb_cc > 0)
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sb->sb_cc == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sb->sb_cc > 0)
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sb->sb_cc > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sb->sb_cc >= sb->sb_lowat ||
	     sb->sb_cc >= uio->uio_resid ||
	     sb->sb_cc >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sb->sb_cc);
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

/*
 * Abort is used to tear down a connection, typically while it sits in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close an SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		SOCK_LOCK(so);
		so->so_state |= SS_PROTOREF;
		SOCK_UNLOCK(so);
		ssk->flags |= SDP_SOCKREF;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

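/*
 * Note the urgent-data mark for an incoming packet: record the out-of-band
 * byte (unless SO_OOBINLINE leaves it in the stream) and wake up anybody
 * waiting on out-of-band data.
 */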
void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = so->so_rcv.sb_cc + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next)
			;
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify an SDP socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return (sdp_drop(ssk, error));
}

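/*
 * Control input handler: map an ICMP-style command onto an errno and
 * notify every SDP socket connected to the affected foreign address.
 */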
static void
sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct in_addr faddr;

	if (sa->sa_family != AF_INET)
		return;
	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (faddr.s_addr == INADDR_ANY)
		return;

	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
}
static int
sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
    struct thread *td)
{
	return (EOPNOTSUPP);
}

static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled; the lock must still be released below. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}

void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the SDP lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 */
#define	SDP_WLOCK_RECHECK(ssk) do {					\
	SDP_WLOCK(ssk);							\
	if ((ssk)->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while (0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

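/*
 * Default interrupt moderation for the receive completion queue: coalesce
 * up to sdp_mod_count completions into one event, waiting at most
 * sdp_mod_usec microseconds.  Non-positive values leave moderation off.
 */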
int sdp_mod_count = 0;
int sdp_mod_usec = 0;

void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
}

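/*
 * Per-device setup invoked by the IB core when a new RDMA device appears:
 * allocate the protection domain, DMA memory region and FMR pool shared by
 * all SDP sockets on that device.
 */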
static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(sdp_dev->mr))
		goto out_mr;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dereg_mr(sdp_dev->mr);
out_mr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

static void
sdp_dev_rem(struct ib_device *device)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dereg_mr(sdp_dev->mr);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

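/*
 * Export the list of SDP connections in xtcpcb format so that netstat(1)
 * can report them alongside ordinary TCP sockets.
 */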
static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.xt_tp.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_socket);
		else
			bzero(&xt.xt_socket, sizeof xt.xt_socket);
		xt.xt_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

static void
sdp_init(void)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}

extern struct domain sdpdomain;

struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};

struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};

struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);

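/*
 * For reference, a userspace client reaches this protocol by opening a
 * stream socket in the AF_INET_SDP domain and then using ordinary
 * sockaddr_in addresses, as sdp_bind() and sdp_connect() above require
 * AF_INET.  An illustrative sketch (error handling and the port number
 * are hypothetical):
 *
 *	int s = socket(AF_INET_SDP, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = { .sin_family = AF_INET,
 *	    .sin_len = sizeof(sin), .sin_port = htons(5000) };
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(s, (struct sockaddr *)&sin, sizeof(sin));
 */
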
int sdp_debug_level = 1;
int sdp_data_debug_level = 0;