/*
 * xref: /freebsd/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
 * (revision 0b3105a37d7adcadcb720112fed4dc4e8040be99)
 */

/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

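/*
 * Locking: SDP uses two levels of locks.  The global sdp_lock rwlock
 * (wrapped by the SDP_LIST_* macros above) protects sdp_list and sdp_count,
 * while each sdp_sock carries its own rwlock taken through SDP_WLOCK() and
 * SDP_RLOCK().  Code that walks the list takes the list lock first and the
 * per-socket lock second, as in sdp_pcbnotifyall() and sdp_dev_rem() below.
 */
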
static MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int sdp_count;

/*
 * Disable asynchronous CMA events for sockets which are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

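/*
 * Bind the socket to a local address through the RDMA CM.  The pcb lock is
 * dropped around rdma_create_id() and rdma_bind_addr() since those calls
 * may sleep; rdma_bind_addr() itself serializes racing binds.  On success
 * the address and port selected by the CM are copied back into the pcb.
 */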
static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP,
		    IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

static void
sdp_pcbfree(struct sdp_sock *ssk)
{
	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	sdp_destroy_cma(ssk);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return ((struct sockaddr *)sin);
}

static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

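/*
 * Deliver the error in errno to every connection whose foreign address
 * matches faddr, via the supplied notify callback (sdp_notify() in
 * practice).  A non-NULL return from the callback means the socket is
 * still live and must be unlocked here.
 */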
static void
sdp_pcbnotifyall(struct in_addr faddr, int errno,
    struct sdp_sock *(*notify)(struct sdp_sock *, int))
{
	struct sdp_sock *ssk, *ssk_temp;

	SDP_LIST_WLOCK();
	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
		SDP_WLOCK(ssk);
		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
			SDP_WUNLOCK(ssk);
			continue;
		}
		if ((ssk->flags & SDP_DESTROY) == 0)
			if ((*notify)(ssk, errno))
				SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close an SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		KASSERT(so->so_state & SS_PROTOREF,
		    ("sdp_closed: !SS_PROTOREF"));
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		so->so_state &= ~SS_PROTOREF;
		sofree(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer-based shutdowns which cannot operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * The 2MSL timer has expired; schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2MSL wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the DREQ-wait timeout and enter the
 * 2MSL wait.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

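/*
 * Initialize the protocol state hung off a new socket: the callout shared
 * by the keepalive, DREQ, and 2MSL timers, the shutdown task, and the
 * receive and transmit rings.
 */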
static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return (0);
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate an SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return (error);
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return (error);
}

/*
 * Initiate an SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop an SDP socket, reporting the specified error.  If the connection is
 * synchronized, then send a RST to the peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_closed() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return (error);
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

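/*
 * Append an mbuf chain carrying a BSDH header to the send socket buffer,
 * coalescing it into the last record when both chains fit within a single
 * SDP packet (xmit_size_goal) and within SDP_MAX_SEND_SGES scatter/gather
 * entries.  Called with the socket buffer lock held.
 */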
static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
	    ("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile; normal sends come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into an mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int)min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (SDP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp & ~MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		goto out;
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			    m != NULL && m->m_len <= len;
			    m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

/*
 * Abort is used to tear down a connection, typically while sitting in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close an SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		SOCK_LOCK(so);
		so->so_state |= SS_PROTOREF;
		SOCK_UNLOCK(so);
		ssk->flags |= SDP_SOCKREF;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	    (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

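/*
 * Handle an incoming urgent message: record the out-of-band mark and, when
 * SO_OOBINLINE is not set, pull the final urgent byte out of the mbuf chain
 * into ssk->iobc where sdp_rcvoob() can find it.
 */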
void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify an SDP socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return (sdp_drop(ssk, error));
}

static void
sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct in_addr faddr;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
}

static int
sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
    struct thread *td)
{
	return (EOPNOTSUPP);
}

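/*
 * Keepalive timer.  This shares the keep2msl callout with the DREQ and 2MSL
 * timers, so a pending callout here may belong to one of those instead; the
 * callout_pending() check below lets a rescheduled timer win.  While the
 * connection stays up with SO_KEEPALIVE set, post a keepalive and re-arm.
 */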
static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		return;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}

void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the SDP socket lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 */
#define	SDP_WLOCK_RECHECK(ssk) do {					\
	SDP_WLOCK(ssk);							\
	if ((ssk)->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while (0)

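/*
 * Socket option handler.  Only SO_KEEPALIVE (at SOL_SOCKET) and TCP_NODELAY
 * (at IPPROTO_TCP) receive SDP-specific treatment; any other TCP-level
 * option returns ENOPROTOOPT.  For illustration, a userland caller would
 * reach the TCP_NODELAY case with something like:
 *
 *	int one = 1;
 *	setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 */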
static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	struct ib_cq_attr attr;

	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	memset(&attr, 0, sizeof(attr));
	attr.moderation.cq_count = sdp_mod_count;
	attr.moderation.cq_period = sdp_mod_usec;

	ib_modify_cq(ssk->rx_ring.cq, &attr, IB_CQ_MODERATION);
}

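/*
 * IB client "add" callback: allocate the per-device protection domain, DMA
 * memory region, and cached FMR pool shared by every SDP socket on the
 * device, and record them in the device's client data.  Failures unwind in
 * reverse order and free the partially constructed state.
 */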
static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(sdp_dev->mr))
		goto out_mr;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dereg_mr(sdp_dev->mr);
out_mr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

static void
sdp_dev_rem(struct ib_device *device)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dereg_mr(sdp_dev->mr);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

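/*
 * Export the list of SDP connections in the same xtcpcb layout as
 * net.inet.tcp.pcblist, so existing consumers of that format (e.g.
 * netstat(1)) should be able to parse SDP sessions as well.
 */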
static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.xt_tp.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_socket);
		else
			bzero(&xt.xt_socket, sizeof xt.xt_socket);
		xt.xt_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

static void
sdp_init(void)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}

extern struct domain sdpdomain;

struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};

struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};

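/*
 * The SDP domain itself.  Userland reaches it through the dedicated address
 * family; illustratively:
 *
 *	int s = socket(AF_INET_SDP, SOCK_STREAM, 0);
 *
 * after which the usual sockaddr_in based bind(2)/connect(2) calls apply,
 * since SDP speaks IPv4 addressing throughout this file.
 */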
struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;