xref: /freebsd/sys/kern/uipc_sockbuf.c (revision b3a1f9373a31b644f8a65de1ba35929af3f6a9fe)
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_mac.h"
#include "opt_param.h"

#include <sys/param.h>
#include <sys/aio.h> /* for aio_swake proto */
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/file.h>	/* for maxfiles */
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

int	maxsockets;

void (*aio_swake)(struct socket *, struct sockbuf *);

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;
static	u_long sb_max_adj =
    SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */

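/*
 * Worked example of the sb_max_adj scaling (assuming the common i386-era
 * defaults SB_MAX = 256*1024, MSIZE = 256 and MCLBYTES = 2048; the
 * values are platform- and configuration-dependent):
 *
 *	sb_max_adj = 262144 * 2048 / (256 + 2048)
 *		   = 536870912 / 2304
 *		   = 233016
 *
 * Each MCLBYTES of buffered data can cost up to MSIZE + MCLBYTES of
 * memory (a cluster plus the mbuf that references it), so capping the
 * data count at sb_max_adj keeps worst-case memory use within sb_max.
 */
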
static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * Procedures to manipulate the state flags of a socket and do appropriate
 * wakeups.  The normal sequence on the active (originating) side is that
 * soisconnecting() is called during processing of a connect() call,
 * resulting in an eventual call to soisconnected() if/when the connection
 * is established.  When the connection is torn down, soisdisconnecting()
 * is called during processing of a disconnect() call, and soisdisconnected()
 * is called when the connection to the peer is totally severed.  The
 * semantics of these routines are such that connectionless protocols can
 * call soisconnected() and soisdisconnected() only, bypassing the
 * in-progress calls when setting up a ``connection'' takes no time.
 *
 * From the passive side, a socket is created with two queues of sockets:
 * so_incomp for connections in progress and so_comp for connections
 * already made and awaiting user acceptance.  As a protocol is preparing
 * incoming connections, it creates a socket structure queued on so_incomp
 * by calling sonewconn().  When the connection is established,
 * soisconnected() is called, and transfers the socket structure to
 * so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either so_incomp or so_comp,
 * these sockets are dropped.
 *
 * If higher-level protocols are implemented in the kernel, the wakeups
 * done here will sometimes cause software-interrupt process scheduling.
 */

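/*
 * Illustrative sketch of the transitions described above (an assumed
 * caller, not code from this file), for a hypothetical connection-
 * oriented socket "so" on the active side:
 *
 *	soisconnecting(so);	connect() issued, handshake in flight
 *	...
 *	soisconnected(so);	handshake done; sleepers on so_timeo awaken
 *	...
 *	soisdisconnecting(so);	disconnect() issued, teardown in flight
 *	...
 *	soisdisconnected(so);	fully severed; both buffers are marked
 *				SBS_CANTSENDMORE/SBS_CANTRCVMORE
 *
 * A connectionless protocol would call only soisconnected() and
 * soisdisconnected().
 */
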
void
soisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
	SOCK_UNLOCK(so);
}

void
soisconnected(struct socket *so)
{
	struct socket *head;

	ACCEPT_LOCK();
	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;
	head = so->so_head;
	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			SOCK_UNLOCK(so);
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			so->so_qstate &= ~SQ_INCOMP;
			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
			head->so_qlen++;
			so->so_qstate |= SQ_COMP;
			ACCEPT_UNLOCK();
			sorwakeup(head);
			wakeup_one(&head->so_timeo);
		} else {
			ACCEPT_UNLOCK();
			so->so_upcall =
			    head->so_accf->so_accept_filter->accf_callback;
			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
			so->so_rcv.sb_flags |= SB_UPCALL;
			so->so_options &= ~SO_ACCEPTFILTER;
			SOCK_UNLOCK(so);
			so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
		}
		return;
	}
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	wakeup(&so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}

void
soisdisconnecting(struct socket *so)
{

	/*
	 * XXXRW: This code assumes that SOCK_LOCK(so) and
	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;
	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sowwakeup_locked(so);
	wakeup(&so->so_timeo);
}

void
soisdisconnected(struct socket *so)
{

	/*
	 * XXXRW: This code assumes that SOCK_LOCK(so) and
	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISDISCONNECTED;
	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
	sowwakeup_locked(so);
	wakeup(&so->so_timeo);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return it.
 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int over;

	ACCEPT_LOCK();
	over = (head->so_qlen > 3 * head->so_qlimit / 2);
	ACCEPT_UNLOCK();
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over)
#else
	if (over)
#endif
		return (NULL);
	so = soalloc(M_NOWAIT);
	if (so == NULL)
		return (NULL);
	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	SOCK_LOCK(head);
	mac_create_socket_from_socket(head, so);
	SOCK_UNLOCK(head);
#endif
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
	    NULL, NULL, NULL);
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
	    NULL, NULL, NULL);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		return (NULL);
	}
	so->so_state |= connstatus;
	ACCEPT_LOCK();
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_qstate |= SQ_COMP;
		head->so_qlen++;
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->so_incqlen > head->so_qlimit) {
			struct socket *sp;
			sp = TAILQ_FIRST(&head->so_incomp);
			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
			head->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_qstate |= SQ_INCOMP;
		head->so_incqlen++;
	}
	ACCEPT_UNLOCK();
	if (connstatus) {
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	}
	return (so);
}

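/*
 * Illustrative sketch (an assumed caller, not code from this file): a
 * protocol that has just completed its handshake for a listening socket
 * "head" might create the child socket with:
 *
 *	struct socket *so;
 *
 *	so = sonewconn(head, SS_ISCONNECTED);
 *	if (so == NULL)
 *		goto drop;	(queue over limit or out of resources)
 *
 * Passing a connstatus of 0 instead queues the new socket on so_incomp;
 * a later soisconnected() moves it to so_comp, where accept() finds it.
 */
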
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it is normally applied to a socket by the protocol code
 * (in the case of PRU_SHUTDOWN) when the user informs the system that
 * no more data is to be sent.  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore_locked(struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(&so->so_snd);

	so->so_snd.sb_state |= SBS_CANTSENDMORE;
	sowwakeup_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
}

void
socantsendmore(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_snd);
	socantsendmore_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
}

void
socantrcvmore_locked(struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
	sorwakeup_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
}

void
socantrcvmore(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_rcv);
	socantrcvmore_locked(so);
	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
}

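/*
 * Illustrative sketch (assumed usage, not code from this file): a
 * protocol that learns the peer will send no more data (e.g. a received
 * TCP FIN) typically marks the receive side:
 *
 *	socantrcvmore(so);
 *
 * after which readers drain any data already queued and then see
 * end-of-file instead of blocking in sbwait().
 */
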
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sb->sb_flags |= SB_WAIT;
	return (msleep(&sb->sb_cc, &sb->sb_mtx,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    sb->sb_timeo));
}

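/*
 * Illustrative sketch (assumed usage, not code from this file): callers
 * hold the socket buffer lock, test their condition, and sleep until the
 * buffer changes, e.g. a receiver waiting for data:
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	while (so->so_rcv.sb_cc == 0) {
 *		(check for SBS_CANTRCVMORE, errors, non-blocking mode)
 *		error = sbwait(&so->so_rcv);
 *		if (error)
 *			break;
 *	}
 *	SOCKBUF_UNLOCK(&so->so_rcv);
 *
 * sowakeup() issues the matching wakeup(&sb->sb_cc) when SB_WAIT is set.
 */
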
/*
 * Acquire the SB_LOCK I/O serialization lock on a sockbuf whose mutex
 * is already held; return any error returned from msleep() (EINTR).
 */
int
sb_lock(struct sockbuf *sb)
{
	int error;

	SOCKBUF_LOCK_ASSERT(sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		error = msleep(&sb->sb_flags, &sb->sb_mtx,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
		    "sblock", 0);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}

/*
 * Wakeup processes waiting on a socket buffer.  Do asynchronous
 * notification via SIGIO if the socket has the SS_ASYNC flag set.
 *
 * Called with the socket buffer lock held; will release the lock by the end
 * of the function.  This allows the caller to acquire the socket buffer lock
 * while testing for the need for various sorts of wakeup and hold it through
 * to the point where it's no longer required.  We currently hold the lock
 * through calls out to other subsystems (with the exception of kqueue), and
 * then release it to avoid lock order issues.  It's not clear that's
 * correct.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);

	selwakeuppri(&sb->sb_sel, PSOCK);
	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup(&sb->sb_cc);
	}
	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
	SOCKBUF_UNLOCK(sb);
	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGIO, 0);
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
	if (sb->sb_flags & SB_AIO)
		aio_swake(so, sb);
	mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
}

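/*
 * Note (assumed context): protocols do not normally call sowakeup()
 * directly.  The sorwakeup()/sowwakeup() macros and their _locked
 * variants (sys/socketvar.h) first test sb_notify() to decide whether a
 * wakeup is needed at all, e.g.:
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	(queue new data with sbappend*_locked())
 *	sorwakeup_locked(so);	(invokes sowakeup(), which drops the lock)
 */
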
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	struct thread *td = curthread;

	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
		goto bad;
	if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
bad2:
	sbrelease_locked(&so->so_snd, so);
bad:
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (ENOBUFS);
}

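/*
 * Illustrative sketch (assumed usage, not code from this file): a
 * protocol attach routine typically sizes both buffers once, using
 * protocol defaults; the identifiers below are hypothetical:
 *
 *	static u_long my_sendspace = 8192;
 *	static u_long my_recvspace = 8192;
 *
 *	error = soreserve(so, my_sendspace, my_recvspace);
 *	if (error)
 *		return (error);	(ENOBUFS: a limit would be exceeded)
 */
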
static int
sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	u_long old_sb_max = sb_max;

	error = SYSCTL_OUT(req, arg1, sizeof(u_long));
	if (error || !req->newptr)
		return (error);
	error = SYSCTL_IN(req, arg1, sizeof(u_long));
	if (error)
		return (error);
	if (sb_max < MSIZE + MCLBYTES) {
		sb_max = old_sb_max;
		return (EINVAL);
	}
	sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
	return (0);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
    struct thread *td)
{
	rlim_t sbsize_limit;

	SOCKBUF_LOCK_ASSERT(sb);

	/*
	 * td will only be NULL when we're in an interrupt
	 * (e.g. in tcp_input()).
	 */
	if (cc > sb_max_adj)
		return (0);
	if (td != NULL) {
		PROC_LOCK(td->td_proc);
		sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE);
		PROC_UNLOCK(td->td_proc);
	} else
		sbsize_limit = RLIM_INFINITY;
	if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
	    sbsize_limit))
		return (0);
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}

int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
    struct thread *td)
{
	int error;

	SOCKBUF_LOCK(sb);
	error = sbreserve_locked(sb, cc, so, td);
	SOCKBUF_UNLOCK(sb);
	return (error);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease_locked(struct sockbuf *sb, struct socket *so)
{

	SOCKBUF_LOCK_ASSERT(sb);

	sbflush_locked(sb);
	(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
	    RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	SOCKBUF_LOCK(sb);
	sbrelease_locked(sb, so);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Routines to add and remove data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that the data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendcontrol()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer with m_copy for output to a peer, and then removed from
 * the socket buffer with sbdrop() or sbdroprecord() when the data is
 * acknowledged by the peer.
 */

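/*
 * Illustrative sketch (assumed usage, not code from this file): a
 * reliable protocol keeps sent data queued until acknowledged, copying
 * it for transmission and dropping it on acknowledgement ("off", "len"
 * and "acked" are hypothetical):
 *
 *	SOCKBUF_LOCK(&so->so_snd);
 *	m = m_copy(so->so_snd.sb_mb, off, len);	(copy for transmission)
 *	SOCKBUF_UNLOCK(&so->so_snd);
 *	(transmit m; later an acknowledgement for "acked" bytes arrives)
 *	sbdrop(&so->so_snd, acked);
 *	sowwakeup(so);		(writers may now have room)
 */
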
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
			__func__, sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("%s from %s:%u", __func__, file, line);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *file, int line)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("%s: sb_mb %p sb_mbtail %p last %p\n",
			__func__, sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("%s from %s:%u", __func__, file, line);
	}
}
#endif /* SOCKBUF_DEBUG */

#define SBLINKRECORD(sb, m0) do {					\
	SOCKBUF_LOCK_ASSERT(sb);					\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend_locked(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m == NULL)
		return;

	SBLASTRECORDCHK(sb);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		if ((n = sb->sb_lastrecord) != NULL) {
			do {
				if (n->m_flags & M_EOR) {
					sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
					return;
				}
			} while (n->m_next && (n = n->m_next));
		} else {
			/*
			 * If this is the first record in the socket buffer,
			 * it's also the last record.
			 */
			sb->sb_lastrecord = m;
		}
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb);
}

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{

	SOCKBUF_LOCK(sb);
	sbappend_locked(sb, m);
	SOCKBUF_UNLOCK(sb);
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
{
	SOCKBUF_LOCK_ASSERT(sb);

	KASSERT(m->m_nextpkt == NULL, ("sbappendstream 0"));
	KASSERT(sb->sb_mb == sb->sb_lastrecord, ("sbappendstream 1"));

	SBLASTMBUFCHK(sb);

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb);
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	SOCKBUF_LOCK(sb);
	sbappendstream_locked(sb, m);
	SOCKBUF_UNLOCK(sb);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	u_long len = 0, mbcnt = 0;

	SOCKBUF_LOCK_ASSERT(sb);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %u || mbcnt %lu != %u\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 == NULL)
		return;
	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb);
	SBLINKRECORD(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
}

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{

	SOCKBUF_LOCK(sb);
	sbappendrecord_locked(sb, m0);
	SOCKBUF_UNLOCK(sb);
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob_locked(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;
	struct mbuf **mp;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 == NULL)
		return;
	for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
		m = *mp;
again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{

	SOCKBUF_LOCK(sb);
	sbinsertoob_locked(sb, m0);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	SOCKBUF_LOCK_ASSERT(sb);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr_locked");
	if (m0)
		space += m0->m_pkthdr.len;
	space += m_length(control, &n);

	if (space > sbspace(sb))
		return (0);
#if MSIZE <= 256
	if (asa->sa_len > MLEN)
		return (0);
#endif
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb);

	SBLASTRECORDCHK(sb);
	return (1);
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	int retval;

	SOCKBUF_LOCK(sb);
	retval = sbappendaddr_locked(sb, asa, m0, control);
	SOCKBUF_UNLOCK(sb);
	return (retval);
}

int
sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *n, *mlast;
	int space;

	SOCKBUF_LOCK_ASSERT(sb);

	if (control == NULL)
		panic("sbappendcontrol_locked");
	space = m_length(control, &n) + m_length(m0, NULL);

	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb);

	for (m = control; m->m_next; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb);

	SBLASTRECORDCHK(sb);
	return (1);
}

int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	int retval;

	SOCKBUF_LOCK(sb);
	retval = sbappendcontrol_locked(sb, m0, control);
	SOCKBUF_UNLOCK(sb);
	return (retval);
}

/*
 * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
 * (n).  If (n) is NULL, the buffer is presumed empty.
 *
 * When the data is compressed, mbufs in the chain may be handled in one of
 * three ways:
 *
 * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
 *     record boundary, and no change in data type).
 *
 * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
 *     an mbuf already in the socket buffer.  This can occur if an
 *     appropriate mbuf exists, there is room, and no merging of data types
 *     will occur.
 *
 * (3) The mbuf may be appended to the end of the existing mbuf chain.
 *
 * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
 * end-of-record.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	SOCKBUF_LOCK_ASSERT(sb);

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    M_WRITABLE(n) &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
				/* XXX: Probably don't need. */
				sb->sb_ctl += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
		n->m_flags |= eor;
	}
	SBLASTMBUFCHK(sb);
}

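/*
 * Illustrative walk-through of the three cases above (assumed sizes):
 * appending a 10-byte mbuf when the tail mbuf "n" is writable, of the
 * same type, carries no M_EOR, and has at least 10 bytes of trailing
 * space copies the data into "n" and frees the new mbuf (case 2); a
 * zero-length mbuf with no record boundary and no type change is simply
 * freed (case 1); anything else is linked after "n" and becomes the new
 * sb_mbtail (case 3).
 */
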
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush_locked(struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);

	if (sb->sb_flags & SB_LOCK)
		panic("sbflush_locked: locked");
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever.  Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop_locked(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
		panic("sbflush_locked: cc %u || mb %p || mbcnt %u",
		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
}

void
sbflush(struct sockbuf *sb)
{

	SOCKBUF_LOCK(sb);
	sbflush_locked(sb);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop_locked(struct sockbuf *sb, int len)
{
	struct mbuf *m;
	struct mbuf *next;

	SOCKBUF_LOCK_ASSERT(sb);

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
				sb->sb_ctl -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		m = m_free(m);
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		m = m_free(m);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL) {
		sb->sb_lastrecord = m;
	}
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct sockbuf *sb, int len)
{

	SOCKBUF_LOCK(sb);
	sbdrop_locked(sb, len);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord_locked(struct sockbuf *sb)
{
	struct mbuf *m;

	SOCKBUF_LOCK_ASSERT(sb);

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			m = m_free(m);
		} while (m);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{

	SOCKBUF_LOCK(sb);
	sbdroprecord_locked(sb);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(caddr_t p, int size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MCLBYTES)
		return (NULL);
	if (CMSG_SPACE((u_int)size) > MLEN)
		m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
	else
		m = m_get(M_DONTWAIT, MT_CONTROL);
	if (m == NULL)
		return (NULL);
	cp = mtod(m, struct cmsghdr *);
	m->m_len = 0;
	KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
	    ("sbcreatecontrol: short mbuf"));
	if (p != NULL)
		(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}

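/*
 * Illustrative sketch (assumed usage, not code from this file): building
 * a control mbuf that carries the destination address of a received UDP
 * datagram, in the style of the IP input path:
 *
 *	struct mbuf *opts;
 *
 *	opts = sbcreatecontrol((caddr_t)&ip->ip_dst,
 *	    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
 *	if (opts != NULL)
 *		(hand opts to sbappendaddr() as the control chain)
 */
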
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
void
pru_abort_notsupp(struct socket *so)
{

}

int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return (EOPNOTSUPP);
}

int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{
	return (EOPNOTSUPP);
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	return (EOPNOTSUPP);
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	return (EOPNOTSUPP);
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return (EOPNOTSUPP);
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
	struct ifnet *ifp, struct thread *td)
{
	return (EOPNOTSUPP);
}

void
pru_detach_notsupp(struct socket *so)
{

}

int
pru_disconnect_notsupp(struct socket *so)
{
	return (EOPNOTSUPP);
}

int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{
	return (EOPNOTSUPP);
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return (EOPNOTSUPP);
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return (EOPNOTSUPP);
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return (EOPNOTSUPP);
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
	struct sockaddr *addr, struct mbuf *control, struct thread *td)
{
	return (EOPNOTSUPP);
}

/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return (0);
}

int
pru_shutdown_notsupp(struct socket *so)
{
	return (EOPNOTSUPP);
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return (EOPNOTSUPP);
}

int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
	struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	return (EOPNOTSUPP);
}

int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
	struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
	int *flagsp)
{
	return (EOPNOTSUPP);
}

int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
	struct thread *td)
{
	return (EOPNOTSUPP);
}

/*
 * For protocol types that don't keep cached copies of labels in their
 * pcbs, provide a null sosetlabel that does a NOOP.
 */
void
pru_sosetlabel_null(struct socket *so)
{

}

/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
sodupsockaddr(const struct sockaddr *sa, int mflags)
{
	struct sockaddr *sa2;

	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return (sa2);
}

/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	xso->xso_protocol = so->so_proto->pr_protocol;
	xso->xso_family = so->so_proto->pr_domain->dom_family;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_cred->cr_uid;
}

/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = sb->sb_timeo;
}

/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
    &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");

static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
	if (error == 0 && req->newptr) {
		if (newmaxsockets > maxsockets) {
			maxsockets = newmaxsockets;
			if (maxsockets > ((maxfiles / 4) * 3)) {
				maxfiles = (maxsockets * 5) / 4;
				maxfilesperproc = (maxfiles * 9) / 10;
			}
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");
SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");

/*
 * Initialise maxsockets.
 */
static void
init_maxsockets(void *ignored)
{

	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
1547