1.\"- 2.\" Copyright (c) 2006 Robert N. M. Watson 3.\" Copyright (c) 2014 Benjamin J. Kaduk 4.\" All rights reserved. 5.\" 6.\" Redistribution and use in source and binary forms, with or without 7.\" modification, are permitted provided that the following conditions 8.\" are met: 9.\" 1. Redistributions of source code must retain the above copyright 10.\" notice, this list of conditions and the following disclaimer. 11.\" 2. Redistributions in binary form must reproduce the above copyright 12.\" notice, this list of conditions and the following disclaimer in the 13.\" documentation and/or other materials provided with the distribution. 14.\" 15.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25.\" SUCH DAMAGE. 
26.\" 27.Dd September 6, 2022 28.Dt SOCKET 9 29.Os 30.Sh NAME 31.Nm socket 32.Nd "kernel socket interface" 33.Sh SYNOPSIS 34.In sys/socket.h 35.In sys/socketvar.h 36.Ft void 37.Fn soabort "struct socket *so" 38.Ft int 39.Fn soaccept "struct socket *so" "struct sockaddr *nam" 40.Ft int 41.Fn socheckuid "struct socket *so" "uid_t uid" 42.Ft int 43.Fn sobind "struct socket *so" "struct sockaddr *nam" "struct thread *td" 44.Ft void 45.Fn soclose "struct socket *so" 46.Ft int 47.Fn soconnect "struct socket *so" "struct sockaddr *nam" "struct thread *td" 48.Ft int 49.Fo socreate 50.Fa "int dom" "struct socket **aso" "int type" "int proto" 51.Fa "struct ucred *cred" "struct thread *td" 52.Fc 53.Ft int 54.Fn sodisconnect "struct socket *so" 55.Ft void 56.Fo sodtor_set 57.Fa "struct socket *so" 58.Fa "void (*func)(struct socket *)" 59.Fc 60.Ft struct sockaddr * 61.Fn sodupsockaddr "const struct sockaddr *sa" "int mflags" 62.Ft void 63.Fn sofree "struct socket *so" 64.Ft void 65.Fn sohasoutofband "struct socket *so" 66.Ft int 67.Fn solisten "struct socket *so" "int backlog" "struct thread *td" 68.Ft void 69.Fn solisten_proto "struct socket *so" "int backlog" 70.Ft int 71.Fn solisten_proto_check "struct socket *so" 72.Ft struct socket * 73.Fn sonewconn "struct socket *head" "int connstatus" 74.Ft int 75.Fo sopoll 76.Fa "struct socket *so" "int events" "struct ucred *active_cred" 77.Fa "struct thread *td" 78.Fc 79.Ft int 80.Fo sopoll_generic 81.Fa "struct socket *so" "int events" "struct ucred *active_cred" 82.Fa "struct thread *td" 83.Fc 84.Ft int 85.Fo soreceive 86.Fa "struct socket *so" "struct sockaddr **psa" "struct uio *uio" 87.Fa "struct mbuf **mp0" "struct mbuf **controlp" "int *flagsp" 88.Fc 89.Ft int 90.Fo soreceive_stream 91.Fa "struct socket *so" "struct sockaddr **paddr" 92.Fa "struct uio *uio" "struct mbuf **mp0" "struct mbuf **controlp" 93.Fa "int *flagsp" 94.Fc 95.Ft int 96.Fo soreceive_dgram 97.Fa "struct socket *so" "struct sockaddr **paddr" 98.Fa "struct uio 
*uio" "struct mbuf **mp0" "struct mbuf **controlp" 99.Fa "int *flagsp" 100.Fc 101.Ft int 102.Fo soreceive_generic 103.Fa "struct socket *so" "struct sockaddr **paddr" 104.Fa "struct uio *uio" "struct mbuf **mp0" "struct mbuf **controlp" 105.Fa "int *flagsp" 106.Fc 107.Ft int 108.Fn soreserve "struct socket *so" "u_long sndcc" "u_long rcvcc" 109.Ft void 110.Fn sorflush "struct socket *so" 111.Ft int 112.Fo sosend 113.Fa "struct socket *so" "struct sockaddr *addr" "struct uio *uio" 114.Fa "struct mbuf *top" "struct mbuf *control" "int flags" "struct thread *td" 115.Fc 116.Ft int 117.Fo sosend_dgram 118.Fa "struct socket *so" "struct sockaddr *addr" 119.Fa "struct uio *uio" "struct mbuf *top" "struct mbuf *control" 120.Fa "int flags" "struct thread *td" 121.Fc 122.Ft int 123.Fo sosend_generic 124.Fa "struct socket *so" "struct sockaddr *addr" 125.Fa "struct uio *uio" "struct mbuf *top" "struct mbuf *control" 126.Fa "int flags" "struct thread *td" 127.Fc 128.Ft int 129.Fn soshutdown "struct socket *so" "int how" 130.Ft void 131.Fn sotoxsocket "struct socket *so" "struct xsocket *xso" 132.Ft void 133.Fn soupcall_clear "struct socket *so" "int which" 134.Ft void 135.Fo soupcall_set 136.Fa "struct socket *so" "int which" 137.Fa "int (*func)(struct socket *, void *, int)" "void *arg" 138.Fc 139.Ft void 140.Fn sowakeup "struct socket *so" "struct sockbuf *sb" 141.In sys/sockopt.h 142.Ft int 143.Fn sosetopt "struct socket *so" "struct sockopt *sopt" 144.Ft int 145.Fn sogetopt "struct socket *so" "struct sockopt *sopt" 146.Ft int 147.Fn sooptcopyin "struct sockopt *sopt" "void *buf" "size_t len" "size_t minlen" 148.Ft int 149.Fn sooptcopyout "struct sockopt *sopt" "const void *buf" "size_t len" 150.Sh DESCRIPTION 151The kernel 152.Nm 153programming interface permits in-kernel consumers to interact with 154local and network socket objects in a manner similar to that permitted using 155the 156.Xr socket 2 157user API. 
158These interfaces are appropriate for use by distributed file systems and 159other network-aware kernel services. 160While the user API operates on file descriptors, the kernel interfaces 161operate directly on 162.Vt "struct socket" 163pointers. 164Some portions of the kernel API exist only to implement the user API, 165and are not expected to be used by kernel code. 166The portions of the socket API used by socket consumers and 167implementations of network protocols will differ; some routines 168are only useful for protocol implementors. 169.Pp 170Except where otherwise indicated, 171.Nm 172functions may sleep, and are not appropriate for use in an interrupt thread 173context or while holding non-sleepable kernel locks. 174.Ss Creating and Destroying Sockets 175A new socket may be created using 176.Fn socreate . 177As with 178.Xr socket 2 , 179arguments specify the requested domain, type, and protocol via 180.Fa dom , type , 181and 182.Fa proto . 183The socket is returned via 184.Fa aso 185on success. 186In addition, the credential used to authorize operations associated with the 187socket will be passed via 188.Fa cred 189(and will be cached for the lifetime of the socket), and the thread 190performing the operation via 191.Fa td . 192.Em Warning : 193authorization of the socket creation operation will be performed 194using the thread credential for some protocols (such as raw sockets). 195.Pp 196Sockets may be closed and freed using 197.Fn soclose , 198which has similar semantics to 199.Xr close 2 . 200.Pp 201In certain circumstances, it is appropriate to destroy a socket without 202waiting for it to disconnect, for which 203.Fn soabort 204is used. 205This is only appropriate for incoming connections which are in a 206partially connected state. 207It must be called on an unreferenced socket, by the thread which 208removed the socket from its listen queue, to prevent races. 209It will call into protocol code, so no socket locks may be held 210over the call. 
211The caller of 212.Fn soabort 213is responsible for setting the VNET context. 214The normal path to freeing a socket is 215.Fn sofree , 216which handles reference counting on the socket. 217It should be called whenever a reference is released, and also whenever 218reference flags are cleared in socket or protocol code. 219Calls to 220.Fn sofree 221should not be made from outside the socket layer; outside callers 222should use 223.Fn soclose 224instead. 225.Ss Connections and Addresses 226The 227.Fn sobind 228function is equivalent to the 229.Xr bind 2 230system call, and binds the socket 231.Fa so 232to the address 233.Fa nam . 234The operation will be authorized using the credential on thread 235.Fa td . 236.Pp 237The 238.Fn soconnect 239function is equivalent to the 240.Xr connect 2 241system call, and initiates a connection on the socket 242.Fa so 243to the address 244.Fa nam . 245The operation will be authorized using the credential on thread 246.Fa td . 247Unlike the user system call, 248.Fn soconnect 249returns immediately; the caller may 250.Xr msleep 9 251on 252.Fa so->so_timeo 253while holding the socket mutex and waiting for the 254.Dv SS_ISCONNECTING 255flag to clear or 256.Fa so->so_error 257to become non-zero. 258If 259.Fn soconnect 260fails, the caller must manually clear the 261.Dv SS_ISCONNECTING 262flag. 263.Pp 264A call to 265.Fn sodisconnect 266disconnects the socket without closing it. 267.Pp 268The 269.Fn soshutdown 270function is equivalent to the 271.Xr shutdown 2 272system call, and causes part or all of a connection on a socket to be closed 273down. 274.Pp 275Sockets are transitioned from non-listening status to listening with 276.Fn solisten . 277.Ss Socket Options 278The 279.Fn sogetopt 280function is equivalent to the 281.Xr getsockopt 2 282system call, and retrieves a socket option on socket 283.Fa so . 
284The 285.Fn sosetopt 286function is equivalent to the 287.Xr setsockopt 2 288system call, and sets a socket option on socket 289.Fa so . 290.Pp 291The second argument in both 292.Fn sogetopt 293and 294.Fn sosetopt 295is the 296.Fa sopt 297pointer to a 298.Vt "struct sockopt" 299describing the socket option operation. 300The caller-allocated structure must be zeroed, and then have its fields 301initialized to specify socket option operation arguments: 302.Bl -tag -width ".Va sopt_valsize" 303.It Va sopt_dir 304Set to 305.Dv SOPT_SET 306or 307.Dv SOPT_GET 308depending on whether this is a set or get operation. 309.It Va sopt_level 310Specify the level in the network stack the operation is targeted at; for 311example, 312.Dv SOL_SOCKET . 313.It Va sopt_name 314Specify the name of the socket option to get or set. 315.It Va sopt_val 316Kernel space pointer to the argument value for the socket option. 317.It Va sopt_valsize 318Size of the argument value in bytes. 319.El 320.Ss Socket Upcalls 321In order for the owner of a socket to be notified when the socket 322is ready to send or receive data, an upcall may be registered on 323the socket. 324The upcall is a function that will be called by the socket framework 325when a socket buffer associated with the given socket is ready for 326reading or writing. 327.Fn soupcall_set 328is used to register a socket upcall. 329The function 330.Va func 331is registered, and the pointer 332.Va arg 333will be passed as its second argument when it is called by the framework. 334The possible values for 335.Va which 336are 337.Dv SO_RCV 338and 339.Dv SO_SND , 340which register upcalls for receive and send events, respectively. 341The upcall function 342.Fn func 343must return either 344.Dv SU_OK 345or 346.Dv SU_ISCONNECTED , 347depending on whether or not a call to 348.Fn soisconnected 349should be made by the socket framework after the upcall returns. 
350The upcall 351.Va func 352cannot call 353.Fn soisconnected 354itself due to lock ordering with the socket buffer lock. 355Only 356.Dv SO_RCV 357upcalls should return 358.Dv SU_ISCONNECTED . 359When a 360.Dv SO_RCV 361upcall returns 362.Dv SU_ISCONNECTED , 363the upcall will be removed from the socket. 364.Pp 365Upcalls are removed from their socket by 366.Fn soupcall_clear . 367The 368.Va which 369argument again specifies whether the sending or receiving upcall is to 370be cleared, with 371.Dv SO_RCV 372or 373.Dv SO_SND . 374.Ss Socket Destructor Callback 375A kernel system can use the 376.Fn sodtor_set 377function to set a destructor for a socket. 378The destructor is called when the socket is about to be freed. 379The destructor is called before the protocol detach routine. 380The destructor can serve as a callback to initiate additional cleanup actions. 381.Ss Socket I/O 382The 383.Fn soreceive 384function is equivalent to the 385.Xr recvmsg 2 386system call, and attempts to receive bytes of data from the socket 387.Fa so , 388optionally blocking while waiting for data if none is ready to read. 389Data may be retrieved directly to kernel or user memory via the 390.Fa uio 391argument, or as an mbuf chain returned to the caller via 392.Fa mp0 , 393avoiding a data copy. 394The 395.Fa uio 396must always be 397.Pf non- Dv NULL . 398If 399.Fa mp0 400is 401.Pf non- Dv NULL , 402only the 403.Fa uio_resid 404of 405.Fa uio 406is used. 407The caller may optionally retrieve a socket address on a protocol with the 408.Dv PR_ADDR 409capability by providing storage via a 410.Pf non- Dv NULL 411.Fa psa 412argument. 413The caller may optionally retrieve control data mbufs via a 414.Pf non- Dv NULL 415.Fa controlp 416argument. 417Optional flags may be passed to 418.Fn soreceive 419via a 420.Pf non- Dv NULL 421.Fa flagsp 422argument, and use the same flag name space as the 423.Xr recvmsg 2 424system call. 
425.Pp 426The 427.Fn sosend 428function is equivalent to the 429.Xr sendmsg 2 430system call, and attempts to send bytes of data via the socket 431.Fa so , 432optionally blocking if data cannot be immediately sent. 433Data may be sent directly from kernel or user memory via the 434.Fa uio 435argument, or as an mbuf chain via 436.Fa top , 437avoiding a data copy. 438Only one of the 439.Fa uio 440or 441.Fa top 442pointers may be 443.Pf non- Dv NULL . 444An optional destination address may be specified via a 445.Pf non- Dv NULL 446.Fa addr 447argument, which may result in an implicit connect if supported by the 448protocol. 449The caller may optionally send control data mbufs via a 450.Pf non- Dv NULL 451.Fa control 452argument. 453Flags may be passed to 454.Fn sosend 455using the 456.Fa flags 457argument, and use the same flag name space as the 458.Xr sendmsg 2 459system call. 460.Pp 461Kernel callers running in an interrupt thread context, or with a mutex held, 462will wish to use non-blocking sockets and pass the 463.Dv MSG_DONTWAIT 464flag in order to prevent these functions from sleeping. 465.Pp 466A socket can be queried for readability, writability, out-of-band data, 467or end-of-file using 468.Fn sopoll . 469The possible values for 470.Va events 471are as for 472.Xr poll 2 , 473with symbolic values 474.Dv POLLIN , 475.Dv POLLPRI , 476.Dv POLLOUT , 477.Dv POLLRDNORM , 478.Dv POLLWRNORM , 479.Dv POLLRDBAND , 480and 481.Dv POLLINIGNEOF 482taken from 483.In sys/poll.h . 484.Pp 485Calls to 486.Fn soaccept 487pass through to the protocol's accept routine to accept an incoming connection. 488.Ss Socket Utility Functions 489The uid of a socket's credential may be compared against a 490.Va uid 491with 492.Fn socheckuid . 493.Pp 494A copy of an existing 495.Vt struct sockaddr 496may be made using 497.Fn sodupsockaddr . 
498.Pp 499Protocol implementations notify the socket layer of the arrival of 500out-of-band data using 501.Fn sohasoutofband , 502so that the socket layer can notify socket consumers of the available data. 503.Pp 504An 505.Dq external-format 506version of a 507.Vt struct socket 508can be created using 509.Fn sotoxsocket , 510suitable for isolating user code from changes in the kernel structure. 511.Ss Protocol Implementations 512Protocols must supply an implementation for 513.Fn solisten ; 514such protocol implementations can call back into the socket layer using 515.Fn solisten_proto_check 516and 517.Fn solisten_proto 518to check and set the socket-layer listen state. 519These callbacks are provided so that the protocol implementation 520can order the socket layer and protocol locks as necessary. 521Protocols must supply an implementation of 522.Fn soreceive ; 523the functions 524.Fn soreceive_stream , 525.Fn soreceive_dgram , 526and 527.Fn soreceive_generic 528are supplied for use by such implementations. 529.Pp 530Protocol implementations can use 531.Fn sonewconn 532to create a socket and attach protocol state to that socket. 533This can be used to create new sockets available for 534.Fn soaccept 535on a listen socket. 536The returned socket has a reference count of zero. 537.Pp 538Protocols must supply an implementation for 539.Fn sopoll ; 540.Fn sopoll_generic 541is provided for use by protocol implementations. 542.Pp 543The functions 544.Fn sosend_dgram 545and 546.Fn sosend_generic 547are supplied to assist in protocol implementations of 548.Fn sosend . 549.Pp 550When a protocol creates a new socket structure, it is necessary to 551reserve socket buffer space for that socket, by calling 552.Fn soreserve . 553The rough inverse of this reservation is performed by 554.Fn sorflush , 555which is called automatically by the socket framework. 
556.Pp 557When a protocol needs to wake up threads waiting for the socket to 558become ready to read or write, variants of 559.Fn sowakeup 560are used. 561The 562.Fn sowakeup 563function should not be called directly by protocol code, instead use the 564wrappers 565.Fn sorwakeup , 566.Fn sorwakeup_locked , 567.Fn sowwakeup , 568and 569.Fn sowwakeup_locked 570for readers and writers, with the corresponding socket buffer lock 571not already locked, or already held, respectively. 572.Pp 573The functions 574.Fn sooptcopyin 575and 576.Fn sooptcopyout 577are useful for transferring 578.Vt struct sockopt 579data between user and kernel code. 580.Sh SEE ALSO 581.Xr bind 2 , 582.Xr close 2 , 583.Xr connect 2 , 584.Xr getsockopt 2 , 585.Xr recv 2 , 586.Xr send 2 , 587.Xr setsockopt 2 , 588.Xr shutdown 2 , 589.Xr socket 2 , 590.Xr ng_ksocket 4 , 591.Xr intr_event 9 , 592.Xr msleep 9 , 593.Xr ucred 9 594.Sh HISTORY 595The 596.Xr socket 2 597system call appeared in 598.Bx 4.2 . 599This manual page was introduced in 600.Fx 7.0 . 601.Sh AUTHORS 602This manual page was written by 603.An Robert Watson 604and 605.An Benjamin Kaduk . 606.Sh BUGS 607The use of explicitly passed credentials, credentials hung from explicitly 608passed threads, the credential on 609.Dv curthread , 610and the cached credential from 611socket creation time is inconsistent, and may lead to unexpected behaviour. 612It is possible that several of the 613.Fa td 614arguments should be 615.Fa cred 616arguments, or simply not be present at all. 617.Pp 618The caller may need to manually clear 619.Dv SS_ISCONNECTING 620if 621.Fn soconnect 622returns an error. 623.Pp 624The 625.Dv MSG_DONTWAIT 626flag is not implemented for 627.Fn sosend , 628and may not always work with 629.Fn soreceive 630when zero copy sockets are enabled. 631.Pp 632This manual page does not describe how to register socket upcalls or monitor 633a socket for readability/writability without using blocking I/O. 
634.Pp 635The 636.Fn soref 637and 638.Fn sorele 639functions are not described, and in most cases should not be used, due to 640confusing and potentially incorrect interactions when 641.Fn sorele 642is last called after 643.Fn soclose . 644