1.\"- 2.\" Copyright (c) 2006 Robert N. M. Watson 3.\" Copyright (c) 2014 Benjamin J. Kaduk 4.\" All rights reserved. 5.\" 6.\" Redistribution and use in source and binary forms, with or without 7.\" modification, are permitted provided that the following conditions 8.\" are met: 9.\" 1. Redistributions of source code must retain the above copyright 10.\" notice, this list of conditions and the following disclaimer. 11.\" 2. Redistributions in binary form must reproduce the above copyright 12.\" notice, this list of conditions and the following disclaimer in the 13.\" documentation and/or other materials provided with the distribution. 14.\" 15.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25.\" SUCH DAMAGE. 
26.\" 27.\" $FreeBSD$ 28.\" 29.Dd May 26, 2014 30.Dt SOCKET 9 31.Os 32.Sh NAME 33.Nm socket 34.Nd "kernel socket interface" 35.Sh SYNOPSIS 36.In sys/socket.h 37.In sys/socketvar.h 38.Ft void 39.Fn soabort "struct socket *so" 40.Ft int 41.Fn soaccept "struct socket *so" "struct sockaddr **nam" 42.Ft int 43.Fn socheckuid "struct socket *so" "uid_t uid" 44.Ft int 45.Fn sobind "struct socket *so" "struct sockaddr *nam" "struct thread *td" 46.Ft void 47.Fn soclose "struct socket *so" 48.Ft int 49.Fn soconnect "struct socket *so" "struct sockaddr *nam" "struct thread *td" 50.Ft int 51.Fo socreate 52.Fa "int dom" "struct socket **aso" "int type" "int proto" 53.Fa "struct ucred *cred" "struct thread *td" 54.Fc 55.Ft int 56.Fn sodisconnect "struct socket *so" 57.Ft struct sockaddr * 58.Fn sodupsockaddr "const struct sockaddr *sa" "int mflags" 59.Ft void 60.Fn sofree "struct socket *so" 61.Ft void 62.Fn sohasoutofband "struct socket *so" 63.Ft int 64.Fn solisten "struct socket *so" "int backlog" "struct thread *td" 65.Ft void 66.Fn solisten_proto "struct socket *so" "int backlog" 67.Ft int 68.Fn solisten_proto_check "struct socket *so" 69.Ft struct socket * 70.Fn sonewconn "struct socket *head" "int connstatus" 71.Ft int 72.Fo sopoll 73.Fa "struct socket *so" "int events" "struct ucred *active_cred" 74.Fa "struct thread *td" 75.Fc 76.Ft int 77.Fo sopoll_generic 78.Fa "struct socket *so" "int events" "struct ucred *active_cred" 79.Fa "struct thread *td" 80.Fc 81.Ft int 82.Fo soreceive 83.Fa "struct socket *so" "struct sockaddr **psa" "struct uio *uio" 84.Fa "struct mbuf **mp0" "struct mbuf **controlp" "int *flagsp" 85.Fc 86.Ft int 87.Fo soreceive_stream 88.Fa "struct socket *so" "struct sockaddr **paddr" 89.Fa "struct uio *uio" "struct mbuf **mp0" "struct mbuf **controlp" 90.Fa "int *flagsp" 91.Fc 92.Ft int 93.Fo soreceive_dgram 94.Fa "struct socket *so" "struct sockaddr **paddr" 95.Fa "struct uio *uio" "struct mbuf **mp0" "struct mbuf **controlp" 96.Fa "int *flagsp" 97.Fc 
98.Ft int 99.Fo soreceive_generic 100.Fa "struct socket *so" "struct sockaddr **paddr" 101.Fa "struct uio *uio" "struct mbuf **mp0" "struct mbuf **controlp" 102.Fa "int *flagsp" 103.Fc 104.Ft int 105.Fn soreserve "struct socket *so" "u_long sndcc" "u_long rcvcc" 106.Ft void 107.Fn sorflush "struct socket *so" 108.Ft int 109.Fo sosend 110.Fa "struct socket *so" "struct sockaddr *addr" "struct uio *uio" 111.Fa "struct mbuf *top" "struct mbuf *control" "int flags" "struct thread *td" 112.Fc 113.Ft int 114.Fo sosend_dgram 115.Fa "struct socket *so" "struct sockaddr *addr" 116.Fa "struct uio *uio" "struct mbuf *top" "struct mbuf *control" 117.Fa "int flags" "struct thread *td" 118.Fc 119.Ft int 120.Fo sosend_generic 121.Fa "struct socket *so" "struct sockaddr *addr" 122.Fa "struct uio *uio" "struct mbuf *top" "struct mbuf *control" 123.Fa "int flags" "struct thread *td" 124.Fc 125.Ft int 126.Fn soshutdown "struct socket *so" "int how" 127.Ft void 128.Fn sotoxsocket "struct socket *so" "struct xsocket *xso" 129.Ft void 130.Fn soupcall_clear "struct socket *so" "int which" 131.Ft void 132.Fo soupcall_set 133.Fa "struct socket *so" "int which" 134.Fa "int (*func)(struct socket *, void *, int)" "void *arg" 135.Fc 136.Ft void 137.Fn sowakeup "struct socket *so" "struct sockbuf *sb" 138.In sys/sockopt.h 139.Ft int 140.Fn sosetopt "struct socket *so" "struct sockopt *sopt" 141.Ft int 142.Fn sogetopt "struct socket *so" "struct sockopt *sopt" 143.Ft int 144.Fn sooptcopyin "struct sockopt *sopt" "void *buf" "size_t len" "size_t minlen" 145.Ft int 146.Fn sooptcopyout "struct sockopt *sopt" "const void *buf" "size_t len" 147.Sh DESCRIPTION 148The kernel 149.Nm 150programming interface permits in-kernel consumers to interact with 151local and network socket objects in a manner similar to that permitted using 152the 153.Xr socket 2 154user API. 155These interfaces are appropriate for use by distributed file systems and 156other network-aware kernel services. 
157While the user API operates on file descriptors, the kernel interfaces 158operate directly on 159.Vt "struct socket" 160pointers. 161Some portions of the kernel API exist only to implement the user API, 162and are not expected to be used by kernel code. 163The portions of the socket API used by socket consumers and 164implementations of network protocols will differ; some routines 165are only useful for protocol implementors. 166.Pp 167Except where otherwise indicated, 168.Nm 169functions may sleep, and are not appropriate for use in an 170.Xr ithread 9 171context or while holding non-sleepable kernel locks. 172.Ss Creating and Destroying Sockets 173A new socket may be created using 174.Fn socreate . 175As with 176.Xr socket 2 , 177arguments specify the requested domain, type, and protocol via 178.Fa dom , type , 179and 180.Fa proto . 181The socket is returned via 182.Fa aso 183on success. 184In addition, the credential used to authorize operations associated with the 185socket will be passed via 186.Fa cred 187(and will be cached for the lifetime of the socket), and the thread 188performing the operation via 189.Fa td . 190.Em Warning : 191authorization of the socket creation operation will be performed 192using the thread credential for some protocols (such as raw sockets). 193.Pp 194Sockets may be closed and freed using 195.Fn soclose , 196which has similar semantics to 197.Xr close 2 . 198.Pp 199In certain circumstances, it is appropriate to destroy a socket without 200waiting for it to disconnect, for which 201.Fn soabort 202is used. 203This is only appropriate for incoming connections which are in a 204partially connected state. 205It must be called on an unreferenced socket, by the thread which 206removed the socket from its listen queue, to prevent races. 207It will call into protocol code, so no socket locks may be held 208over the call. 209The caller of 210.Fn soabort 211is responsible for setting the VNET context. 
212The normal path to freeing a socket is 213.Fn sofree , 214which handles reference counting on the socket. 215It should be called whenever a reference is released, and also whenever 216reference flags are cleared in socket or protocol code. 217Calls to 218.Fn sofree 219should not be made from outside the socket layer; outside callers 220should use 221.Fn soclose 222instead. 223.Ss Connections and Addresses 224The 225.Fn sobind 226function is equivalent to the 227.Xr bind 2 228system call, and binds the socket 229.Fa so 230to the address 231.Fa nam . 232The operation will be authorized using the credential on thread 233.Fa td . 234.Pp 235The 236.Fn soconnect 237function is equivalent to the 238.Xr connect 2 239system call, and initiates a connection on the socket 240.Fa so 241to the address 242.Fa nam . 243The operation will be authorized using the credential on thread 244.Fa td . 245Unlike the user system call, 246.Fn soconnect 247returns immediately; the caller may 248.Xr msleep 9 249on 250.Fa so->so_timeo 251while holding the socket mutex and waiting for the 252.Dv SS_ISCONNECTING 253flag to clear or 254.Fa so->so_error 255to become non-zero. 256If 257.Fn soconnect 258fails, the caller must manually clear the 259.Dv SS_ISCONNECTING 260flag. 261.Pp 262A call to 263.Fn sodisconnect 264disconnects the socket without closing it. 265.Pp 266The 267.Fn soshutdown 268function is equivalent to the 269.Xr shutdown 2 270system call, and causes part or all of a connection on a socket to be closed 271down. 272.Pp 273Sockets are transitioned from non-listening status to listening with 274.Fn solisten . 275.Ss Socket Options 276The 277.Fn sogetopt 278function is equivalent to the 279.Xr getsockopt 2 280system call, and retrieves a socket option on socket 281.Fa so . 282The 283.Fn sosetopt 284function is equivalent to the 285.Xr setsockopt 2 286system call, and sets a socket option on socket 287.Fa so . 
288.Pp 289The second argument in both 290.Fn sogetopt 291and 292.Fn sosetopt 293is the 294.Fa sopt 295pointer to a 296.Vt "struct sockopt" 297describing the socket option operation. 298The caller-allocated structure must be zeroed, and then have its fields 299initialized to specify socket option operation arguments: 300.Bl -tag -width ".Va sopt_valsize" 301.It Va sopt_dir 302Set to 303.Dv SOPT_SET 304or 305.Dv SOPT_GET 306depending on whether this is a set or get operation. 307.It Va sopt_level 308Specify the level in the network stack the operation is targeted at; for 309example, 310.Dv SOL_SOCKET . 311.It Va sopt_name 312Specify the name of the socket option to get or set. 313.It Va sopt_val 314Kernel space pointer to the argument value for the socket option. 315.It Va sopt_valsize 316Size of the argument value in bytes. 317.El 318.Ss Socket Upcalls 319In order for the owner of a socket to be notified when the socket 320is ready to send or receive data, an upcall may be registered on 321the socket. 322The upcall is a function that will be called by the socket framework 323when a socket buffer associated with the given socket is ready for 324reading or writing. 325.Fn soupcall_set 326is used to register a socket upcall. 327The function 328.Va func 329is registered, and the pointer 330.Va arg 331will be passed as its second argument when it is called by the framework. 332The possible values for 333.Va which 334are 335.Dv SO_RCV 336and 337.Dv SO_SND , 338which register upcalls for receive and send events, respectively. 339The upcall function 340.Fn func 341must return either 342.Dv SU_OK 343or 344.Dv SU_ISCONNECTED , 345depending on whether or not a call to 346.Fn soisconnected 347should be made by the socket framework after the upcall returns. 348The upcall 349.Va func 350cannot call 351.Fn soisconnected 352itself due to lock ordering with the socket buffer lock. 353Only 354.Dv SO_RCV 355upcalls should return 356.Dv SU_ISCONNECTED . 
357When a 358.Dv SO_RCV 359upcall returns 360.Dv SU_ISCONNECTED , 361the upcall will be removed from the socket. 362.Pp 363Upcalls are removed from their socket by 364.Fn soupcall_clear . 365The 366.Va which 367argument again specifies whether the sending or receiving upcall is to 368be cleared, with 369.Dv SO_RCV 370or 371.Dv SO_SND . 372.Ss Socket I/O 373The 374.Fn soreceive 375function is equivalent to the 376.Xr recvmsg 2 377system call, and attempts to receive bytes of data from the socket 378.Fa so , 379optionally blocking while waiting for data if none is ready to read. 380Data may be retrieved directly to kernel or user memory via the 381.Fa uio 382argument, or as an mbuf chain returned to the caller via 383.Fa mp0 , 384avoiding a data copy. 385The 386.Fa uio 387must always be 388.Pf non- Dv NULL . 389If 390.Fa mp0 391is 392.Pf non- Dv NULL , 393only the 394.Fa uio_resid 395of 396.Fa uio 397is used. 398The caller may optionally retrieve a socket address on a protocol with the 399.Dv PR_ADDR 400capability by providing storage via a 401.Pf non- Dv NULL 402.Fa psa 403argument. 404The caller may optionally retrieve control data mbufs via a 405.Pf non- Dv NULL 406.Fa controlp 407argument. 408Optional flags may be passed to 409.Fn soreceive 410via a 411.Pf non- Dv NULL 412.Fa flagsp 413argument, and use the same flag name space as the 414.Xr recvmsg 2 415system call. 416.Pp 417The 418.Fn sosend 419function is equivalent to the 420.Xr sendmsg 2 421system call, and attempts to send bytes of data via the socket 422.Fa so , 423optionally blocking if data cannot be immediately sent. 424Data may be sent directly from kernel or user memory via the 425.Fa uio 426argument, or as an mbuf chain via 427.Fa top , 428avoiding a data copy. 429Only one of the 430.Fa uio 431or 432.Fa top 433pointers may be 434.Pf non- Dv NULL . 
435An optional destination address may be specified via a 436.Pf non- Dv NULL 437.Fa addr 438argument, which may result in an implicit connect if supported by the 439protocol. 440The caller may optionally send control data mbufs via a 441.Pf non- Dv NULL 442.Fa control 443argument. 444Flags may be passed to 445.Fn sosend 446using the 447.Fa flags 448argument, and use the same flag name space as the 449.Xr sendmsg 2 450system call. 451.Pp 452Kernel callers running in 453.Xr ithread 9 454context, or with a mutex held, will wish to use non-blocking sockets and pass 455the 456.Dv MSG_DONTWAIT 457flag in order to prevent these functions from sleeping. 458.Pp 459A socket can be queried for readability, writability, out-of-band data, 460or end-of-file using 461.Fn sopoll . 462The possible values for 463.Va events 464are as for 465.Xr poll 2 , 466with symbolic values 467.Dv POLLIN , 468.Dv POLLPRI , 469.Dv POLLOUT , 470.Dv POLLRDNORM , 471.Dv POLLWRNORM , 472.Dv POLLRDBAND , 473and 474.Dv POLLINIGNEOF 475taken from 476.In sys/poll.h . 477.Pp 478Calls to 479.Fn soaccept 480pass through to the protocol's accept routine to accept an incoming connection. 481.Ss Socket Utility Functions 482The uid of a socket's credential may be compared against a 483.Va uid 484with 485.Fn socheckuid . 486.Pp 487A copy of an existing 488.Vt struct sockaddr 489may be made using 490.Fn sodupsockaddr . 491.Pp 492Protocol implementations notify the socket layer of the arrival of 493out-of-band data using 494.Fn sohasoutofband , 495so that the socket layer can notify socket consumers of the available data. 496.Pp 497An 498.Dq external-format 499version of a 500.Vt struct socket 501can be created using 502.Fn sotoxsocket , 503suitable for isolating user code from changes in the kernel structure. 
504.Ss Protocol Implementations 505Protocols must supply an implementation for 506.Fn solisten ; 507such protocol implementations can call back into the socket layer using 508.Fn solisten_proto_check 509and 510.Fn solisten_proto 511to check and set the socket-layer listen state. 512These callbacks are provided so that the protocol implementation 513can order the socket layer and protocol locks as necessary. 514Protocols must supply an implementation of 515.Fn soreceive ; 516the functions 517.Fn soreceive_stream , 518.Fn soreceive_dgram , 519and 520.Fn soreceive_generic 521are supplied for use by such implementations. 522.Pp 523Protocol implementations can use 524.Fn sonewconn 525to create a socket and attach protocol state to that socket. 526This can be used to create new sockets available for 527.Fn soaccept 528on a listen socket. 529The returned socket has a reference count of zero. 530.Pp 531Protocols must supply an implementation for 532.Fn sopoll ; 533.Fn sopoll_generic 534is provided for use by protocol implementations. 535.Pp 536The functions 537.Fn sosend_dgram 538and 539.Fn sosend_generic 540are supplied to assist in protocol implementations of 541.Fn sosend . 542.Pp 543When a protocol creates a new socket structure, it is necessary to 544reserve socket buffer space for that socket, by calling 545.Fn soreserve . 546The rough inverse of this reservation is performed by 547.Fn sorflush , 548which is called automatically by the socket framework. 549.Pp 550When a protocol needs to wake up threads waiting for the socket to 551become ready to read or write, variants of 552.Fn sowakeup 553are used. 554The 555.Fn sowakeup 556function should not be called directly by protocol code, instead use the 557wrappers 558.Fn sorwakeup , 559.Fn sorwakeup_locked , 560.Fn sowwakeup , 561and 562.Fn sowwakeup_locked 563for readers and writers, with the corresponding socket buffer lock 564not already locked, or already held, respectively. 
565.Pp 566The functions 567.Fn sooptcopyin 568and 569.Fn sooptcopyout 570are useful for transferring 571.Vt struct sockopt 572data between user and kernel code. 573.Sh SEE ALSO 574.Xr bind 2 , 575.Xr close 2 , 576.Xr connect 2 , 577.Xr getsockopt 2 , 578.Xr recv 2 , 579.Xr send 2 , 580.Xr setsockopt 2 , 581.Xr shutdown 2 , 582.Xr socket 2 , 583.Xr ng_ksocket 4 , 584.Xr ithread 9 , 585.Xr msleep 9 , 586.Xr ucred 9 587.Sh HISTORY 588The 589.Xr socket 2 590system call appeared in 591.Bx 4.2 . 592This manual page was introduced in 593.Fx 7.0 . 594.Sh AUTHORS 595This manual page was written by 596.An Robert Watson 597and 598.An Benjamin Kaduk . 599.Sh BUGS 600The use of explicitly passed credentials, credentials hung from explicitly 601passed threads, the credential on 602.Dv curthread , 603and the cached credential from 604socket creation time is inconsistent, and may lead to unexpected behaviour. 605It is possible that several of the 606.Fa td 607arguments should be 608.Fa cred 609arguments, or simply not be present at all. 610.Pp 611The caller may need to manually clear 612.Dv SS_ISCONNECTING 613if 614.Fn soconnect 615returns an error. 616.Pp 617The 618.Dv MSG_DONTWAIT 619flag is not implemented for 620.Fn sosend , 621and may not always work with 622.Fn soreceive 623when zero copy sockets are enabled. 624.Pp 625This manual page does not describe how to register socket upcalls or monitor 626a socket for readability/writability without using blocking I/O. 627.Pp 628The 629.Fn soref 630and 631.Fn sorele 632functions are not described, and in most cases should not be used, due to 633confusing and potentially incorrect interactions when 634.Fn sorele 635is last called after 636.Fn soclose . 637