xref: /freebsd/share/man/man9/mbuf.9 (revision 52ec752989b2e6d4e9a59a8ff25d8ff596d85e62)
1.\" Copyright (c) 2000 FreeBSD Inc.
2.\" All rights reserved.
3.\"
4.\" Redistribution and use in source and binary forms, with or without
5.\" modification, are permitted provided that the following conditions
6.\" are met:
7.\" 1. Redistributions of source code must retain the above copyright
8.\"    notice, this list of conditions and the following disclaimer.
9.\" 2. Redistributions in binary form must reproduce the above copyright
10.\"    notice, this list of conditions and the following disclaimer in the
11.\"    documentation and/or other materials provided with the distribution.
12.\"
13.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16.\" ARE DISCLAIMED.  IN NO EVENT SHALL [your name] OR CONTRIBUTORS BE LIABLE
17.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23.\" SUCH DAMAGE.
24.\"
25.\" $FreeBSD$
26.\"
27.Dd October 17, 2000
28.Dt MBUF 9
29.Os
30.\"
31.Sh NAME
32.Nm mbuf
33.Nd "memory management in the kernel IPC subsystem"
34.\"
35.Sh SYNOPSIS
36.In sys/param.h
37.In sys/systm.h
38.In sys/mbuf.h
39.\"
40.Ss Mbuf allocation macros
41.Fn MGET "struct mbuf *mbuf" "int how" "short type"
42.Fn MGETHDR "struct mbuf *mbuf" "int how" "short type"
43.Fn MCLGET "struct mbuf *mbuf" "int how"
44.Fo MEXTADD
45.Fa "struct mbuf *mbuf"
46.Fa "caddr_t buf"
47.Fa "u_int size"
48.Fa "void (*free)(void *opt_args)"
49.Fa "void *opt_args"
50.Fa "short flags"
51.Fa "int type"
52.Fc
53.Fn MEXTFREE "struct mbuf *mbuf"
54.Fn MEXT_ADD_REF "struct mbuf *mbuf"
55.Fn MEXT_REM_REF "struct mbuf *mbuf"
56.Fn MFREE "struct mbuf *mbuf" "struct mbuf *successor"
57.\"
58.Ss Mbuf utility macros
59.Ft void *
60.Fn mtod "struct mbuf *mbuf" "type"
61.Ft int
62.Fn MEXT_IS_REF "struct mbuf *mbuf"
63.Fn M_ALIGN "struct mbuf *mbuf" "u_int len"
64.Fn MH_ALIGN "struct mbuf *mbuf" "u_int len"
65.Ft int
66.Fn M_LEADINGSPACE "struct mbuf *mbuf"
67.Ft int
68.Fn M_TRAILINGSPACE "struct mbuf *mbuf"
69.Fn M_MOVE_PKTHDR "struct mbuf *to" "struct mbuf *from"
70.Fn M_PREPEND "struct mbuf *mbuf" "int len" "int how"
71.Fn MCHTYPE "struct mbuf *mbuf" "u_int type"
72.Ft int
73.Fn M_WRITABLE "struct mbuf *mbuf"
74.\"
75.Ss Mbuf allocation functions
76.Ft struct mbuf *
77.Fn m_get "int how" "int type"
78.Ft struct mbuf *
79.Fn m_getm "struct mbuf *orig" "int len" "int how" "int type"
80.Ft struct mbuf *
81.Fn m_getclr "int how" "int type"
82.Ft struct mbuf *
83.Fn m_gethdr "int how" "int type"
84.Ft struct mbuf *
85.Fn m_free "struct mbuf *mbuf"
86.Ft void
87.Fn m_freem "struct mbuf *mbuf"
88.\"
89.Ss Mbuf utility functions
90.Ft void
91.Fn m_adj "struct mbuf *mbuf" "int len"
92.Ft struct mbuf *
93.Fn m_prepend "struct mbuf *mbuf" "int len" "int how"
94.Ft struct mbuf *
95.Fn m_pullup "struct mbuf *mbuf" "int len"
96.Ft struct mbuf *
97.Fn m_copym "struct mbuf *mbuf" "int offset" "int len" "int how"
98.Ft struct mbuf *
99.Fn m_copypacket "struct mbuf *mbuf" "int how"
100.Ft struct mbuf *
101.Fn m_dup "struct mbuf *mbuf" "int how"
102.Ft void
103.Fn m_copydata "const struct mbuf *mbuf" "int offset" "int len" "caddr_t buf"
104.Ft void
105.Fn m_copyback "struct mbuf *mbuf" "int offset" "int len" "caddr_t buf"
106.Ft struct mbuf *
107.Fo m_devget
108.Fa "char *buf"
109.Fa "int len"
110.Fa "int offset"
111.Fa "struct ifnet *ifp"
112.Fa "void (*copy)(char *from, caddr_t to, u_int len)"
113.Fc
114.Ft void
115.Fn m_cat "struct mbuf *m" "struct mbuf *n"
116.Ft u_int
117.Fn m_fixhdr "struct mbuf *mbuf"
118.Ft int
119.Fn m_dup_pkthdr "struct mbuf *to" "struct mbuf *from" "int how"
120.Ft void
121.Fn m_move_pkthdr "struct mbuf *to" "struct mbuf *from"
122.Ft u_int
123.Fn m_length "struct mbuf *mbuf" "struct mbuf **last"
124.Ft struct mbuf *
125.Fn m_split "struct mbuf *mbuf" "int len" "int how"
126.\"
127.Sh DESCRIPTION
128An
129.Vt mbuf
130is a basic unit of memory management in the kernel IPC subsystem.
131Network packets and socket buffers are stored in
132.Vt mbufs .
133A network packet may span multiple
134.Vt mbufs
135arranged into a
136.Vt mbuf chain
137(linked list),
138which allows adding or trimming
139network headers with little overhead.
140.Pp
141While a developer should not bother with
142.Vt mbuf
143internals without serious
144reason in order to avoid incompatibilities with future changes, it
145is useful to understand the general structure of an
146.Vt mbuf .
147.Pp
148An
149.Vt mbuf
150consists of a variable-sized header and a small internal
151buffer for data.
152The total size of an
153.Vt mbuf ,
154.Dv MSIZE ,
155is a machine-dependent constant defined in
156.In machine/param.h .
157The
158.Vt mbuf
159header includes:
160.Pp
161.Bl -tag -width "m_nextpkt" -offset indent
162.It Va m_next
163.Pq Vt struct mbuf *
164A pointer to the next
165.Vt mbuf
166in the
167.Vt mbuf chain .
168.It Va m_nextpkt
169.Pq Vt struct mbuf *
170A pointer to the next
171.Vt mbuf chain
172in the queue.
173.It Va m_data
174.Pq Vt caddr_t
175A pointer to data attached to this
176.Vt mbuf .
177.It Va m_len
178.Pq Vt int
179The length of the data.
180.It Va m_type
181.Pq Vt short
182The type of the data.
183.It Va m_flags
184.Pq Vt int
185The
186.Vt mbuf
187flags.
188.El
189.Pp
190The
191.Vt mbuf
192flag bits are defined as follows:
193.Bd -literal
194/* mbuf flags */
195#define	M_EXT		0x0001	/* has associated external storage */
196#define	M_PKTHDR	0x0002	/* start of record */
197#define	M_EOR		0x0004	/* end of record */
198#define	M_RDONLY	0x0008	/* associated data marked read-only */
199#define	M_PROTO1	0x0010	/* protocol-specific */
200#define	M_PROTO2	0x0020 	/* protocol-specific */
201#define	M_PROTO3	0x0040	/* protocol-specific */
202#define	M_PROTO4	0x0080	/* protocol-specific */
203#define	M_PROTO5	0x0100	/* protocol-specific */
204#define	M_PROTO6	0x4000	/* protocol-specific (avoid M_BCAST conflict) */
205#define	M_FREELIST	0x8000	/* mbuf is on the free list */
206
207/* mbuf pkthdr flags (also stored in m_flags) */
208#define	M_BCAST		0x0200	/* send/received as link-level broadcast */
209#define	M_MCAST		0x0400	/* send/received as link-level multicast */
210#define	M_FRAG		0x0800	/* packet is fragment of larger packet */
211#define	M_FIRSTFRAG	0x1000	/* packet is first fragment */
212#define	M_LASTFRAG	0x2000	/* packet is last fragment */
213.Ed
214.Pp
215The available
216.Vt mbuf
217types are defined as follows:
218.Bd -literal
219/* mbuf types */
220#define	MT_DATA		1	/* dynamic (data) allocation */
221#define	MT_HEADER	2	/* packet header */
222#define	MT_SONAME	8	/* socket name */
223#define	MT_FTABLE	11	/* fragment reassembly header */
224#define	MT_TAG		13	/* volatile metadata associated to pkts */
225#define	MT_CONTROL	14	/* extra-data protocol message */
226#define	MT_OOBDATA	15	/* expedited data  */
227.Ed
228.Pp
229If the
230.Dv M_PKTHDR
231flag is set, a
232.Vt struct pkthdr Va m_pkthdr
233is added to the
234.Vt mbuf
235header.
236It contains a pointer to the interface
237the packet has been received from
238.Pq Vt struct ifnet Va *rcvif ,
239and the total packet length
240.Pq Vt int Va len .
241.Pp
242If small enough, data is stored in the internal data buffer of an
243.Vt mbuf .
244If the data is sufficiently large, another
245.Vt mbuf
246may be added to the
247.Vt mbuf chain ,
248or external storage may be associated with the
249.Vt mbuf .
250.Dv MHLEN
251bytes of data can fit into an
252.Vt mbuf
253with the
254.Dv M_PKTHDR
255flag set,
256.Dv MLEN
257bytes can otherwise.
258.Pp
259If external storage is being associated with an
260.Vt mbuf ,
261the
262.Va m_ext
263header is added at the cost of losing the internal data buffer.
264It includes a pointer to external storage, the size of the storage,
265a pointer to a function used for freeing the storage,
266a pointer to an optional argument that can be passed to the function,
267and a pointer to a reference counter.
268An
269.Vt mbuf
270using external storage has the
271.Dv M_EXT
272flag set.
273.Pp
274The system supplies a macro for allocating the desired external storage
275buffer,
276.Dv MEXTADD .
277.Pp
278The allocation and management of the reference counter is handled by the
279subsystem.
280The developer can check whether the reference count for the
281external storage of a given
282.Vt mbuf
283is greater than 1 with the
284.Dv MEXT_IS_REF
285macro.
286Similarly, the developer can directly add and remove references,
287if absolutely necessary, with the use of the
288.Dv MEXT_ADD_REF
289and
290.Dv MEXT_REM_REF
291macros.
292.Pp
293The system also supplies a default type of external storage buffer called an
294.Vt mbuf cluster .
295.Vt Mbuf clusters
296can be allocated and configured with the use of the
297.Dv MCLGET
298macro.
299Each
300.Vt mbuf cluster
301is
302.Dv MCLBYTES
303in size, where MCLBYTES is a machine-dependent constant.
304The system defines an advisory macro
305.Dv MINCLSIZE ,
306which is the smallest amount of data to put into an
307.Vt mbuf cluster .
308It's equal to the sum of
309.Dv MLEN
310and
311.Dv MHLEN .
312It is typically preferable to store data into the data region of an
313.Vt mbuf ,
314if size permits, as opposed to allocating a separate
315.Vt mbuf cluster
316to hold the same data.
317.\"
318.Ss Macros and Functions
319There are numerous predefined macros and functions that provide the
320developer with common utilities.
321.\"
322.Bl -ohang -offset indent
323.It Fn mtod mbuf type
324Convert an
325.Fa mbuf
326pointer to a data pointer.
327The macro expands to the data pointer cast to the pointer of the specified
328.Fa type .
329.Sy Note :
330It is advisable to ensure that there is enough contiguous data in
331.Fa mbuf .
332See
333.Fn m_pullup
334for details.
335.It Fn MGET mbuf how type
336Allocate an
337.Vt mbuf
338and initialize it to contain internal data.
339.Fa mbuf
340will point to the allocated
341.Vt mbuf
342on success, or be set to
343.Dv NULL
344on failure.
345The
346.Fa how
347argument is to be set to
348.Dv M_TRYWAIT
349or
350.Dv M_DONTWAIT .
351It specifies whether the caller is willing to block if necessary.
352If
353.Fa how
354is set to
355.Dv M_TRYWAIT ,
356a failed allocation will result in the caller being put
357to sleep for a designated
358.Va kern.ipc.mbuf_wait
359.Xr ( sysctl 8
360tunable)
361number of ticks.
362A number of other functions and macros related to
363.Vt mbufs
364have the same argument because they may
365at some point need to allocate new
366.Vt mbufs .
367.Pp
368Programmers should be careful not to confuse the
369.Vt mbuf
370allocation flag
371.Dv M_DONTWAIT
372with the
373.Xr malloc 9
374allocation flag,
375.Dv M_NOWAIT .
376They are not the same.
377.It Fn MGETHDR mbuf how type
378Allocate an
379.Vt mbuf
380and initialize it to contain a packet header
381and internal data.
382See
383.Fn MGET
384for details.
385.It Fn MCLGET mbuf how
386Allocate and attach an
387.Vt mbuf cluster
388to
389.Fa mbuf .
390If the macro fails, the
391.Dv M_EXT
392flag won't be set in
393.Fa mbuf .
394.It Fn M_ALIGN mbuf len
395Set the pointer
396.Fa mbuf->m_data
397to place an object of the size
398.Fa len
399at the end of the internal data area of
400.Fa mbuf ,
401long word aligned.
402Applicable only if
403.Fa mbuf
404is newly allocated with
405.Fn MGET
406or
407.Fn m_get .
408.It Fn MH_ALIGN mbuf len
409Serves the same purpose as
410.Fn M_ALIGN
411does, but only for
412.Fa mbuf
413newly allocated with
414.Fn MGETHDR
415or
416.Fn m_gethdr ,
417or initialized by
418.Fn m_dup_pkthdr
419or
420.Fn m_move_pkthdr .
421.It Fn M_LEADINGSPACE mbuf
422Returns the number of bytes available before the beginning
423of data in
424.Fa mbuf .
425.It Fn M_TRAILINGSPACE mbuf
426Returns the number of bytes available after the end of data in
427.Fa mbuf .
428.It Fn M_PREPEND mbuf len how
429This macro operates on an
430.Vt mbuf chain .
431It is an optimized wrapper for
432.Fn m_prepend
433that can make use of possible empty space before data
434(e.g. left after trimming of a link-layer header).
435The new
436.Vt mbuf chain
437pointer or
438.Dv NULL
439is in
440.Fa mbuf
441after the call.
442.It Fn M_MOVE_PKTHDR to from
443Using this macro is equivalent to calling
444.Fn m_move_pkthdr to from .
445.It Fn M_WRITABLE mbuf
446This macro will evaluate true if
447.Fa mbuf
448is not marked
449.Dv M_RDONLY
450and if either
451.Fa mbuf
452does not contain external storage or,
453if it does,
454the reference count of the storage is not greater than 1.
455The
456.Dv M_RDONLY
457flag can be set in
458.Fa mbuf->m_flags .
459This can be achieved during setup of the external storage,
460by passing the
461.Dv M_RDONLY
462bit as a
463.Fa flags
464argument to the
465.Fn MEXTADD
466macro, or can be directly set in individual
467.Vt mbufs .
468.It Fn MCHTYPE mbuf type
469Change the type of
470.Fa mbuf
471to
472.Fa type .
473This is a relatively expensive operation and should be avoided.
474.El
475.Pp
476The functions are:
477.Bl -ohang -offset indent
478.It Fn m_get how type
479A function version of
480.Fn MGET
481for non-critical paths.
482.It Fn m_getm orig len how type
483Allocate
484.Fa len
485bytes worth of
486.Vt mbufs
487and
488.Vt mbuf clusters
489if necessary and append the resulting allocated
490.Vt mbuf chain
491to the
492.Vt mbuf chain
493.Fa orig ,
494if it is
495.No non- Ns Dv NULL .
496If the allocation fails at any point,
497free whatever was allocated and return
498.Dv NULL .
499If
500.Fa orig
501is
502.No non- Ns Dv NULL ,
503it will not be freed.
504It is possible to use
505.Fn m_getm
506to either append
507.Fa len
508bytes to an existing
509.Vt mbuf
510or
511.Vt mbuf chain
512(for example, one which may be sitting in a pre-allocated ring)
513or to simply perform an all-or-nothing
514.Vt mbuf
515and
516.Vt mbuf cluster
517allocation.
518.It Fn m_gethdr how type
519A function version of
520.Fn MGETHDR
521for non-critical paths.
522.It Fn m_getclr how type
523Allocate an
524.Vt mbuf
525and zero out the data region.
526.El
527.Pp
528The functions below operate on
529.Vt mbuf chains .
530.Bl -ohang -offset indent
531.It Fn m_freem mbuf
532Free an entire
533.Vt mbuf chain ,
534including any external storage.
535.\"
536.It Fn m_adj mbuf len
537Trim
538.Fa len
539bytes from the head of an
540.Vt mbuf chain
541if
542.Fa len
543is positive, from the tail otherwise.
544.\"
545.It Fn m_prepend mbuf len how
546Allocate a new
547.Vt mbuf
548and prepend it to the
549.Vt mbuf chain ,
550handling
551.Dv M_PKTHDR
552properly.
553.Sy Note :
554It doesn't allocate any
555.Vt mbuf clusters ,
556so
557.Fa len
558must be less than
559.Dv MLEN
560or
561.Dv MHLEN ,
562depending on the
563.Dv M_PKTHDR
564flag setting.
565.\"
566.It Fn m_pullup mbuf len
567Arrange that the first
568.Fa len
569bytes of an
570.Vt mbuf chain
571are contiguous and lie in the data area of
572.Fa mbuf ,
573so they are accessible with
574.Fn mtod mbuf type .
575Return the new
576.Vt mbuf chain
577on success,
578.Dv NULL
579on failure
580(the
581.Vt mbuf chain
582is freed in this case).
583.Sy Note :
584It doesn't allocate any
585.Vt mbuf clusters ,
586so
587.Fa len
588must be less than
589.Dv MHLEN .
590.\"
591.It Fn m_copym mbuf offset len how
592Make a copy of an
593.Vt mbuf chain
594starting
595.Fa offset
596bytes from the beginning, continuing for
597.Fa len
598bytes.
599If
600.Fa len
601is
602.Dv M_COPYALL ,
603copy to the end of the
604.Vt mbuf chain .
605.Sy Note :
606The copy is read-only, because the
607.Vt mbuf clusters
608are not copied, only their reference counts are incremented.
609.\"
610.It Fn m_copypacket mbuf how
611Copy an entire packet including header, which must be present.
612This is an optimized version of the common case
613.Fn m_copym mbuf 0 M_COPYALL how .
614.Sy Note :
615The copy is read-only, because the
616.Vt mbuf clusters
617are not copied, only their reference counts are incremented.
618.\"
619.It Fn m_dup mbuf how
620Copy a packet header
621.Vt mbuf chain
622into a completely new
623.Vt mbuf chain ,
624including copying any
625.Vt mbuf clusters .
626Use this instead of
627.Fn m_copypacket
628when you need a writable copy of an
629.Vt mbuf chain .
630.\"
631.It Fn m_copydata mbuf offset len buf
632Copy data from an
633.Vt mbuf chain
634starting
635.Fa offset
636bytes from the beginning, continuing for
637.Fa len
638bytes, into the indicated buffer
639.Fa buf .
640.\"
641.It Fn m_copyback mbuf offset len buf
642Copy
643.Fa len
644bytes from the buffer
645.Fa buf
646back into the indicated
647.Vt mbuf chain ,
648starting at
649.Fa offset
650bytes from the beginning of the
651.Vt mbuf chain ,
652extending the
653.Vt mbuf chain
654if necessary.
655.Sy Note :
656It doesn't allocate any
657.Vt mbuf clusters ,
658just adds
659.Vt mbufs
660to the
661.Vt mbuf chain .
662It's safe to set
663.Fa offset
664beyond the current
665.Vt mbuf chain
666end: zeroed
667.Vt mbufs
668will be allocated to fill the space.
669.\"
670.It Fn m_length mbuf last
671Return the length of the
672.Vt mbuf chain ,
673and optionally a pointer to the last
674.Vt mbuf .
675.\"
676.It Fn m_dup_pkthdr to from how
677Upon the function's completion, the
678.Vt mbuf
679.Fa to
680will contain an identical copy of
681.Fa from->m_pkthdr
682and the per-packet attributes found in the
683.Vt mbuf chain
684.Fa from .
685The
686.Vt mbuf
687.Fa from
688must have the flag
689.Dv M_PKTHDR
690initially set, and
691.Fa to
692must be empty on entry.
693.\"
694.It Fn m_move_pkthdr to from
695Move
696.Va m_pkthdr
697and the per-packet attributes from the
698.Vt mbuf chain
699.Fa from
700to the
701.Vt mbuf
702.Fa to .
703The
704.Vt mbuf
705.Fa from
706must have the flag
707.Dv M_PKTHDR
708initially set, and
709.Fa to
710must be empty on entry.
711Upon the function's completion,
712.Fa from
713will have the flag
714.Dv M_PKTHDR
715and the per-packet attributes cleared.
716.\"
717.It Fn m_fixhdr mbuf
718Set the packet-header length to the length of the
719.Vt mbuf chain .
720.\"
721.It Fn m_devget buf len offset ifp copy
722Copy data from a device local memory pointed to by
723.Fa buf
724to an
725.Vt mbuf chain .
726The copy is done using a specified copy routine
727.Fa copy ,
728or
729.Fn bcopy
730if
731.Fa copy
732is
733.Dv NULL .
734.\"
735.It Fn m_cat m n
736Concatenate
737.Fa n
738to
739.Fa m .
740Both
741.Vt mbuf chains
742must be of the same type.
743.Fa N
744is still valid after the function returned.
745.Sy Note :
746It does not handle
747.Dv M_PKTHDR
748and friends.
749.\"
750.It Fn m_split mbuf len how
751Partition an
752.Vt mbuf chain
753in two pieces, returning the tail:
754all but the first
755.Fa len
756bytes.
757In case of failure, it returns
758.Dv NULL
759and attempts to restore the
760.Vt mbuf chain
761to its original state.
762.El
763.Sh STRESS TESTING
764When running a kernel compiled with the option
765.Dv MBUF_STRESS_TEST ,
766the following
767.Xr sysctl 8 Ns
768-controlled options may be used to create
769various failure/extreme cases for testing of network drivers
770and other parts of the kernel that rely on
771.Vt mbufs .
772.Bl -tag -width indent
773.It Va net.inet.ip.mbuf_frag_size
774Causes
775.Fn ip_output
776to fragment outgoing
777.Vt mbuf chains
778into fragments of the specified size.
779Setting this variable to 1 is an excellent way to
780test the long
781.Vt mbuf chain
782handling ability of network drivers.
783.It Va kern.ipc.m_defragrandomfailures
784Causes the function
785.Fn m_defrag
786to randomly fail, returning
787.Dv NULL .
788Any piece of code which uses
789.Fn m_defrag
790should be tested with this feature.
791.El
792.Sh RETURN VALUES
793See above.
794.Sh HISTORY
795.\" Please correct me if I'm wrong
796.Vt Mbufs
797appeared in an early version of
798.Bx .
799Besides being used for network packets, they were used
800to store various dynamic structures, such as routing table
801entries, interface addresses, protocol control blocks, etc.
802.Sh AUTHORS
803The original
804.Nm
805man page was written by Yar Tikhiy.
806