xref: /freebsd/share/man/man9/mbuf.9 (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
1.\" Copyright (c) 2000 FreeBSD Inc.
2.\" All rights reserved.
3.\"
4.\" Redistribution and use in source and binary forms, with or without
5.\" modification, are permitted provided that the following conditions
6.\" are met:
7.\" 1. Redistributions of source code must retain the above copyright
8.\"    notice, this list of conditions and the following disclaimer.
9.\" 2. Redistributions in binary form must reproduce the above copyright
10.\"    notice, this list of conditions and the following disclaimer in the
11.\"    documentation and/or other materials provided with the distribution.
12.\"
13.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16.\" ARE DISCLAIMED.  IN NO EVENT SHALL [your name] OR CONTRIBUTORS BE LIABLE
17.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23.\" SUCH DAMAGE.
24.\"
25.Dd December 28, 2023
26.Dt MBUF 9
27.Os
28.\"
29.Sh NAME
30.Nm mbuf
31.Nd "memory management in the kernel IPC subsystem"
32.\"
33.Sh SYNOPSIS
34.In sys/param.h
35.In sys/systm.h
36.In sys/mbuf.h
37.\"
38.Ss Mbuf allocation macros
39.Fn MGET "struct mbuf *mbuf" "int how" "short type"
40.Fn MGETHDR "struct mbuf *mbuf" "int how" "short type"
41.Ft int
42.Fn MCLGET "struct mbuf *mbuf" "int how"
43.Fo MEXTADD
44.Fa "struct mbuf *mbuf"
45.Fa "char *buf"
46.Fa "u_int size"
47.Fa "void (*free)(struct mbuf *)"
48.Fa "void *opt_arg1"
49.Fa "void *opt_arg2"
50.Fa "int flags"
51.Fa "int type"
52.Fc
53.\"
54.Ss Mbuf utility macros
55.Ft type
56.Fn mtod "struct mbuf *mbuf" "type"
57.Ft void *
58.Fn mtodo "struct mbuf *mbuf" "offset"
59.Fn M_ALIGN "struct mbuf *mbuf" "u_int len"
60.Fn MH_ALIGN "struct mbuf *mbuf" "u_int len"
61.Ft int
62.Fn M_LEADINGSPACE "struct mbuf *mbuf"
63.Ft int
64.Fn M_TRAILINGSPACE "struct mbuf *mbuf"
65.Fn M_MOVE_PKTHDR "struct mbuf *to" "struct mbuf *from"
66.Fn M_PREPEND "struct mbuf *mbuf" "int len" "int how"
67.Fn MCHTYPE "struct mbuf *mbuf" "short type"
68.Ft int
69.Fn M_WRITABLE "struct mbuf *mbuf"
70.\"
71.Ss Mbuf allocation functions
72.Ft struct mbuf *
73.Fn m_get "int how" "short type"
74.Ft struct mbuf *
75.Fn m_get2 "int size" "int how" "short type" "int flags"
76.Ft struct mbuf *
77.Fn m_get3 "int size" "int how" "short type" "int flags"
78.Ft struct mbuf *
79.Fn m_getm "struct mbuf *orig" "int len" "int how" "short type"
80.Ft struct mbuf *
81.Fn m_getjcl "int how" "short type" "int flags" "int size"
82.Ft struct mbuf *
83.Fn m_getcl "int how" "short type" "int flags"
84.Ft struct mbuf *
85.Fn m_gethdr "int how" "short type"
86.Ft struct mbuf *
87.Fn m_free "struct mbuf *mbuf"
88.Ft void
89.Fn m_freem "struct mbuf *mbuf"
90.\"
91.Ss Mbuf utility functions
92.Ft void
93.Fn m_adj "struct mbuf *mbuf" "int len"
94.Ft void
95.Fn m_align "struct mbuf *mbuf" "int len"
96.Ft int
97.Fn m_append "struct mbuf *mbuf" "int len" "c_caddr_t cp"
98.Ft struct mbuf *
99.Fn m_prepend "struct mbuf *mbuf" "int len" "int how"
100.Ft struct mbuf *
101.Fn m_copyup "struct mbuf *mbuf" "int len" "int dstoff"
102.Ft struct mbuf *
103.Fn m_pullup "struct mbuf *mbuf" "int len"
104.Ft struct mbuf *
105.Fn m_pulldown "struct mbuf *mbuf" "int offset" "int len" "int *offsetp"
106.Ft struct mbuf *
107.Fn m_copym "struct mbuf *mbuf" "int offset" "int len" "int how"
108.Ft struct mbuf *
109.Fn m_copypacket "struct mbuf *mbuf" "int how"
110.Ft struct mbuf *
111.Fn m_dup "const struct mbuf *mbuf" "int how"
112.Ft void
113.Fn m_copydata "const struct mbuf *mbuf" "int offset" "int len" "caddr_t buf"
114.Ft void
115.Fn m_copyback "struct mbuf *mbuf" "int offset" "int len" "caddr_t buf"
116.Ft struct mbuf *
117.Fo m_devget
118.Fa "char *buf"
119.Fa "int len"
120.Fa "int offset"
121.Fa "struct ifnet *ifp"
122.Fa "void (*copy)(char *from, caddr_t to, u_int len)"
123.Fc
124.Ft void
125.Fn m_cat "struct mbuf *m" "struct mbuf *n"
126.Ft void
127.Fn m_catpkt "struct mbuf *m" "struct mbuf *n"
128.Ft u_int
129.Fn m_fixhdr "struct mbuf *mbuf"
130.Ft int
131.Fn m_dup_pkthdr "struct mbuf *to" "const struct mbuf *from" "int how"
132.Ft void
133.Fn m_move_pkthdr "struct mbuf *to" "struct mbuf *from"
134.Ft u_int
135.Fn m_length "struct mbuf *mbuf" "struct mbuf **last"
136.Ft struct mbuf *
137.Fn m_split "struct mbuf *mbuf" "int len" "int how"
138.Ft int
139.Fn m_apply "struct mbuf *mbuf" "int off" "int len" "int (*f)(void *arg, void *data, u_int len)" "void *arg"
140.Ft struct mbuf *
141.Fn m_getptr "struct mbuf *mbuf" "int loc" "int *off"
142.Ft struct mbuf *
143.Fn m_defrag "struct mbuf *m0" "int how"
144.Ft struct mbuf *
145.Fn m_collapse "struct mbuf *m0" "int how" "int maxfrags"
146.Ft struct mbuf *
147.Fn m_unshare "struct mbuf *m0" "int how"
148.\"
149.Sh DESCRIPTION
150An
151.Vt mbuf
152is a basic unit of memory management in the kernel IPC subsystem.
153Network packets and socket buffers are stored in
154.Vt mbufs .
155A network packet may span multiple
156.Vt mbufs
157arranged into a
158.Vt mbuf chain
159(linked list),
160which allows adding or trimming
161network headers with little overhead.
162.Pp
163While a developer should not bother with
164.Vt mbuf
165internals without serious
166reason in order to avoid incompatibilities with future changes, it
167is useful to understand the general structure of an
168.Vt mbuf .
169.Pp
170An
171.Vt mbuf
172consists of a variable-sized header and a small internal
173buffer for data.
174The total size of an
175.Vt mbuf ,
176.Dv MSIZE ,
177is a constant defined in
178.In sys/param.h .
179The
180.Vt mbuf
181header includes:
182.Bl -tag -width "m_nextpkt" -offset indent
183.It Va m_next
184.Pq Vt struct mbuf *
185A pointer to the next
186.Vt mbuf
187in the
188.Vt mbuf chain .
189.It Va m_nextpkt
190.Pq Vt struct mbuf *
191A pointer to the next
192.Vt mbuf chain
193in the queue.
194.It Va m_data
195.Pq Vt caddr_t
196A pointer to data attached to this
197.Vt mbuf .
198.It Va m_len
199.Pq Vt int
200The length of the data.
201.It Va m_type
202.Pq Vt short
203The type of the data.
204.It Va m_flags
205.Pq Vt int
206The
207.Vt mbuf
208flags.
209.El
210.Pp
211The
212.Vt mbuf
213flag bits are defined as follows:
214.Bd -literal
215#define	M_EXT		0x00000001 /* has associated external storage */
216#define	M_PKTHDR	0x00000002 /* start of record */
217#define	M_EOR		0x00000004 /* end of record */
218#define	M_RDONLY	0x00000008 /* associated data marked read-only */
219#define	M_BCAST		0x00000010 /* send/received as link-level broadcast */
220#define	M_MCAST		0x00000020 /* send/received as link-level multicast */
221#define	M_PROMISC	0x00000040 /* packet was not for us */
222#define	M_VLANTAG	0x00000080 /* ether_vtag is valid */
223#define	M_EXTPG		0x00000100 /* has array of unmapped pages and TLS */
224#define	M_NOFREE	0x00000200 /* do not free mbuf, embedded in cluster */
225#define	M_TSTMP		0x00000400 /* rcv_tstmp field is valid */
226#define	M_TSTMP_HPREC	0x00000800 /* rcv_tstmp is high-prec, typically
227				      hw-stamped on port (useful for IEEE 1588
228				      and 802.1AS) */
229
230#define	M_PROTO1	0x00001000 /* protocol-specific */
231#define	M_PROTO2	0x00002000 /* protocol-specific */
232#define	M_PROTO3	0x00004000 /* protocol-specific */
233#define	M_PROTO4	0x00008000 /* protocol-specific */
234#define	M_PROTO5	0x00010000 /* protocol-specific */
235#define	M_PROTO6	0x00020000 /* protocol-specific */
236#define	M_PROTO7	0x00040000 /* protocol-specific */
237#define	M_PROTO8	0x00080000 /* protocol-specific */
238#define	M_PROTO9	0x00100000 /* protocol-specific */
239#define	M_PROTO10	0x00200000 /* protocol-specific */
240#define	M_PROTO11	0x00400000 /* protocol-specific */
241#define	M_PROTO12	0x00800000 /* protocol-specific */
242.Ed
243.Pp
244The available
245.Vt mbuf
246types are defined as follows:
247.Bd -literal
248#define	MT_DATA		1	/* dynamic (data) allocation */
249#define	MT_HEADER	MT_DATA	/* packet header */
250
251#define	MT_VENDOR1	4	/* for vendor-internal use */
252#define	MT_VENDOR2	5	/* for vendor-internal use */
253#define	MT_VENDOR3	6	/* for vendor-internal use */
254#define	MT_VENDOR4	7	/* for vendor-internal use */
255
256#define	MT_SONAME	8	/* socket name */
257
258#define	MT_EXP1		9	/* for experimental use */
259#define	MT_EXP2		10	/* for experimental use */
260#define	MT_EXP3		11	/* for experimental use */
261#define	MT_EXP4		12	/* for experimental use */
262
263#define	MT_CONTROL	14	/* extra-data protocol message */
264#define	MT_EXTCONTROL	15	/* control message with externalized contents */
265#define	MT_OOBDATA	16	/* expedited data  */
266.Ed
267.Pp
268The available external buffer types are defined as follows:
269.Bd -literal
270#define	EXT_CLUSTER	1	/* mbuf cluster */
271#define	EXT_SFBUF	2	/* sendfile(2)'s sf_bufs */
272#define	EXT_JUMBOP	3	/* jumbo cluster 4096 bytes */
273#define	EXT_JUMBO9	4	/* jumbo cluster 9216 bytes */
274#define	EXT_JUMBO16	5	/* jumbo cluster 16384 bytes */
275#define	EXT_PACKET	6	/* mbuf+cluster from packet zone */
276#define	EXT_MBUF	7	/* external mbuf reference */
277#define	EXT_RXRING	8	/* data in NIC receive ring */
278#define	EXT_PGS		9	/* array of unmapped pages */
279
280#define	EXT_VENDOR1	224	/* for vendor-internal use */
281#define	EXT_VENDOR2	225	/* for vendor-internal use */
282#define	EXT_VENDOR3	226	/* for vendor-internal use */
283#define	EXT_VENDOR4	227	/* for vendor-internal use */
284
285#define	EXT_EXP1	244	/* for experimental use */
286#define	EXT_EXP2	245	/* for experimental use */
287#define	EXT_EXP3	246	/* for experimental use */
288#define	EXT_EXP4	247	/* for experimental use */
289
290#define	EXT_NET_DRV	252	/* custom ext_buf provided by net driver(s) */
291#define	EXT_MOD_TYPE	253	/* custom module's ext_buf type */
292#define	EXT_DISPOSABLE	254	/* can throw this buffer away w/page flipping */
293#define	EXT_EXTREF	255	/* has externally maintained ref_cnt ptr */
294.Ed
295.Pp
296If the
297.Dv M_PKTHDR
298flag is set, a
299.Vt struct pkthdr Va m_pkthdr
300is added to the
301.Vt mbuf
302header.
303It contains a pointer to the interface
304the packet has been received from
305.Pq Vt struct ifnet Va *rcvif ,
306and the total packet length
307.Pq Vt int Va len .
308Optionally, it may also contain an attached list of packet tags
309.Pq Vt "struct m_tag" .
310See
311.Xr mbuf_tags 9
312for details.
313Fields used in offloading checksum calculation to the hardware are kept in
314.Va m_pkthdr
315as well.
316See
317.Sx HARDWARE-ASSISTED CHECKSUM CALCULATION
318for details.
319.Pp
320If small enough, data is stored in the internal data buffer of an
321.Vt mbuf .
322If the data is sufficiently large, another
323.Vt mbuf
324may be added to the
325.Vt mbuf chain ,
326or external storage may be associated with the
327.Vt mbuf .
328.Dv MHLEN
329bytes of data can fit into an
330.Vt mbuf
331with the
332.Dv M_PKTHDR
333flag set,
334.Dv MLEN
335bytes can otherwise.
336.Pp
337If external storage is being associated with an
338.Vt mbuf ,
339the
340.Va m_ext
341header is added at the cost of losing the internal data buffer.
342It includes a pointer to external storage, the size of the storage,
343a pointer to a function used for freeing the storage,
344a pointer to an optional argument that can be passed to the function,
345and a pointer to a reference counter.
346An
347.Vt mbuf
348using external storage has the
349.Dv M_EXT
350flag set.
351.Pp
352The system supplies a macro for allocating the desired external storage
353buffer,
354.Dv MEXTADD .
355.Pp
356The allocation and management of the reference counter is handled by the
357subsystem.
358.Pp
359The system also supplies a default type of external storage buffer called an
360.Vt mbuf cluster .
361.Vt Mbuf clusters
362can be allocated and configured with the use of the
363.Dv MCLGET
364macro.
365Each
366.Vt mbuf cluster
367is
368.Dv MCLBYTES
369in size, where MCLBYTES is a machine-dependent constant.
370The system defines an advisory macro
371.Dv MINCLSIZE ,
372which is the smallest amount of data to put into an
373.Vt mbuf cluster .
374It is equal to
375.Dv MHLEN
376plus one.
377It is typically preferable to store data into the data region of an
378.Vt mbuf ,
379if size permits, as opposed to allocating a separate
380.Vt mbuf cluster
381to hold the same data.
382.\"
383.Ss Macros and Functions
384There are numerous predefined macros and functions that provide the
385developer with common utilities.
386.\"
387.Bl -ohang -offset indent
388.It Fn mtod mbuf type
389Convert an
390.Fa mbuf
391pointer to a data pointer.
392The macro expands to the data pointer cast to the specified
393.Fa type .
394.Sy Note :
395It is advisable to ensure that there is enough contiguous data in
396.Fa mbuf .
397See
398.Fn m_pullup
399for details.
400.It Fn mtodo mbuf offset
401Return a data pointer at an offset (in bytes) into the data attached to
402.Fa mbuf .
403Returns a
404.Ft void *
405pointer.
406.Sy Note :
407The caller must ensure that the offset is in bounds of the attached data.
408.It Fn MGET mbuf how type
409Allocate an
410.Vt mbuf
411and initialize it to contain internal data.
412.Fa mbuf
413will point to the allocated
414.Vt mbuf
415on success, or be set to
416.Dv NULL
417on failure.
418The
419.Fa how
420argument is to be set to
421.Dv M_WAITOK
422or
423.Dv M_NOWAIT .
424It specifies whether the caller is willing to block if necessary.
425A number of other functions and macros related to
426.Vt mbufs
427have the same argument because they may
428at some point need to allocate new
429.Vt mbufs .
430.It Fn MGETHDR mbuf how type
431Allocate an
432.Vt mbuf
433and initialize it to contain a packet header
434and internal data.
435See
436.Fn MGET
437for details.
438.It Fn MEXTADD mbuf buf size free opt_arg1 opt_arg2 flags type
439Associate externally managed data with
440.Fa mbuf .
441Any internal data contained in the mbuf will be discarded, and the
442.Dv M_EXT
443flag will be set.
444The
445.Fa buf
446and
447.Fa size
448arguments are the address and length, respectively, of the data.
449The
450.Fa free
451argument points to a function which will be called to free the data
452when the mbuf is freed; it is only used if
453.Fa type
454is
455.Dv EXT_EXTREF .
456The
457.Fa opt_arg1
458and
459.Fa opt_arg2
460arguments will be saved in
461.Va ext_arg1
462and
463.Va ext_arg2
464fields of the
465.Va struct m_ext
466of the mbuf.
467The
468.Fa flags
469argument specifies additional
470.Vt mbuf
471flags; it is not necessary to specify
472.Dv M_EXT .
473Finally, the
474.Fa type
475argument specifies the type of external data, which controls how it
476will be disposed of when the
477.Vt mbuf
478is freed.
479In most cases, the correct value is
480.Dv EXT_EXTREF .
481.It Fn MCLGET mbuf how
482Allocate and attach an
483.Vt mbuf cluster
484to
485.Fa mbuf .
486On success, a non-zero value is returned; otherwise, 0.
487Historically, consumers would check for success by testing the
488.Dv M_EXT
489flag on the mbuf, but this is now discouraged to avoid unnecessary awareness
490of the implementation of external storage in protocol stacks and device
491drivers.
492.It Fn M_ALIGN mbuf len
493Set the pointer
494.Fa mbuf->m_data
495to place an object of the size
496.Fa len
497at the end of the internal data area of
498.Fa mbuf ,
499long word aligned.
500Applicable only if
501.Fa mbuf
502is newly allocated with
503.Fn MGET
504or
505.Fn m_get .
506.It Fn MH_ALIGN mbuf len
507Serves the same purpose as
508.Fn M_ALIGN
509does, but only for
510.Fa mbuf
511newly allocated with
512.Fn MGETHDR
513or
514.Fn m_gethdr ,
515or initialized by
516.Fn m_dup_pkthdr
517or
518.Fn m_move_pkthdr .
519.It Fn m_align mbuf len
520Serves the same purpose as
521.Fn M_ALIGN
522but handles any type of mbuf.
523.It Fn M_LEADINGSPACE mbuf
524Returns the number of bytes available before the beginning
525of data in
526.Fa mbuf .
527.It Fn M_TRAILINGSPACE mbuf
528Returns the number of bytes available after the end of data in
529.Fa mbuf .
530.It Fn M_PREPEND mbuf len how
531This macro operates on an
532.Vt mbuf chain .
533It is an optimized wrapper for
534.Fn m_prepend
535that can make use of possible empty space before data
536(e.g.\& left after trimming of a link-layer header).
537The new
538.Vt mbuf chain
539pointer or
540.Dv NULL
541is in
542.Fa mbuf
543after the call.
544.It Fn M_MOVE_PKTHDR to from
545Using this macro is equivalent to calling
546.Fn m_move_pkthdr to from .
547.It Fn M_WRITABLE mbuf
548This macro will evaluate true if
549.Fa mbuf
550is not marked
551.Dv M_RDONLY
552and if either
553.Fa mbuf
554does not contain external storage or,
555if it does,
556then if the reference count of the storage is not greater than 1.
557The
558.Dv M_RDONLY
559flag can be set in
560.Fa mbuf->m_flags .
561This can be achieved during setup of the external storage,
562by passing the
563.Dv M_RDONLY
564bit as a
565.Fa flags
566argument to the
567.Fn MEXTADD
568macro, or can be directly set in individual
569.Vt mbufs .
570.It Fn MCHTYPE mbuf type
571Change the type of
572.Fa mbuf
573to
574.Fa type .
575This is a relatively expensive operation and should be avoided.
576.El
577.Pp
578The functions are:
579.Bl -ohang -offset indent
580.It Fn m_get how type
581A function version of
582.Fn MGET
583for non-critical paths.
584.It Fn m_get2 size how type flags
585Allocate an
586.Vt mbuf
587with enough space to hold specified amount of data.
588If the size is larger than
589.Dv MJUMPAGESIZE , NULL
590will be returned.
591.It Fn m_get3 size how type flags
592Allocate an
593.Vt mbuf
594with enough space to hold specified amount of data.
595If the size is larger than
596.Dv MJUM16BYTES , NULL
597will be returned.
598.It Fn m_getm orig len how type
599Allocate
600.Fa len
601bytes worth of
602.Vt mbufs
603and
604.Vt mbuf clusters
605if necessary and append the resulting allocated
606.Vt mbuf chain
607to the
608.Vt mbuf chain
609.Fa orig ,
610if it is
611.No non- Ns Dv NULL .
612If the allocation fails at any point,
613free whatever was allocated and return
614.Dv NULL .
615If
616.Fa orig
617is
618.No non- Ns Dv NULL ,
619it will not be freed.
620It is possible to use
621.Fn m_getm
622to either append
623.Fa len
624bytes to an existing
625.Vt mbuf
626or
627.Vt mbuf chain
628(for example, one which may be sitting in a pre-allocated ring)
629or to simply perform an all-or-nothing
630.Vt mbuf
631and
632.Vt mbuf cluster
633allocation.
634.It Fn m_gethdr how type
635A function version of
636.Fn MGETHDR
637for non-critical paths.
638.It Fn m_getcl how type flags
639Fetch an
640.Vt mbuf
641with a
642.Vt mbuf cluster
643attached to it.
644If one of the allocations fails, the entire allocation fails.
645This routine is the preferred way of fetching both the
646.Vt mbuf
647and
648.Vt mbuf cluster
649together, as it avoids having to unlock/relock between allocations.
650Returns
651.Dv NULL
652on failure.
653.It Fn m_getjcl how type flags size
654This is like
655.Fn m_getcl
656but the specified
657.Fa size
658of the cluster to be allocated must be one of
659.Dv MCLBYTES , MJUMPAGESIZE , MJUM9BYTES ,
660or
661.Dv MJUM16BYTES .
662.It Fn m_free mbuf
663Frees
664.Vt mbuf .
665Returns
666.Va m_next
667of the freed
668.Vt mbuf .
669.El
670.Pp
671The functions below operate on
672.Vt mbuf chains .
673.Bl -ohang -offset indent
674.It Fn m_freem mbuf
675Free an entire
676.Vt mbuf chain ,
677including any external storage.
678.\"
679.It Fn m_adj mbuf len
680Trim
681.Fa len
682bytes from the head of an
683.Vt mbuf chain
684if
685.Fa len
686is positive, from the tail otherwise.
687.\"
688.It Fn m_append mbuf len cp
689Append
690.Fa len
691bytes of data
692.Fa cp
693to the
694.Vt mbuf chain .
695Extend the mbuf chain if the new data does not fit in
696existing space.
697.\"
698.It Fn m_prepend mbuf len how
699Allocate a new
700.Vt mbuf
701and prepend it to the
702.Vt mbuf chain ,
703handling
704.Dv M_PKTHDR
705properly.
706.Sy Note :
707It does not allocate any
708.Vt mbuf clusters ,
709so
710.Fa len
711must be less than
712.Dv MLEN
713or
714.Dv MHLEN ,
715depending on the
716.Dv M_PKTHDR
717flag setting.
718.\"
719.It Fn m_copyup mbuf len dstoff
720Similar to
721.Fn m_pullup
722but copies
723.Fa len
724bytes of data into a new mbuf at
725.Fa dstoff
726bytes into the mbuf.
727The
728.Fa dstoff
729argument aligns the data and leaves room for a link layer header.
730Returns the new
731.Vt mbuf chain
732on success,
733and frees the
734.Vt mbuf chain
735and returns
736.Dv NULL
737on failure.
738.Sy Note :
739The function does not allocate
740.Vt mbuf clusters ,
741so
742.Fa len + dstoff
743must be less than
744.Dv MHLEN .
745.\"
746.It Fn m_pullup mbuf len
747Arrange that the first
748.Fa len
749bytes of an
750.Vt mbuf chain
751are contiguous and lie in the data area of
752.Fa mbuf ,
753so they are accessible with
754.Fn mtod mbuf type .
755It is important to remember that this may involve
756reallocating some mbufs and moving data so all pointers
757referencing data within the old mbuf chain
758must be recalculated or made invalid.
759Return the new
760.Vt mbuf chain
761on success,
762.Dv NULL
763on failure
764(the
765.Vt mbuf chain
766is freed in this case).
767.Sy Note :
768It does not allocate any
769.Vt mbuf clusters ,
770so
771.Fa len
772must be less than or equal to
773.Dv MHLEN .
774.\"
775.It Fn m_pulldown mbuf offset len offsetp
776Arrange that
777.Fa len
778bytes between
779.Fa offset
780and
781.Fa offset + len
782in the
783.Vt mbuf chain
784are contiguous and lie in the data area of
785.Fa mbuf ,
786so they are accessible with
787.Fn mtod
788or
789.Fn mtodo .
790.Fa len
791must be smaller than, or equal to, the size of an
792.Vt mbuf cluster .
793Return a pointer to an intermediate
794.Vt mbuf
795in the chain containing the requested region;
796the offset in the data region of the
797.Vt mbuf chain
798to the data contained in the returned mbuf is stored in
799.Fa *offsetp .
800If
801.Fa offsetp
802is NULL, the region may be accessed using
803.Fn mtod mbuf type
804or
805.Fn mtodo mbuf 0 .
806If
807.Fa offsetp
808is non-NULL, the region may be accessed using
809.Fn mtodo mbuf *offsetp .
810The region of the mbuf chain between its beginning and
811.Fa offset
812is not modified, therefore it is safe to hold pointers to data within
813this region before calling
814.Fn m_pulldown .
815.\"
816.It Fn m_copym mbuf offset len how
817Make a copy of an
818.Vt mbuf chain
819starting
820.Fa offset
821bytes from the beginning, continuing for
822.Fa len
823bytes.
824If
825.Fa len
826is
827.Dv M_COPYALL ,
828copy to the end of the
829.Vt mbuf chain .
830.Sy Note :
831The copy is read-only, because the
832.Vt mbuf clusters
833are not copied, only their reference counts are incremented.
834.\"
835.It Fn m_copypacket mbuf how
836Copy an entire packet including header, which must be present.
837This is an optimized version of the common case
838.Fn m_copym mbuf 0 M_COPYALL how .
839.Sy Note :
840the copy is read-only, because the
841.Vt mbuf clusters
842are not copied, only their reference counts are incremented.
843.\"
844.It Fn m_dup mbuf how
845Copy a packet header
846.Vt mbuf chain
847into a completely new
848.Vt mbuf chain ,
849including copying any
850.Vt mbuf clusters .
851Use this instead of
852.Fn m_copypacket
853when you need a writable copy of an
854.Vt mbuf chain .
855.\"
856.It Fn m_copydata mbuf offset len buf
857Copy data from an
858.Vt mbuf chain
859starting
860.Fa offset
861bytes from the beginning, continuing for
862.Fa len
863bytes, into the indicated buffer
864.Fa buf .
865.\"
866.It Fn m_copyback mbuf offset len buf
867Copy
868.Fa len
869bytes from the buffer
870.Fa buf
871back into the indicated
872.Vt mbuf chain ,
873starting at
874.Fa offset
875bytes from the beginning of the
876.Vt mbuf chain ,
877extending the
878.Vt mbuf chain
879if necessary.
880.Sy Note :
881It does not allocate any
882.Vt mbuf clusters ,
883just adds
884.Vt mbufs
885to the
886.Vt mbuf chain .
887It is safe to set
888.Fa offset
889beyond the current
890.Vt mbuf chain
891end: zeroed
892.Vt mbufs
893will be allocated to fill the space.
894.\"
895.It Fn m_length mbuf last
896Return the length of the
897.Vt mbuf chain ,
898and optionally a pointer to the last
899.Vt mbuf .
900.\"
901.It Fn m_dup_pkthdr to from how
902Upon the function's completion, the
903.Vt mbuf
904.Fa to
905will contain an identical copy of
906.Fa from->m_pkthdr
907and the per-packet attributes found in the
908.Vt mbuf chain
909.Fa from .
910The
911.Vt mbuf
912.Fa from
913must have the flag
914.Dv M_PKTHDR
915initially set, and
916.Fa to
917must be empty on entry.
918.\"
919.It Fn m_move_pkthdr to from
920Move
921.Va m_pkthdr
922and the per-packet attributes from the
923.Vt mbuf chain
924.Fa from
925to the
926.Vt mbuf
927.Fa to .
928The
929.Vt mbuf
930.Fa from
931must have the flag
932.Dv M_PKTHDR
933initially set, and
934.Fa to
935must be empty on entry.
936Upon the function's completion,
937.Fa from
938will have the flag
939.Dv M_PKTHDR
940and the per-packet attributes cleared.
941.\"
942.It Fn m_fixhdr mbuf
943Set the packet-header length to the length of the
944.Vt mbuf chain .
945.\"
946.It Fn m_devget buf len offset ifp copy
947Copy data from a device local memory pointed to by
948.Fa buf
949to an
950.Vt mbuf chain .
951The copy is done using a specified copy routine
952.Fa copy ,
953or
954.Fn bcopy
955if
956.Fa copy
957is
958.Dv NULL .
959.\"
960.It Fn m_cat m n
961Concatenate
962.Fa n
963to
964.Fa m .
965Both
966.Vt mbuf chains
967must be of the same type.
968.Fa n
969is not guaranteed to be valid after
970.Fn m_cat
971returns.
972.Fn m_cat
973does not update any packet header fields or free mbuf tags.
974.\"
975.It Fn m_catpkt m n
976A variant of
977.Fn m_cat
978that operates on packets.
979Both
980.Fa m
981and
982.Fa n
983must contain packet headers.
984.Fa n
985is not guaranteed to be valid after
986.Fn m_catpkt
987returns.
988.\"
989.It Fn m_split mbuf len how
990Partition an
991.Vt mbuf chain
992in two pieces, returning the tail:
993all but the first
994.Fa len
995bytes.
996In case of failure, it returns
997.Dv NULL
998and attempts to restore the
999.Vt mbuf chain
1000to its original state.
1001.\"
1002.It Fn m_apply mbuf off len f arg
1003Apply a function to an
1004.Vt mbuf chain ,
1005at offset
1006.Fa off ,
1007for length
1008.Fa len
1009bytes.
1010Typically used to avoid calls to
1011.Fn m_pullup
1012which would otherwise be unnecessary or undesirable.
1013.Fa arg
1014is a convenience argument which is passed to the callback function
1015.Fa f .
1016.Pp
1017Each time
1018.Fn f
1019is called, it will be passed
1020.Fa arg ,
1021a pointer to the
1022.Fa data
1023in the current mbuf, and the length
1024.Fa len
1025of the data in this mbuf to which the function should be applied.
1026.Pp
1027The function should return zero to indicate success;
1028otherwise, if an error is indicated, then
1029.Fn m_apply
1030will return the error and stop iterating through the
1031.Vt mbuf chain .
1032.\"
1033.It Fn m_getptr mbuf loc off
1034Return a pointer to the mbuf containing the data located at
1035.Fa loc
1036bytes from the beginning of the
1037.Vt mbuf chain .
1038The corresponding offset into the mbuf will be stored in
1039.Fa *off .
1040.It Fn m_defrag m0 how
1041Defragment an mbuf chain, returning the shortest possible
1042chain of mbufs and clusters.
1043If allocation fails and this cannot be completed,
1044.Dv NULL
1045will be returned and the original chain will be unchanged.
1046Upon success, the original chain will be freed and the new
1047chain will be returned.
1048.Fa how
1049should be either
1050.Dv M_WAITOK
1051or
1052.Dv M_NOWAIT ,
1053depending on the caller's preference.
1054.Pp
1055This function is especially useful in network drivers, where
1056certain long mbuf chains must be shortened before being added
1057to TX descriptor lists.
1058.It Fn m_collapse m0 how maxfrags
1059Defragment an mbuf chain, returning a chain of at most
1060.Fa maxfrags
1061mbufs and clusters.
1062If allocation fails or the chain cannot be collapsed as requested,
1063.Dv NULL
1064will be returned, with the original chain possibly modified.
1065As with
1066.Fn m_defrag ,
1067.Fa how
1068should be one of
1069.Dv M_WAITOK
1070or
1071.Dv M_NOWAIT .
1072.It Fn m_unshare m0 how
1073Create a version of the specified mbuf chain whose
1074contents can be safely modified without affecting other users.
1075If allocation fails and this operation cannot be completed,
1076.Dv NULL
1077will be returned.
1078The original mbuf chain is always reclaimed and the reference
1079count of any shared mbuf clusters is decremented.
1080.Fa how
1081should be either
1082.Dv M_WAITOK
1083or
1084.Dv M_NOWAIT ,
1085depending on the caller's preference.
1086As a side-effect of this process the returned
1087mbuf chain may be compacted.
1088.Pp
1089This function is especially useful in the transmit path of
1090network code, when data must be encrypted or otherwise
1091altered prior to transmission.
1092.El
1093.Sh HARDWARE-ASSISTED CHECKSUM CALCULATION
1094This section currently applies to TCP/IP only.
1095In order to save the host CPU resources, computing checksums is
1096offloaded to the network interface hardware if possible.
1097The
1098.Va m_pkthdr
1099member of the leading
1100.Vt mbuf
1101of a packet contains two fields used for that purpose,
1102.Vt int Va csum_flags
1103and
1104.Vt int Va csum_data .
1105The meaning of those fields depends on the direction a packet flows in,
1106and on whether the packet is fragmented.
1107Henceforth,
1108.Va csum_flags
1109or
1110.Va csum_data
1111of a packet
1112will denote the corresponding field of the
1113.Va m_pkthdr
1114member of the leading
1115.Vt mbuf
1116in the
1117.Vt mbuf chain
1118containing the packet.
1119.Pp
1120On output, checksum offloading is attempted after the outgoing
1121interface has been determined for a packet.
1122The interface-specific field
1123.Va ifnet.if_data.ifi_hwassist
1124(see
1125.Xr ifnet 9 )
1126is consulted for the capabilities of the interface to assist in
1127computing checksums.
1128The
1129.Va csum_flags
1130field of the packet header is set to indicate which actions the interface
1131is supposed to perform on it.
1132The actions unsupported by the network interface are done in the
1133software prior to passing the packet down to the interface driver;
1134such actions will never be requested through
1135.Va csum_flags .
1136.Pp
1137The flags demanding a particular action from an interface are as follows:
1138.Bl -tag -width ".Dv CSUM_TCP" -offset indent
1139.It Dv CSUM_IP
1140The IP header checksum is to be computed and stored in the
1141corresponding field of the packet.
1142The hardware is expected to know the format of an IP header
1143to determine the offset of the IP checksum field.
1144.It Dv CSUM_TCP
1145The TCP checksum is to be computed.
1146(See below.)
1147.It Dv CSUM_UDP
1148The UDP checksum is to be computed.
1149(See below.)
1150.El
1151.Pp
1152Should a TCP or UDP checksum be offloaded to the hardware,
1153the field
1154.Va csum_data
1155will contain the byte offset of the checksum field relative to the
1156end of the IP header.
1157In this case, the checksum field will be initially
1158set by the TCP/IP module to the checksum of the pseudo header
1159defined by the TCP and UDP specifications.
1160.Pp
1161On input, an interface indicates the actions it has performed
1162on a packet by setting one or more of the following flags in
1163.Va csum_flags
1164associated with the packet:
1165.Bl -tag -width ".Dv CSUM_IP_CHECKED" -offset indent
1166.It Dv CSUM_IP_CHECKED
1167The IP header checksum has been computed.
1168.It Dv CSUM_IP_VALID
1169The IP header has a valid checksum.
1170This flag can appear only in combination with
1171.Dv CSUM_IP_CHECKED .
1172.It Dv CSUM_DATA_VALID
1173The checksum of the data portion of the IP packet has been computed
1174and stored in the field
1175.Va csum_data
1176in network byte order.
1177.It Dv CSUM_PSEUDO_HDR
1178Can be set only along with
1179.Dv CSUM_DATA_VALID
1180to indicate that the IP data checksum found in
1181.Va csum_data
1182allows for the pseudo header defined by the TCP and UDP specifications.
1183Otherwise the checksum of the pseudo header must be calculated by
1184the host CPU and added to
1185.Va csum_data
1186to obtain the final checksum to be used for TCP or UDP validation purposes.
1187.El
1188.Pp
1189If a particular network interface just indicates success or
1190failure of TCP or UDP checksum validation without returning
1191the exact value of the checksum to the host CPU, its driver can mark
1192.Dv CSUM_DATA_VALID
1193and
1194.Dv CSUM_PSEUDO_HDR
1195in
1196.Va csum_flags ,
1197and set
1198.Va csum_data
1199to
1200.Li 0xFFFF
1201hexadecimal to indicate a valid checksum.
1202It is a peculiarity of the algorithm used that the Internet checksum
1203calculated over any valid packet will be
1204.Li 0xFFFF
1205as long as the original checksum field is included.
1206.Sh STRESS TESTING
1207When running a kernel compiled with the option
1208.Dv MBUF_STRESS_TEST ,
1209the following
1210.Xr sysctl 8 Ns
1211-controlled options may be used to create
1212various failure/extreme cases for testing of network drivers
1213and other parts of the kernel that rely on
1214.Vt mbufs .
1215.Bl -tag -width indent
1216.It Va net.inet.ip.mbuf_frag_size
1217Causes
1218.Fn ip_output
1219to fragment outgoing
1220.Vt mbuf chains
1221into fragments of the specified size.
1222Setting this variable to 1 is an excellent way to
1223test the long
1224.Vt mbuf chain
1225handling ability of network drivers.
1226.It Va kern.ipc.m_defragrandomfailures
1227Causes the function
1228.Fn m_defrag
1229to randomly fail, returning
1230.Dv NULL .
1231Any piece of code which uses
1232.Fn m_defrag
1233should be tested with this feature.
1234.El
1235.Sh RETURN VALUES
1236See above.
1237.Sh SEE ALSO
1238.Xr ifnet 9 ,
1239.Xr mbuf_tags 9
1240.Rs
1241.\" 4.4BSD SMM:18
1242.%A S. J. Leffler
1243.%A W. N. Joy
1244.%A R. S. Fabry
1245.%A M. J. Karels
1246.%T Networking Implementation Notes
1247.%B 4.4BSD System Manager's Manual (SMM)
1248.Re
1249.Sh HISTORY
1250.\" Please correct me if I'm wrong
1251.Vt Mbufs
1252appeared in an early version of
1253.Bx .
1254Besides being used for network packets, they were used
1255to store various dynamic structures, such as routing table
1256entries, interface addresses, protocol control blocks, etc.
1257In more recent
1258.Fx
1259use of
1260.Vt mbufs
1261is almost entirely limited to packet storage, with
1262.Xr uma 9
1263zones being used directly to store other network-related memory.
1264.Pp
1265Historically, the
1266.Vt mbuf
1267allocator has been a special-purpose memory allocator able to run in
1268interrupt contexts and allocating from a special kernel address space map.
1269As of
1270.Fx 5.3 ,
1271the
1272.Vt mbuf
1273allocator is a wrapper around
1274.Xr uma 9 ,
1275allowing caching of
1276.Vt mbufs ,
1277clusters, and
1278.Vt mbuf
1279+ cluster pairs in per-CPU caches, as well as bringing other benefits of
1280slab allocation.
1281.Sh AUTHORS
1282The original
1283.Nm
1284manual page was written by
1285.An Yar Tikhiy .
1286The
1287.Xr uma 9
1288.Vt mbuf
1289allocator was written by
1290.An Bosko Milekic .
1291