xref: /freebsd/share/man/man9/mbuf.9 (revision e4e9813eb92cd7c4d4b819a8fbed5cbd3d92f5d8)
1.\" Copyright (c) 2000 FreeBSD Inc.
2.\" All rights reserved.
3.\"
4.\" Redistribution and use in source and binary forms, with or without
5.\" modification, are permitted provided that the following conditions
6.\" are met:
7.\" 1. Redistributions of source code must retain the above copyright
8.\"    notice, this list of conditions and the following disclaimer.
9.\" 2. Redistributions in binary form must reproduce the above copyright
10.\"    notice, this list of conditions and the following disclaimer in the
11.\"    documentation and/or other materials provided with the distribution.
12.\"
13.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16.\" ARE DISCLAIMED.  IN NO EVENT SHALL [your name] OR CONTRIBUTORS BE LIABLE
17.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23.\" SUCH DAMAGE.
24.\"
25.\" $FreeBSD$
26.\"
27.Dd July 24, 2006
28.Dt MBUF 9
29.Os
30.\"
31.Sh NAME
32.Nm mbuf
33.Nd "memory management in the kernel IPC subsystem"
34.\"
35.Sh SYNOPSIS
36.In sys/param.h
37.In sys/systm.h
38.In sys/mbuf.h
39.\"
40.Ss Mbuf allocation macros
41.Fn MGET "struct mbuf *mbuf" "int how" "short type"
42.Fn MGETHDR "struct mbuf *mbuf" "int how" "short type"
43.Fn MCLGET "struct mbuf *mbuf" "int how"
44.Fo MEXTADD
45.Fa "struct mbuf *mbuf"
46.Fa "caddr_t buf"
47.Fa "u_int size"
48.Fa "void (*free)(void *opt_args)"
49.Fa "void *opt_args"
50.Fa "short flags"
51.Fa "int type"
52.Fc
53.Fn MEXTFREE "struct mbuf *mbuf"
54.Fn MFREE "struct mbuf *mbuf" "struct mbuf *successor"
55.\"
56.Ss Mbuf utility macros
57.Fn mtod "struct mbuf *mbuf" "type"
58.Fn M_ALIGN "struct mbuf *mbuf" "u_int len"
59.Fn MH_ALIGN "struct mbuf *mbuf" "u_int len"
60.Ft int
61.Fn M_LEADINGSPACE "struct mbuf *mbuf"
62.Ft int
63.Fn M_TRAILINGSPACE "struct mbuf *mbuf"
64.Fn M_MOVE_PKTHDR "struct mbuf *to" "struct mbuf *from"
65.Fn M_PREPEND "struct mbuf *mbuf" "int len" "int how"
66.Fn MCHTYPE "struct mbuf *mbuf" "u_int type"
67.Ft int
68.Fn M_WRITABLE "struct mbuf *mbuf"
69.\"
70.Ss Mbuf allocation functions
71.Ft struct mbuf *
72.Fn m_get "int how" "int type"
73.Ft struct mbuf *
74.Fn m_getm "struct mbuf *orig" "int len" "int how" "int type"
75.Ft struct mbuf *
76.Fn m_getcl "int how" "short type" "int flags"
77.Ft struct mbuf *
78.Fn m_getclr "int how" "int type"
79.Ft struct mbuf *
80.Fn m_gethdr "int how" "int type"
81.Ft struct mbuf *
82.Fn m_free "struct mbuf *mbuf"
83.Ft void
84.Fn m_freem "struct mbuf *mbuf"
85.\"
86.Ss Mbuf utility functions
87.Ft void
88.Fn m_adj "struct mbuf *mbuf" "int len"
89.Ft void
90.Fn m_align "struct mbuf *mbuf" "int len"
91.Ft int
92.Fn m_append "struct mbuf *mbuf" "int len" "c_caddr_t cp"
93.Ft struct mbuf *
94.Fn m_prepend "struct mbuf *mbuf" "int len" "int how"
95.Ft struct mbuf *
96.Fn m_copyup "struct mbuf *mbuf" "int len" "int dstoff"
97.Ft struct mbuf *
98.Fn m_pullup "struct mbuf *mbuf" "int len"
99.Ft struct mbuf *
100.Fn m_copym "struct mbuf *mbuf" "int offset" "int len" "int how"
101.Ft struct mbuf *
102.Fn m_copypacket "struct mbuf *mbuf" "int how"
103.Ft struct mbuf *
104.Fn m_dup "struct mbuf *mbuf" "int how"
105.Ft void
106.Fn m_copydata "const struct mbuf *mbuf" "int offset" "int len" "caddr_t buf"
107.Ft void
108.Fn m_copyback "struct mbuf *mbuf" "int offset" "int len" "caddr_t buf"
109.Ft struct mbuf *
110.Fo m_devget
111.Fa "char *buf"
112.Fa "int len"
113.Fa "int offset"
114.Fa "struct ifnet *ifp"
115.Fa "void (*copy)(char *from, caddr_t to, u_int len)"
116.Fc
117.Ft void
118.Fn m_cat "struct mbuf *m" "struct mbuf *n"
119.Ft u_int
120.Fn m_fixhdr "struct mbuf *mbuf"
121.Ft int
122.Fn m_dup_pkthdr "struct mbuf *to" "struct mbuf *from" "int how"
123.Ft void
124.Fn m_move_pkthdr "struct mbuf *to" "struct mbuf *from"
125.Ft u_int
126.Fn m_length "struct mbuf *mbuf" "struct mbuf **last"
127.Ft struct mbuf *
128.Fn m_split "struct mbuf *mbuf" "int len" "int how"
129.Ft int
130.Fn m_apply "struct mbuf *mbuf" "int off" "int len" "int (*f)(void *arg, void *data, u_int len)" "void *arg"
131.Ft struct mbuf *
132.Fn m_getptr "struct mbuf *mbuf" "int loc" "int *off"
133.Ft struct mbuf *
134.Fn m_defrag "struct mbuf *m0" "int how"
135.Ft struct mbuf *
136.Fn m_unshare "struct mbuf *m0" "int how"
137.\"
138.Sh DESCRIPTION
139An
140.Vt mbuf
141is a basic unit of memory management in the kernel IPC subsystem.
142Network packets and socket buffers are stored in
143.Vt mbufs .
144A network packet may span multiple
145.Vt mbufs
146arranged into a
147.Vt mbuf chain
148(linked list),
149which allows adding or trimming
150network headers with little overhead.
151.Pp
152While a developer should not bother with
153.Vt mbuf
154internals without serious
155reason in order to avoid incompatibilities with future changes, it
156is useful to understand the general structure of an
157.Vt mbuf .
158.Pp
159An
160.Vt mbuf
161consists of a variable-sized header and a small internal
162buffer for data.
163The total size of an
164.Vt mbuf ,
165.Dv MSIZE ,
166is a constant defined in
167.In sys/param.h .
168The
169.Vt mbuf
170header includes:
171.Pp
172.Bl -tag -width "m_nextpkt" -offset indent
173.It Va m_next
174.Pq Vt struct mbuf *
175A pointer to the next
176.Vt mbuf
177in the
178.Vt mbuf chain .
179.It Va m_nextpkt
180.Pq Vt struct mbuf *
181A pointer to the next
182.Vt mbuf chain
183in the queue.
184.It Va m_data
185.Pq Vt caddr_t
186A pointer to data attached to this
187.Vt mbuf .
188.It Va m_len
189.Pq Vt int
190The length of the data.
191.It Va m_type
192.Pq Vt short
193The type of the data.
194.It Va m_flags
195.Pq Vt int
196The
197.Vt mbuf
198flags.
199.El
200.Pp
201The
202.Vt mbuf
203flag bits are defined as follows:
204.Bd -literal
205/* mbuf flags */
206#define	M_EXT		0x0001	/* has associated external storage */
207#define	M_PKTHDR	0x0002	/* start of record */
208#define	M_EOR		0x0004	/* end of record */
209#define	M_RDONLY	0x0008	/* associated data marked read-only */
210#define	M_PROTO1	0x0010	/* protocol-specific */
211#define	M_PROTO2	0x0020 	/* protocol-specific */
212#define	M_PROTO3	0x0040	/* protocol-specific */
213#define	M_PROTO4	0x0080	/* protocol-specific */
214#define	M_PROTO5	0x0100	/* protocol-specific */
215#define	M_PROTO6	0x4000	/* protocol-specific (avoid M_BCAST conflict) */
216#define	M_FREELIST	0x8000	/* mbuf is on the free list */
217
218/* mbuf pkthdr flags (also stored in m_flags) */
219#define	M_BCAST		0x0200	/* send/received as link-level broadcast */
220#define	M_MCAST		0x0400	/* send/received as link-level multicast */
221#define	M_FRAG		0x0800	/* packet is fragment of larger packet */
222#define	M_FIRSTFRAG	0x1000	/* packet is first fragment */
223#define	M_LASTFRAG	0x2000	/* packet is last fragment */
224.Ed
225.Pp
226The available
227.Vt mbuf
228types are defined as follows:
229.Bd -literal
230/* mbuf types */
231#define	MT_DATA		1	/* dynamic (data) allocation */
232#define	MT_HEADER	MT_DATA	/* packet header */
233#define	MT_SONAME	8	/* socket name */
234#define	MT_CONTROL	14	/* extra-data protocol message */
235#define	MT_OOBDATA	15	/* expedited data */
236.Ed
237.Pp
238If the
239.Dv M_PKTHDR
240flag is set, a
241.Vt struct pkthdr Va m_pkthdr
242is added to the
243.Vt mbuf
244header.
245It contains a pointer to the interface
246the packet has been received from
247.Pq Vt struct ifnet Va *rcvif ,
248and the total packet length
249.Pq Vt int Va len .
250Optionally, it may also contain an attached list of packet tags
251.Pq Vt "struct m_tag" .
252See
253.Xr mbuf_tags 9
254for details.
255Fields used in offloading checksum calculation to the hardware are kept in
256.Va m_pkthdr
257as well.
258See
259.Sx HARDWARE-ASSISTED CHECKSUM CALCULATION
260for details.
261.Pp
262If small enough, data is stored in the internal data buffer of an
263.Vt mbuf .
264If the data is sufficiently large, another
265.Vt mbuf
266may be added to the
267.Vt mbuf chain ,
268or external storage may be associated with the
269.Vt mbuf .
270.Dv MHLEN
271bytes of data can fit into an
272.Vt mbuf
273with the
274.Dv M_PKTHDR
275flag set,
276.Dv MLEN
277bytes can otherwise.
278.Pp
279If external storage is being associated with an
280.Vt mbuf ,
281the
282.Va m_ext
283header is added at the cost of losing the internal data buffer.
284It includes a pointer to external storage, the size of the storage,
285a pointer to a function used for freeing the storage,
286a pointer to an optional argument that can be passed to the function,
287and a pointer to a reference counter.
288An
289.Vt mbuf
290using external storage has the
291.Dv M_EXT
292flag set.
293.Pp
294The system supplies a macro for allocating the desired external storage
295buffer,
296.Dv MEXTADD .
297.Pp
298The allocation and management of the reference counter is handled by the
299subsystem.
300.Pp
301The system also supplies a default type of external storage buffer called an
302.Vt mbuf cluster .
303.Vt Mbuf clusters
304can be allocated and configured with the use of the
305.Dv MCLGET
306macro.
307Each
308.Vt mbuf cluster
309is
310.Dv MCLBYTES
311in size, where MCLBYTES is a machine-dependent constant.
312The system defines an advisory macro
313.Dv MINCLSIZE ,
314which is the smallest amount of data to put into an
315.Vt mbuf cluster .
316It is equal to the sum of
317.Dv MLEN
318and
319.Dv MHLEN .
320It is typically preferable to store data into the data region of an
321.Vt mbuf ,
322if size permits, as opposed to allocating a separate
323.Vt mbuf cluster
324to hold the same data.
325.\"
326.Ss Macros and Functions
327There are numerous predefined macros and functions that provide the
328developer with common utilities.
329.\"
330.Bl -ohang -offset indent
331.It Fn mtod mbuf type
332Convert an
333.Fa mbuf
334pointer to a data pointer.
335The macro expands to the data pointer cast to the pointer of the specified
336.Fa type .
337.Sy Note :
338It is advisable to ensure that there is enough contiguous data in
339.Fa mbuf .
340See
341.Fn m_pullup
342for details.
343.It Fn MGET mbuf how type
344Allocate an
345.Vt mbuf
346and initialize it to contain internal data.
347.Fa mbuf
348will point to the allocated
349.Vt mbuf
350on success, or be set to
351.Dv NULL
352on failure.
353The
354.Fa how
355argument is to be set to
356.Dv M_TRYWAIT
357or
358.Dv M_DONTWAIT .
359It specifies whether the caller is willing to block if necessary.
360If
361.Fa how
362is set to
363.Dv M_TRYWAIT ,
364a failed allocation will result in the caller being put
365to sleep for a designated
366.Va kern.ipc.mbuf_wait
367.Xr ( sysctl 8
368tunable)
369number of ticks.
370A number of other functions and macros related to
371.Vt mbufs
372have the same argument because they may
373at some point need to allocate new
374.Vt mbufs .
375.Pp
376Programmers should be careful not to confuse the
377.Vt mbuf
378allocation flag
379.Dv M_DONTWAIT
380with the
381.Xr malloc 9
382allocation flag,
383.Dv M_NOWAIT .
384They are not the same.
385.It Fn MGETHDR mbuf how type
386Allocate an
387.Vt mbuf
388and initialize it to contain a packet header
389and internal data.
390See
391.Fn MGET
392for details.
393.It Fn MCLGET mbuf how
394Allocate and attach an
395.Vt mbuf cluster
396to
397.Fa mbuf .
398If the macro fails, the
399.Dv M_EXT
400flag will not be set in
401.Fa mbuf .
402.It Fn M_ALIGN mbuf len
403Set the pointer
404.Fa mbuf->m_data
405to place an object of the size
406.Fa len
407at the end of the internal data area of
408.Fa mbuf ,
409long word aligned.
410Applicable only if
411.Fa mbuf
412is newly allocated with
413.Fn MGET
414or
415.Fn m_get .
416.It Fn MH_ALIGN mbuf len
417Serves the same purpose as
418.Fn M_ALIGN
419does, but only for
420.Fa mbuf
421newly allocated with
422.Fn MGETHDR
423or
424.Fn m_gethdr ,
425or initialized by
426.Fn m_dup_pkthdr
427or
428.Fn m_move_pkthdr .
429.It Fn m_align mbuf len
430Serves the same purpose as
431.Fn M_ALIGN
432but handles any type of mbuf.
433.It Fn M_LEADINGSPACE mbuf
434Returns the number of bytes available before the beginning
435of data in
436.Fa mbuf .
437.It Fn M_TRAILINGSPACE mbuf
438Returns the number of bytes available after the end of data in
439.Fa mbuf .
440.It Fn M_PREPEND mbuf len how
441This macro operates on an
442.Vt mbuf chain .
443It is an optimized wrapper for
444.Fn m_prepend
445that can make use of possible empty space before data
446(e.g.\& left after trimming of a link-layer header).
447The new
448.Vt mbuf chain
449pointer or
450.Dv NULL
451is in
452.Fa mbuf
453after the call.
454.It Fn M_MOVE_PKTHDR to from
455Using this macro is equivalent to calling
456.Fn m_move_pkthdr to from .
457.It Fn M_WRITABLE mbuf
458This macro will evaluate true if
459.Fa mbuf
460is not marked
461.Dv M_RDONLY
462and if either
463.Fa mbuf
464does not contain external storage or,
465if it does,
466then if the reference count of the storage is not greater than 1.
467The
468.Dv M_RDONLY
469flag can be set in
470.Fa mbuf->m_flags .
471This can be achieved during setup of the external storage,
472by passing the
473.Dv M_RDONLY
474bit as a
475.Fa flags
476argument to the
477.Fn MEXTADD
478macro, or can be directly set in individual
479.Vt mbufs .
480.It Fn MCHTYPE mbuf type
481Change the type of
482.Fa mbuf
483to
484.Fa type .
485This is a relatively expensive operation and should be avoided.
486.El
487.Pp
488The functions are:
489.Bl -ohang -offset indent
490.It Fn m_get how type
491A function version of
492.Fn MGET
493for non-critical paths.
494.It Fn m_getm orig len how type
495Allocate
496.Fa len
497bytes worth of
498.Vt mbufs
499and
500.Vt mbuf clusters
501if necessary and append the resulting allocated
502.Vt mbuf chain
503to the
504.Vt mbuf chain
505.Fa orig ,
506if it is
507.No non- Ns Dv NULL .
508If the allocation fails at any point,
509free whatever was allocated and return
510.Dv NULL .
511If
512.Fa orig
513is
514.No non- Ns Dv NULL ,
515it will not be freed.
516It is possible to use
517.Fn m_getm
518to either append
519.Fa len
520bytes to an existing
521.Vt mbuf
522or
523.Vt mbuf chain
524(for example, one which may be sitting in a pre-allocated ring)
525or to simply perform an all-or-nothing
526.Vt mbuf
527and
528.Vt mbuf cluster
529allocation.
530.It Fn m_gethdr how type
531A function version of
532.Fn MGETHDR
533for non-critical paths.
534.It Fn m_getcl how type flags
535Fetch an
536.Vt mbuf
537with a
538.Vt mbuf cluster
539attached to it.
540If one of the allocations fails, the entire allocation fails.
541This routine is the preferred way of fetching both the
542.Vt mbuf
543and
544.Vt mbuf cluster
545together, as it avoids having to unlock/relock between allocations.
546Returns
547.Dv NULL
548on failure.
549.It Fn m_getclr how type
550Allocate an
551.Vt mbuf
552and zero out the data region.
553.It Fn m_free mbuf
554Frees
555.Vt mbuf .
556Returns
557.Va m_next
558of the freed
559.Vt mbuf .
560.El
561.Pp
562The functions below operate on
563.Vt mbuf chains .
564.Bl -ohang -offset indent
565.It Fn m_freem mbuf
566Free an entire
567.Vt mbuf chain ,
568including any external storage.
569.\"
570.It Fn m_adj mbuf len
571Trim
572.Fa len
573bytes from the head of an
574.Vt mbuf chain
575if
576.Fa len
577is positive, from the tail otherwise.
578.\"
579.It Fn m_append mbuf len cp
580Append
581.Fa len
582bytes of data
583.Fa cp
584to the
585.Vt mbuf chain .
586Extend the mbuf chain if the new data does not fit in
587existing space.
588.\"
589.It Fn m_prepend mbuf len how
590Allocate a new
591.Vt mbuf
592and prepend it to the
593.Vt mbuf chain ,
594handling
595.Dv M_PKTHDR
596properly.
597.Sy Note :
598It does not allocate any
599.Vt mbuf clusters ,
600so
601.Fa len
602must be less than
603.Dv MLEN
604or
605.Dv MHLEN ,
606depending on the
607.Dv M_PKTHDR
608flag setting.
609.\"
610.It Fn m_copyup mbuf len dstoff
611Similar to
612.Fn m_pullup
613but copies
614.Fa len
615bytes of data into a new mbuf at
616.Fa dstoff
617bytes into the mbuf.
618The
619.Fa dstoff
620argument aligns the data and leaves room for a link layer header.
621Returns the new
622.Vt mbuf chain
623on success,
624and frees the
625.Vt mbuf chain
626and returns
627.Dv NULL
628on failure.
629.Sy Note :
630The function does not allocate
631.Vt mbuf clusters ,
632so
633.Fa len + dstoff
634must be less than
635.Dv MHLEN .
636.\"
637.It Fn m_pullup mbuf len
638Arrange that the first
639.Fa len
640bytes of an
641.Vt mbuf chain
642are contiguous and lie in the data area of
643.Fa mbuf ,
644so they are accessible with
645.Fn mtod mbuf type .
646Return the new
647.Vt mbuf chain
648on success,
649.Dv NULL
650on failure
651(the
652.Vt mbuf chain
653is freed in this case).
654.Sy Note :
655It does not allocate any
656.Vt mbuf clusters ,
657so
658.Fa len
659must be less than
660.Dv MHLEN .
661.\"
662.It Fn m_copym mbuf offset len how
663Make a copy of an
664.Vt mbuf chain
665starting
666.Fa offset
667bytes from the beginning, continuing for
668.Fa len
669bytes.
670If
671.Fa len
672is
673.Dv M_COPYALL ,
674copy to the end of the
675.Vt mbuf chain .
676.Sy Note :
677The copy is read-only, because the
678.Vt mbuf clusters
679are not copied, only their reference counts are incremented.
680.\"
681.It Fn m_copypacket mbuf how
682Copy an entire packet including header, which must be present.
683This is an optimized version of the common case
684.Fn m_copym mbuf 0 M_COPYALL how .
685.Sy Note :
686the copy is read-only, because the
687.Vt mbuf clusters
688are not copied, only their reference counts are incremented.
689.\"
690.It Fn m_dup mbuf how
691Copy a packet header
692.Vt mbuf chain
693into a completely new
694.Vt mbuf chain ,
695including copying any
696.Vt mbuf clusters .
697Use this instead of
698.Fn m_copypacket
699when you need a writable copy of an
700.Vt mbuf chain .
701.\"
702.It Fn m_copydata mbuf offset len buf
703Copy data from an
704.Vt mbuf chain
705starting
706.Fa offset
707bytes from the beginning, continuing for
708.Fa len
709bytes, into the indicated buffer
710.Fa buf .
711.\"
712.It Fn m_copyback mbuf offset len buf
713Copy
714.Fa len
715bytes from the buffer
716.Fa buf
717back into the indicated
718.Vt mbuf chain ,
719starting at
720.Fa offset
721bytes from the beginning of the
722.Vt mbuf chain ,
723extending the
724.Vt mbuf chain
725if necessary.
726.Sy Note :
727It does not allocate any
728.Vt mbuf clusters ,
729just adds
730.Vt mbufs
731to the
732.Vt mbuf chain .
733It is safe to set
734.Fa offset
735beyond the current
736.Vt mbuf chain
737end: zeroed
738.Vt mbufs
739will be allocated to fill the space.
740.\"
741.It Fn m_length mbuf last
742Return the length of the
743.Vt mbuf chain ,
744and optionally a pointer to the last
745.Vt mbuf .
746.\"
747.It Fn m_dup_pkthdr to from how
748Upon the function's completion, the
749.Vt mbuf
750.Fa to
751will contain an identical copy of
752.Fa from->m_pkthdr
753and the per-packet attributes found in the
754.Vt mbuf chain
755.Fa from .
756The
757.Vt mbuf
758.Fa from
759must have the flag
760.Dv M_PKTHDR
761initially set, and
762.Fa to
763must be empty on entry.
764.\"
765.It Fn m_move_pkthdr to from
766Move
767.Va m_pkthdr
768and the per-packet attributes from the
769.Vt mbuf chain
770.Fa from
771to the
772.Vt mbuf
773.Fa to .
774The
775.Vt mbuf
776.Fa from
777must have the flag
778.Dv M_PKTHDR
779initially set, and
780.Fa to
781must be empty on entry.
782Upon the function's completion,
783.Fa from
784will have the flag
785.Dv M_PKTHDR
786and the per-packet attributes cleared.
787.\"
788.It Fn m_fixhdr mbuf
789Set the packet-header length to the length of the
790.Vt mbuf chain .
791.\"
792.It Fn m_devget buf len offset ifp copy
793Copy data from a device local memory pointed to by
794.Fa buf
795to an
796.Vt mbuf chain .
797The copy is done using a specified copy routine
798.Fa copy ,
799or
800.Fn bcopy
801if
802.Fa copy
803is
804.Dv NULL .
805.\"
806.It Fn m_cat m n
807Concatenate
808.Fa n
809to
810.Fa m .
811Both
812.Vt mbuf chains
813must be of the same type.
814.Fa n
815is still valid after the function returns.
816.Sy Note :
817It does not handle
818.Dv M_PKTHDR
819and friends.
820.\"
821.It Fn m_split mbuf len how
822Partition an
823.Vt mbuf chain
824in two pieces, returning the tail:
825all but the first
826.Fa len
827bytes.
828In case of failure, it returns
829.Dv NULL
830and attempts to restore the
831.Vt mbuf chain
832to its original state.
833.\"
834.It Fn m_apply mbuf off len f arg
835Apply a function to an
836.Vt mbuf chain ,
837at offset
838.Fa off ,
839for length
840.Fa len
841bytes.
842Typically used to avoid calls to
843.Fn m_pullup
844which would otherwise be unnecessary or undesirable.
845.Fa arg
846is a convenience argument which is passed to the callback function
847.Fa f .
848.Pp
849Each time
850.Fn f
851is called, it will be passed
852.Fa arg ,
853a pointer to the
854.Fa data
855in the current mbuf, and the length
856.Fa len
857of the data in this mbuf to which the function should be applied.
858.Pp
859The function should return zero to indicate success;
860otherwise, if an error is indicated, then
861.Fn m_apply
862will return the error and stop iterating through the
863.Vt mbuf chain .
864.\"
865.It Fn m_getptr mbuf loc off
866Return a pointer to the mbuf containing the data located at
867.Fa loc
868bytes from the beginning of the
869.Vt mbuf chain .
870The corresponding offset into the mbuf will be stored in
871.Fa *off .
872.It Fn m_defrag m0 how
873Defragment an mbuf chain, returning the shortest possible
874chain of mbufs and clusters.
875If allocation fails and this can not be completed,
876.Dv NULL
877will be returned and the original chain will be unchanged.
878Upon success, the original chain will be freed and the new
879chain will be returned.
880.Fa how
881should be either
882.Dv M_TRYWAIT
883or
884.Dv M_DONTWAIT ,
885depending on the caller's preference.
886.Pp
887This function is especially useful in network drivers, where
888certain long mbuf chains must be shortened before being added
889to TX descriptor lists.
890.It Fn m_unshare m0 how
891Create a version of the specified mbuf chain whose
892contents can be safely modified without affecting other users.
893If allocation fails and this operation can not be completed,
894.Dv NULL
895will be returned.
896The original mbuf chain is always reclaimed and the reference
897count of any shared mbuf clusters is decremented.
898.Fa how
899should be either
900.Dv M_TRYWAIT
901or
902.Dv M_DONTWAIT ,
903depending on the caller's preference.
904As a side-effect of this process the returned
905mbuf chain may be compacted.
906.Pp
907This function is especially useful in the transmit path of
908network code, when data must be encrypted or otherwise
909altered prior to transmission.
910.El
911.Sh HARDWARE-ASSISTED CHECKSUM CALCULATION
912This section currently applies to TCP/IP only.
913In order to save the host CPU resources, computing checksums is
914offloaded to the network interface hardware if possible.
915The
916.Va m_pkthdr
917member of the leading
918.Vt mbuf
919of a packet contains two fields used for that purpose,
920.Vt int Va csum_flags
921and
922.Vt int Va csum_data .
923The meaning of those fields depends on the direction a packet flows in,
924and on whether the packet is fragmented.
925Henceforth,
926.Va csum_flags
927or
928.Va csum_data
929of a packet
930will denote the corresponding field of the
931.Va m_pkthdr
932member of the leading
933.Vt mbuf
934in the
935.Vt mbuf chain
936containing the packet.
937.Pp
938On output, checksum offloading is attempted after the outgoing
939interface has been determined for a packet.
940The interface-specific field
941.Va ifnet.if_data.ifi_hwassist
942(see
943.Xr ifnet 9 )
944is consulted for the capabilities of the interface to assist in
945computing checksums.
946The
947.Va csum_flags
948field of the packet header is set to indicate which actions the interface
949is supposed to perform on it.
950The actions unsupported by the network interface are done in the
951software prior to passing the packet down to the interface driver;
952such actions will never be requested through
953.Va csum_flags .
954.Pp
955The flags demanding a particular action from an interface are as follows:
956.Bl -tag -width ".Dv CSUM_TCP" -offset indent
957.It Dv CSUM_IP
958The IP header checksum is to be computed and stored in the
959corresponding field of the packet.
960The hardware is expected to know the format of an IP header
961to determine the offset of the IP checksum field.
962.It Dv CSUM_TCP
963The TCP checksum is to be computed.
964(See below.)
965.It Dv CSUM_UDP
966The UDP checksum is to be computed.
967(See below.)
968.El
969.Pp
970Should a TCP or UDP checksum be offloaded to the hardware,
971the field
972.Va csum_data
973will contain the byte offset of the checksum field relative to the
974end of the IP header.
975In this case, the checksum field will be initially
976set by the TCP/IP module to the checksum of the pseudo header
977defined by the TCP and UDP specifications.
978.Pp
979For outbound packets which have been fragmented
980by the host CPU, the following will also be true,
981regardless of the checksum flag settings:
982.Bl -bullet -offset indent
983.It
984all fragments will have the flag
985.Dv M_FRAG
986set in their
987.Va m_flags
988field;
989.It
990the first and the last fragments in the chain will have
991.Dv M_FIRSTFRAG
992or
993.Dv M_LASTFRAG
994set in their
995.Va m_flags ,
996correspondingly;
997.It
998the first fragment in the chain will have the total number
999of fragments contained in its
1000.Va csum_data
1001field.
1002.El
1003.Pp
1004The last rule for fragmented packets takes precedence over the one
1005for a TCP or UDP checksum.
1006Nevertheless, offloading a TCP or UDP checksum is possible for a
1007fragmented packet if the flag
1008.Dv CSUM_IP_FRAGS
1009is set in the field
1010.Va ifnet.if_data.ifi_hwassist
1011associated with the network interface.
1012However, in this case the interface is expected to figure out
1013the location of the checksum field within the sequence of fragments
1014by itself because
1015.Va csum_data
1016contains a fragment count instead of a checksum offset value.
1017.Pp
1018On input, an interface indicates the actions it has performed
1019on a packet by setting one or more of the following flags in
1020.Va csum_flags
1021associated with the packet:
1022.Bl -tag -width ".Dv CSUM_IP_CHECKED" -offset indent
1023.It Dv CSUM_IP_CHECKED
1024The IP header checksum has been computed.
1025.It Dv CSUM_IP_VALID
1026The IP header has a valid checksum.
1027This flag can appear only in combination with
1028.Dv CSUM_IP_CHECKED .
1029.It Dv CSUM_DATA_VALID
1030The checksum of the data portion of the IP packet has been computed
1031and stored in the field
1032.Va csum_data
1033in network byte order.
1034.It Dv CSUM_PSEUDO_HDR
1035Can be set only along with
1036.Dv CSUM_DATA_VALID
1037to indicate that the IP data checksum found in
1038.Va csum_data
1039allows for the pseudo header defined by the TCP and UDP specifications.
1040Otherwise the checksum of the pseudo header must be calculated by
1041the host CPU and added to
1042.Va csum_data
1043to obtain the final checksum to be used for TCP or UDP validation purposes.
1044.El
1045.Pp
1046If a particular network interface just indicates success or
1047failure of TCP or UDP checksum validation without returning
1048the exact value of the checksum to the host CPU, its driver can mark
1049.Dv CSUM_DATA_VALID
1050and
1051.Dv CSUM_PSEUDO_HDR
1052in
1053.Va csum_flags ,
1054and set
1055.Va csum_data
1056to
1057.Li 0xFFFF
1058hexadecimal to indicate a valid checksum.
1059It is a peculiarity of the algorithm used that the Internet checksum
1060calculated over any valid packet will be
1061.Li 0xFFFF
1062as long as the original checksum field is included.
1063.Pp
1064For inbound packets which are IP fragments, all
1065.Va csum_data
1066fields will be summed during reassembly to obtain the final checksum
1067value passed to an upper layer in the
1068.Va csum_data
1069field of the reassembled packet.
1070The
1071.Va csum_flags
1072fields of all fragments will be consolidated using logical AND
1073to obtain the final value for
1074.Va csum_flags .
1075Thus, in order to successfully
1076offload checksum computation for fragmented data,
1077all fragments should have the same value of
1078.Va csum_flags .
1079.Sh STRESS TESTING
1080When running a kernel compiled with the option
1081.Dv MBUF_STRESS_TEST ,
1082the following
1083.Xr sysctl 8 Ns
1084-controlled options may be used to create
1085various failure/extreme cases for testing of network drivers
1086and other parts of the kernel that rely on
1087.Vt mbufs .
1088.Bl -tag -width indent
1089.It Va net.inet.ip.mbuf_frag_size
1090Causes
1091.Fn ip_output
1092to fragment outgoing
1093.Vt mbuf chains
1094into fragments of the specified size.
1095Setting this variable to 1 is an excellent way to
1096test the long
1097.Vt mbuf chain
1098handling ability of network drivers.
1099.It Va kern.ipc.m_defragrandomfailures
1100Causes the function
1101.Fn m_defrag
1102to randomly fail, returning
1103.Dv NULL .
1104Any piece of code which uses
1105.Fn m_defrag
1106should be tested with this feature.
1107.El
1108.Sh RETURN VALUES
1109See above.
1110.Sh SEE ALSO
1111.Xr ifnet 9 ,
1112.Xr mbuf_tags 9
1113.Sh HISTORY
1114.\" Please correct me if I'm wrong
1115.Vt Mbufs
1116appeared in an early version of
1117.Bx .
1118Besides being used for network packets, they were used
1119to store various dynamic structures, such as routing table
1120entries, interface addresses, protocol control blocks, etc.
1121.Sh AUTHORS
1122The original
1123.Nm
1124manual page was written by Yar Tikhiy.
1125