--- kern_mbuf.c	(0269ae4c19ad779b43b0d6e2416ac7386945d692)
+++ kern_mbuf.c	(82334850ea451f7f6903be20e4836118e6a77460)
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004, 2005,
  *	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
--- 31 unchanged lines hidden ---
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
+#include <sys/sf_buf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 
 #include <vm/vm.h>
--- 220 unchanged lines hidden ---
  * Zones from which we allocate.
  */
 uma_zone_t	zone_mbuf;
 uma_zone_t	zone_clust;
 uma_zone_t	zone_pack;
 uma_zone_t	zone_jumbop;
 uma_zone_t	zone_jumbo9;
 uma_zone_t	zone_jumbo16;
+uma_zone_t	zone_extpgs;
 
 /*
  * Local prototypes.
  */
 static int	mb_ctor_mbuf(void *, int, void *, int);
 static int	mb_ctor_clust(void *, int, void *, int);
 static int	mb_ctor_pack(void *, int, void *, int);
 static void	mb_dtor_mbuf(void *, int, void *);
 static void	mb_dtor_pack(void *, int, void *);
 static int	mb_zinit_pack(void *, int, int);
 static void	mb_zfini_pack(void *, int);
 static void	mb_reclaim(uma_zone_t, int);
 static void	*mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 
 /* Ensure that MSIZE is a power of 2. */
 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
 
+_Static_assert(sizeof(struct mbuf_ext_pgs) == 256,
+    "mbuf_ext_pgs size mismatch");
+
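Note: two compile-time checks guard this code.  The CTASSERT above is the
classic XOR identity for "is a power of two"; the new _Static_assert pins
sizeof(struct mbuf_ext_pgs) at 256 bytes so the UMA zone's item size (and
the layout shared with its consumers) cannot drift silently.  A small
userland illustration of the power-of-two identity (not from the commit,
names are illustrative):

	#include <assert.h>

	/* Same identity the CTASSERT uses; holds for positive powers of two. */
	#define	ISPOW2(x)	(((((x) - 1) ^ (x)) + 1) >> 1 == (x))

	int
	main(void)
	{
		assert(ISPOW2(256));	/* (255 ^ 256) + 1 = 512; 512 >> 1 = 256 */
		assert(!ISPOW2(384));	/* (383 ^ 384) + 1 = 256; 256 >> 1 = 128 */
		return (0);
	}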
 /*
  * Initialize FreeBSD Network buffer allocation.
  */
 static void
 mbuf_init(void *dummy)
 {
 
 	/*
--- 65 unchanged lines hidden ---
 #endif
 	    UMA_ALIGN_PTR, 0);
 	uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
 	if (nmbjumbo16 > 0)
 		nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
 	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
 
+	zone_extpgs = uma_zcreate(MBUF_EXTPGS_MEM_NAME,
+	    sizeof(struct mbuf_ext_pgs),
+#ifdef INVARIANTS
+	    trash_ctor, trash_dtor, trash_init, trash_fini,
+#else
+	    NULL, NULL, NULL, NULL,
+#endif
+	    UMA_ALIGN_CACHE, 0);
+
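Note: under INVARIANTS the new zone is created with UMA's trash_ctor,
trash_dtor, trash_init, and trash_fini hooks, which fill free items with a
junk pattern and verify that pattern on reallocation to catch use-after-free
and writes to freed memory; non-debug kernels pass NULL and skip that
overhead.  UMA_ALIGN_CACHE starts each 256-byte descriptor on a cache-line
boundary, so separate descriptors never share a cache line.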
 	/*
 	 * Hook event handler for low-memory situation, used to
 	 * drain protocols and push data back to the caches (UMA
 	 * later pushes it back to VM).
 	 */
 	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
 	    EVENTHANDLER_PRI_FIRST);
 
--- 429 unchanged lines hidden ---
 
 	for (dp = domains; dp != NULL; dp = dp->dom_next)
 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
 			if (pr->pr_drain != NULL)
 				(*pr->pr_drain)();
 }
 
 /*
+ * Free "count" units of I/O from an mbuf chain.  They could be held
+ * in EXT_PGS or just as a normal mbuf.  This code is intended to be
+ * called in an error path (I/O error, closed connection, etc.).
+ */
+void
+mb_free_notready(struct mbuf *m, int count)
+{
+	int i;
+
+	for (i = 0; i < count && m != NULL; i++) {
+		if ((m->m_flags & M_EXT) != 0 &&
+		    m->m_ext.ext_type == EXT_PGS) {
+			m->m_ext.ext_pgs->nrdy--;
+			if (m->m_ext.ext_pgs->nrdy != 0)
+				continue;
+		}
+		m = m_free(m);
+	}
+	KASSERT(i == count, ("Removed only %d items from %p", i, m));
+}
+
+/*
+ * Compress an unmapped mbuf into a simple mbuf when it holds a small
+ * amount of data.  This is used as a DoS defense to avoid having
+ * small packets tie up wired pages, an ext_pgs structure, and an
+ * mbuf.  Since this converts the existing mbuf in place, it can only
+ * be used if there are no other references to 'm'.
+ */
+int
+mb_unmapped_compress(struct mbuf *m)
+{
+	volatile u_int *refcnt;
+	struct mbuf m_temp;
+
+	/*
+	 * Assert that 'm' does not have a packet header.  If 'm' had
+	 * a packet header, it would only be able to hold MHLEN bytes
+	 * and m_data would have to be initialized differently.
+	 */
+	KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) &&
+	    m->m_ext.ext_type == EXT_PGS,
+	    ("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m));
+	KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
+
+	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+		refcnt = &m->m_ext.ext_count;
+	} else {
+		KASSERT(m->m_ext.ext_cnt != NULL,
+		    ("%s: no refcounting pointer on %p", __func__, m));
+		refcnt = m->m_ext.ext_cnt;
+	}
+
+	if (*refcnt != 1)
+		return (EBUSY);
+
+	/*
+	 * Copy mbuf header and m_ext portion of 'm' to 'm_temp' to
+	 * create a "fake" EXT_PGS mbuf that can be used with
+	 * m_copydata() as well as the ext_free callback.
+	 */
+	memcpy(&m_temp, m, offsetof(struct mbuf, m_ext) + sizeof (m->m_ext));
+	m_temp.m_next = NULL;
+	m_temp.m_nextpkt = NULL;
+
+	/* Turn 'm' into a "normal" mbuf. */
+	m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP);
+	m->m_data = m->m_dat;
+
+	/* Copy data from template's ext_pgs. */
+	m_copydata(&m_temp, 0, m_temp.m_len, mtod(m, caddr_t));
+
+	/* Free the backing pages. */
+	m_temp.m_ext.ext_free(&m_temp);
+
+	/* Finally, free the ext_pgs struct. */
+	uma_zfree(zone_extpgs, m_temp.m_ext.ext_pgs);
+	return (0);
+}
+
+/*
+ * These next few routines are used to permit downgrading an unmapped
+ * mbuf to a chain of mapped mbufs.  This is used when an interface
+ * doesn't support unmapped mbufs or if checksums need to be
+ * computed in software.
+ *
+ * Each unmapped mbuf is converted to a chain of mbufs.  First, any
+ * TLS header data is stored in a regular mbuf.  Second, each page of
+ * unmapped data is stored in an mbuf with an EXT_SFBUF external
+ * cluster.  These mbufs use an sf_buf to provide a valid KVA for the
+ * associated physical page.  They also hold a reference on the
+ * original EXT_PGS mbuf to ensure the physical page doesn't go away.
+ * Finally, any TLS trailer data is stored in a regular mbuf.
+ *
+ * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
+ * mbufs.  It frees the associated sf_buf and releases its reference
+ * on the original EXT_PGS mbuf.
+ *
+ * _mb_unmapped_to_ext() is a helper function that converts a single
+ * unmapped mbuf into a chain of mbufs.
+ *
+ * mb_unmapped_to_ext() is the public function that walks an mbuf
+ * chain converting any unmapped mbufs to mapped mbufs.  It returns
+ * the new chain of mapped mbufs on success.  On failure it frees
+ * the original mbuf chain and returns NULL.
+ */
+static void
+mb_unmapped_free_mext(struct mbuf *m)
+{
+	struct sf_buf *sf;
+	struct mbuf *old_m;
+
+	sf = m->m_ext.ext_arg1;
+	sf_buf_free(sf);
+
+	/* Drop the reference on the backing EXT_PGS mbuf. */
+	old_m = m->m_ext.ext_arg2;
+	mb_free_ext(old_m);
+}
+
+static struct mbuf *
+_mb_unmapped_to_ext(struct mbuf *m)
+{
+	struct mbuf_ext_pgs *ext_pgs;
+	struct mbuf *m_new, *top, *prev, *mref;
+	struct sf_buf *sf;
+	vm_page_t pg;
+	int i, len, off, pglen, pgoff, seglen, segoff;
+	volatile u_int *refcnt;
+	u_int ref_inc = 0;
+
+	MBUF_EXT_PGS_ASSERT(m);
+	ext_pgs = m->m_ext.ext_pgs;
+	len = m->m_len;
+	KASSERT(ext_pgs->tls == NULL, ("%s: can't convert TLS mbuf %p",
+	    __func__, m));
+
+	/* See if this is the mbuf that holds the embedded refcount. */
+	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+		refcnt = &m->m_ext.ext_count;
+		mref = m;
+	} else {
+		KASSERT(m->m_ext.ext_cnt != NULL,
+		    ("%s: no refcounting pointer on %p", __func__, m));
+		refcnt = m->m_ext.ext_cnt;
+		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
+	}
+
+	/* Skip over any data removed from the front. */
+	off = mtod(m, vm_offset_t);
+
+	top = NULL;
+	if (ext_pgs->hdr_len != 0) {
+		if (off >= ext_pgs->hdr_len) {
+			off -= ext_pgs->hdr_len;
+		} else {
+			seglen = ext_pgs->hdr_len - off;
+			segoff = off;
+			seglen = min(seglen, len);
+			off = 0;
+			len -= seglen;
+			m_new = m_get(M_NOWAIT, MT_DATA);
+			if (m_new == NULL)
+				goto fail;
+			m_new->m_len = seglen;
+			prev = top = m_new;
+			memcpy(mtod(m_new, void *), &ext_pgs->hdr[segoff],
+			    seglen);
+		}
+	}
+	pgoff = ext_pgs->first_pg_off;
+	for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+		if (off >= pglen) {
+			off -= pglen;
+			pgoff = 0;
+			continue;
+		}
+		seglen = pglen - off;
+		segoff = pgoff + off;
+		off = 0;
+		seglen = min(seglen, len);
+		len -= seglen;
+
+		pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+		m_new = m_get(M_NOWAIT, MT_DATA);
+		if (m_new == NULL)
+			goto fail;
+		if (top == NULL) {
+			top = prev = m_new;
+		} else {
+			prev->m_next = m_new;
+			prev = m_new;
+		}
+		sf = sf_buf_alloc(pg, SFB_NOWAIT);
+		if (sf == NULL)
+			goto fail;
+
+		ref_inc++;
+		m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
+		    mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
+		m_new->m_data += segoff;
+		m_new->m_len = seglen;
+
+		pgoff = 0;
+	}
+	if (len != 0) {
+		KASSERT((off + len) <= ext_pgs->trail_len,
+		    ("off + len > trail (%d + %d > %d)", off, len,
+		    ext_pgs->trail_len));
+		m_new = m_get(M_NOWAIT, MT_DATA);
+		if (m_new == NULL)
+			goto fail;
+		if (top == NULL)
+			top = m_new;
+		else
+			prev->m_next = m_new;
+		m_new->m_len = len;
+		memcpy(mtod(m_new, void *), &ext_pgs->trail[off], len);
+	}
+
+	if (ref_inc != 0) {
+		/*
+		 * Obtain an additional reference on the old mbuf for
+		 * each created EXT_SFBUF mbuf.  They will be dropped
+		 * in mb_unmapped_free_mext().
+		 */
+		if (*refcnt == 1)
+			*refcnt += ref_inc;
+		else
+			atomic_add_int(refcnt, ref_inc);
+	}
+	m_free(m);
+	return (top);
+
+fail:
+	if (ref_inc != 0) {
+		/*
+		 * Obtain an additional reference on the old mbuf for
+		 * each created EXT_SFBUF mbuf.  They will be
+		 * immediately dropped when these mbufs are freed
+		 * below.
+		 */
+		if (*refcnt == 1)
+			*refcnt += ref_inc;
+		else
+			atomic_add_int(refcnt, ref_inc);
+	}
+	m_free(m);
+	m_freem(top);
+	return (NULL);
+}
+
+struct mbuf *
+mb_unmapped_to_ext(struct mbuf *top)
+{
+	struct mbuf *m, *next, *prev;
+
+	prev = NULL;
+	for (m = top; m != NULL; m = next) {
+		/* m might be freed, so cache the next pointer. */
+		next = m->m_next;
+		if (m->m_flags & M_NOMAP) {
+			if (prev != NULL) {
+				/*
+				 * Remove 'm' from the new chain so
+				 * that the 'top' chain terminates
+				 * before 'm' in case 'top' is freed
+				 * due to an error.
+				 */
+				prev->m_next = NULL;
+			}
+			m = _mb_unmapped_to_ext(m);
+			if (m == NULL) {
+				m_freem(top);
+				m_freem(next);
+				return (NULL);
+			}
+			if (prev == NULL) {
+				top = m;
+			} else {
+				prev->m_next = m;
+			}
+
+			/*
+			 * Replaced one mbuf with a chain, so we must
+			 * find the end of chain.
+			 */
+			prev = m_last(m);
+		} else {
+			if (prev != NULL) {
+				prev->m_next = m;
+			}
+			prev = m;
+		}
+	}
+	return (top);
+}
+
+/*
+ * Allocate an empty EXT_PGS mbuf.  The ext_free routine is
+ * responsible for freeing any pages backing this mbuf when it is
+ * freed.
+ */
+struct mbuf *
+mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ext_free)
+{
+	struct mbuf *m;
+	struct mbuf_ext_pgs *ext_pgs;
+
+	if (pkthdr)
+		m = m_gethdr(how, MT_DATA);
+	else
+		m = m_get(how, MT_DATA);
+	if (m == NULL)
+		return (NULL);
+
+	ext_pgs = uma_zalloc(zone_extpgs, how);
+	if (ext_pgs == NULL) {
+		m_free(m);
+		return (NULL);
+	}
+	ext_pgs->npgs = 0;
+	ext_pgs->nrdy = 0;
+	ext_pgs->first_pg_off = 0;
+	ext_pgs->last_pg_len = 0;
+	ext_pgs->hdr_len = 0;
+	ext_pgs->trail_len = 0;
+	ext_pgs->tls = NULL;
+	ext_pgs->so = NULL;
+	m->m_data = NULL;
+	m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP);
+	m->m_ext.ext_type = EXT_PGS;
+	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
+	m->m_ext.ext_count = 1;
+	m->m_ext.ext_pgs = ext_pgs;
+	m->m_ext.ext_size = 0;
+	m->m_ext.ext_free = ext_free;
+	return (m);
+}
+
+#ifdef INVARIANT_SUPPORT
+void
+mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs)
+{
+
+	/*
+	 * NB: This expects a non-empty buffer (npgs > 0 and
+	 * last_pg_len > 0).
+	 */
+	KASSERT(ext_pgs->npgs > 0,
+	    ("ext_pgs with no valid pages: %p", ext_pgs));
+	KASSERT(ext_pgs->npgs <= nitems(ext_pgs->pa),
+	    ("ext_pgs with too many pages: %p", ext_pgs));
+	KASSERT(ext_pgs->nrdy <= ext_pgs->npgs,
+	    ("ext_pgs with too many ready pages: %p", ext_pgs));
+	KASSERT(ext_pgs->first_pg_off < PAGE_SIZE,
+	    ("ext_pgs with too large page offset: %p", ext_pgs));
+	KASSERT(ext_pgs->last_pg_len > 0,
+	    ("ext_pgs with zero last page length: %p", ext_pgs));
+	KASSERT(ext_pgs->last_pg_len <= PAGE_SIZE,
+	    ("ext_pgs with too large last page length: %p", ext_pgs));
+	if (ext_pgs->npgs == 1) {
+		KASSERT(ext_pgs->first_pg_off + ext_pgs->last_pg_len <=
+		    PAGE_SIZE, ("ext_pgs with single page too large: %p",
+		    ext_pgs));
+	}
+	KASSERT(ext_pgs->hdr_len <= sizeof(ext_pgs->hdr),
+	    ("ext_pgs with too large header length: %p", ext_pgs));
+	KASSERT(ext_pgs->trail_len <= sizeof(ext_pgs->trail),
+	    ("ext_pgs with too large trailer length: %p", ext_pgs));
+}
+#endif
+
+/*
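Note: the hunk above defines the allocator but shows no caller.  The
following sketch, which is not part of the commit, shows one plausible use:
wrapping a single wired page in an unmapped (M_NOMAP) mbuf, together with
the matching ext_free callback that the mb_alloc_ext_pgs() contract
requires.  The my_* names are hypothetical, the page is assumed to have come
from vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED), and the
not-ready (nrdy) machinery is left at its default of zero:

	/* Hypothetical ext_free callback; it owns the backing pages. */
	static void
	my_free_page(struct mbuf *m)
	{
		struct mbuf_ext_pgs *ext_pgs;
		vm_page_t pg;
		int i;

		ext_pgs = m->m_ext.ext_pgs;
		for (i = 0; i < ext_pgs->npgs; i++) {
			pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
			if (vm_page_unwire_noq(pg))
				vm_page_free(pg);
		}
	}

	/* Wrap 'len' bytes of one wired page in an unmapped mbuf. */
	static struct mbuf *
	my_wrap_page(vm_page_t pg, int len)
	{
		struct mbuf *m;
		struct mbuf_ext_pgs *ext_pgs;

		KASSERT(len > 0 && len <= PAGE_SIZE,
		    ("%s: bad length %d", __func__, len));
		m = mb_alloc_ext_pgs(M_NOWAIT, false, my_free_page);
		if (m == NULL)
			return (NULL);
		ext_pgs = m->m_ext.ext_pgs;
		ext_pgs->pa[0] = VM_PAGE_TO_PHYS(pg);
		ext_pgs->npgs = 1;
		ext_pgs->last_pg_len = len;	/* satisfies mb_ext_pgs_check() */
		m->m_len = len;
		return (m);
	}

mb_unmapped_compress() above shows the teardown order this contract implies:
ext_free releases the backing pages first, then the descriptor goes back to
zone_extpgs.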
  * Clean up after mbufs with M_EXT storage attached to them if the
  * reference count hits 1.
  */
 void
 mb_free_ext(struct mbuf *m)
 {
 	volatile u_int *refcnt;
 	struct mbuf *mref;
--- 48 unchanged lines hidden ---
 	case EXT_JUMBO9:
 		uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
 		uma_zfree(zone_mbuf, mref);
 		break;
 	case EXT_JUMBO16:
 		uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
 		uma_zfree(zone_mbuf, mref);
 		break;
+	case EXT_PGS:
+		uma_zfree(zone_extpgs, mref->m_ext.ext_pgs);
+		uma_zfree(zone_mbuf, mref);
+		break;
 	case EXT_SFBUF:
 	case EXT_NET_DRV:
 	case EXT_MOD_TYPE:
 	case EXT_DISPOSABLE:
 		KASSERT(mref->m_ext.ext_free != NULL,
 		    ("%s: ext_free not set", __func__));
 		mref->m_ext.ext_free(mref);
 		uma_zfree(zone_mbuf, mref);
--- 284 unchanged lines hidden ---
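Note: a sketch of the consumer that the comment block above
mb_unmapped_free_mext() anticipates, namely a transmit routine for hardware
that cannot take unmapped mbufs.  This is not from the commit;
my_hw_enqueue() is an assumed driver helper, and the rest follows the
documented contract that mb_unmapped_to_ext() frees the chain itself on
failure:

	static int	my_hw_enqueue(struct ifnet *, struct mbuf *);	/* assumed */

	static int
	my_if_transmit(struct ifnet *ifp, struct mbuf *m)
	{
		struct mbuf *n;

		for (n = m; n != NULL; n = n->m_next) {
			if ((n->m_flags & M_NOMAP) != 0) {
				/*
				 * Downgrade the whole chain to mapped
				 * mbufs; on failure it is already freed.
				 */
				m = mb_unmapped_to_ext(m);
				if (m == NULL)
					return (ENOMEM);
				break;
			}
		}
		return (my_hw_enqueue(ifp, m));
	}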