/*-
 * Copyright (c) 2010-2011 Alexander V. Chernikov <melifaro@ipfw.ru>
 * Copyright (c) 2004-2005 Gleb Smirnoff <glebius@FreeBSD.org>
 * Copyright (c) 2001-2003 Roman V. Palagin <romanp@unshadow.net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $SourceForge: netflow.c,v 1.41 2004/09/05 11:41:10 glebius Exp $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include "opt_route.h"
#include <sys/param.h>
#include <sys/bitstring.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/socket.h>
#include <vm/uma.h>

#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/ethernet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>

#include <netgraph/ng_message.h>
#include <netgraph/netgraph.h>

#include <netgraph/netflow/netflow.h>
#include <netgraph/netflow/netflow_v9.h>
#include <netgraph/netflow/ng_netflow.h>

#define	NBUCKETS	(65536)		/* must be power of 2 */

/* This hash is for TCP or UDP packets. */
#define FULL_HASH(addr1, addr2, port1, port2)	\
	(((addr1 ^ (addr1 >> 16) ^		\
	htons(addr2 ^ (addr2 >> 16))) ^		\
	port1 ^ htons(port2)) &			\
	(NBUCKETS - 1))

/* This hash is for all other IP packets. */
#define ADDR_HASH(addr1, addr2)			\
	((addr1 ^ (addr1 >> 16) ^		\
	htons(addr2 ^ (addr2 >> 16))) &		\
	(NBUCKETS - 1))
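
/*
 * Flows are spread over NBUCKETS chains: both hash macros fold the upper
 * and lower 16 bits of each address (and, for TCP/UDP, the ports) together
 * with XOR and mask the result with (NBUCKETS - 1), which is why NBUCKETS
 * must stay a power of 2.
 */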

/* Macros to shorten logical constructions */
/* XXX: priv must exist in namespace */
#define	INACTIVE(fle)	(time_uptime - fle->f.last > priv->nfinfo_inact_t)
#define	AGED(fle)	(time_uptime - fle->f.first > priv->nfinfo_act_t)
#define	ISFREE(fle)	(fle->f.packets == 0)

/*
 * 4 is a magical number: statistically, the number of 4-packet flows is
 * bigger than the number of 5-, 6-, 7-...-packet flows by an order of
 * magnitude. Most UDP/ICMP scans are 1 packet (~90% of the flow cache).
 * TCP scans are 2-packet in case of a reachable host and 4-packet otherwise.
 */
#define	SMALL(fle)	(fle->f.packets <= 4)

MALLOC_DEFINE(M_NETFLOW_HASH, "netflow_hash", "NetFlow hash");

static int export_add(item_p, struct flow_entry *);
static int export_send(priv_p, fib_export_p, item_p, int);

static int hash_insert(priv_p, struct flow_hash_entry *, struct flow_rec *,
    int, uint8_t, uint8_t);
#ifdef INET6
static int hash6_insert(priv_p, struct flow_hash_entry *, struct flow6_rec *,
    int, uint8_t, uint8_t);
#endif

static void expire_flow(priv_p, fib_export_p, struct flow_entry *, int);

/*
 * Generate hash for a given flow record.
 *
 * FIB is not used here because most VRFs will carry public IPv4 addresses,
 * which are unique even without the FIB; private addresses can overlap, but
 * this is handled by the flow_rec bcmp(), which includes the fib id. In the
 * IPv6 world addresses are all globally unique (not entirely true, there is
 * FC00::/7 for example, but the chances of address overlap are MUCH smaller).
 */
static inline uint32_t
ip_hash(struct flow_rec *r)
{

	switch (r->r_ip_p) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		return FULL_HASH(r->r_src.s_addr, r->r_dst.s_addr,
		    r->r_sport, r->r_dport);
	default:
		return ADDR_HASH(r->r_src.s_addr, r->r_dst.s_addr);
	}
}

#ifdef INET6
/* Generate hash for a given flow6 record. Use lower 4 octets from v6 addresses */
static inline uint32_t
ip6_hash(struct flow6_rec *r)
{

	switch (r->r_ip_p) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		return FULL_HASH(r->src.r_src6.__u6_addr.__u6_addr32[3],
		    r->dst.r_dst6.__u6_addr.__u6_addr32[3], r->r_sport,
		    r->r_dport);
	default:
		return ADDR_HASH(r->src.r_src6.__u6_addr.__u6_addr32[3],
		    r->dst.r_dst6.__u6_addr.__u6_addr32[3]);
	}
}

static inline int
ip6_masklen(struct in6_addr *saddr, struct rt_addrinfo *info)
{
	const int nbits = sizeof(*saddr) * NBBY;
	int mlen;

	if (info->rti_addrs & RTA_NETMASK)
		bit_count((bitstr_t *)saddr, 0, nbits, &mlen);
	else
		mlen = nbits;
	return (mlen);
}
#endif

/*
 * Detach export datagram from priv, if there is any.
 * If there is none, allocate a new one.
 */
static item_p
get_export_dgram(priv_p priv, fib_export_p fe)
{
	item_p item = NULL;

	mtx_lock(&fe->export_mtx);
	if (fe->exp.item != NULL) {
		item = fe->exp.item;
		fe->exp.item = NULL;
	}
	mtx_unlock(&fe->export_mtx);

	if (item == NULL) {
		struct netflow_v5_export_dgram *dgram;
		struct mbuf *m;

		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
		if (m == NULL)
			return (NULL);
		item = ng_package_data(m, NG_NOFLAGS);
		if (item == NULL)
			return (NULL);
		dgram = mtod(m, struct netflow_v5_export_dgram *);
		dgram->header.count = 0;
		dgram->header.version = htons(NETFLOW_V5);
		dgram->header.pad = 0;
	}

	return (item);
}

/*
 * Re-attach incomplete datagram back to priv.
 * If there is already another one, then send the incomplete one.
 */
static void
return_export_dgram(priv_p priv, fib_export_p fe, item_p item, int flags)
{

	/*
	 * It may happen on SMP that some thread has already
	 * put its item there; in this case we bail out and
	 * send what we have to the collector.
	 */
	mtx_lock(&fe->export_mtx);
	if (fe->exp.item == NULL) {
		fe->exp.item = item;
		mtx_unlock(&fe->export_mtx);
	} else {
		mtx_unlock(&fe->export_mtx);
		export_send(priv, fe, item, flags);
	}
}
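
/*
 * Between expirations the per-FIB structure keeps at most one partially
 * filled NetFlow v5 datagram: get_export_dgram() hands it out (or allocates
 * a fresh one), and return_export_dgram() puts it back unless another
 * thread raced us, in which case the datagram is flushed to the collector.
 */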

/*
 * The flow is over. Call export_add() and free it. If the datagram is
 * full, then call export_send().
 */
static void
expire_flow(priv_p priv, fib_export_p fe, struct flow_entry *fle, int flags)
{
	struct netflow_export_item exp;
	uint16_t version = fle->f.version;

	if ((priv->export != NULL) && (version == IPVERSION)) {
		exp.item = get_export_dgram(priv, fe);
		if (exp.item == NULL) {
			priv->nfinfo_export_failed++;
			if (priv->export9 != NULL)
				priv->nfinfo_export9_failed++;
			/* fle definitely contains IPv4 flow. */
			uma_zfree_arg(priv->zone, fle, priv);
			return;
		}

		if (export_add(exp.item, fle) > 0)
			export_send(priv, fe, exp.item, flags);
		else
			return_export_dgram(priv, fe, exp.item, NG_QUEUE);
	}

	if (priv->export9 != NULL) {
		exp.item9 = get_export9_dgram(priv, fe, &exp.item9_opt);
		if (exp.item9 == NULL) {
			priv->nfinfo_export9_failed++;
			if (version == IPVERSION)
				uma_zfree_arg(priv->zone, fle, priv);
#ifdef INET6
			else if (version == IP6VERSION)
				uma_zfree_arg(priv->zone6, fle, priv);
#endif
			else
				panic("ng_netflow: Unknown IP proto: %d",
				    version);
			return;
		}

		if (export9_add(exp.item9, exp.item9_opt, fle) > 0)
			export9_send(priv, fe, exp.item9, exp.item9_opt, flags);
		else
			return_export9_dgram(priv, fe, exp.item9,
			    exp.item9_opt, NG_QUEUE);
	}

	if (version == IPVERSION)
		uma_zfree_arg(priv->zone, fle, priv);
#ifdef INET6
	else if (version == IP6VERSION)
		uma_zfree_arg(priv->zone6, fle, priv);
#endif
}

/* Get a snapshot of node statistics */
void
ng_netflow_copyinfo(priv_p priv, struct ng_netflow_info *i)
{

	i->nfinfo_bytes = counter_u64_fetch(priv->nfinfo_bytes);
	i->nfinfo_packets = counter_u64_fetch(priv->nfinfo_packets);
	i->nfinfo_bytes6 = counter_u64_fetch(priv->nfinfo_bytes6);
	i->nfinfo_packets6 = counter_u64_fetch(priv->nfinfo_packets6);
	i->nfinfo_sbytes = counter_u64_fetch(priv->nfinfo_sbytes);
	i->nfinfo_spackets = counter_u64_fetch(priv->nfinfo_spackets);
	i->nfinfo_sbytes6 = counter_u64_fetch(priv->nfinfo_sbytes6);
	i->nfinfo_spackets6 = counter_u64_fetch(priv->nfinfo_spackets6);
	i->nfinfo_act_exp = counter_u64_fetch(priv->nfinfo_act_exp);
	i->nfinfo_inact_exp = counter_u64_fetch(priv->nfinfo_inact_exp);

	i->nfinfo_used = uma_zone_get_cur(priv->zone);
#ifdef INET6
	i->nfinfo_used6 = uma_zone_get_cur(priv->zone6);
#endif

	i->nfinfo_alloc_failed = priv->nfinfo_alloc_failed;
	i->nfinfo_export_failed = priv->nfinfo_export_failed;
	i->nfinfo_export9_failed = priv->nfinfo_export9_failed;
	i->nfinfo_realloc_mbuf = priv->nfinfo_realloc_mbuf;
	i->nfinfo_alloc_fibs = priv->nfinfo_alloc_fibs;
	i->nfinfo_inact_t = priv->nfinfo_inact_t;
	i->nfinfo_act_t = priv->nfinfo_act_t;
}
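
/*
 * Both hash_insert() and hash6_insert() below are called with the bucket
 * mutex held (see the mtx_assert()s), so the route table lookups that fill
 * in the output interface and prefix lengths also run under that mutex.
 */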

/*
 * Insert a record into the defined slot.
 *
 * First we get a free flow entry, then fill in all
 * of its possible fields.
 *
 * TODO: consider dropping the hash mutex while filling in the datagram,
 * as was done in the previous version. Need to test & profile
 * to be sure.
 */
static int
hash_insert(priv_p priv, struct flow_hash_entry *hsh, struct flow_rec *r,
    int plen, uint8_t flags, uint8_t tcp_flags)
{
	struct flow_entry *fle;
	struct sockaddr_in sin, sin_mask;
	struct sockaddr_dl rt_gateway;
	struct rt_addrinfo info;

	mtx_assert(&hsh->mtx, MA_OWNED);

	fle = uma_zalloc_arg(priv->zone, priv, M_NOWAIT);
	if (fle == NULL) {
		priv->nfinfo_alloc_failed++;
		return (ENOMEM);
	}

	/*
	 * Now fle is totally ours. It is detached from all lists,
	 * we can safely edit it.
	 */
	fle->f.version = IPVERSION;
	bcopy(r, &fle->f.r, sizeof(struct flow_rec));
	fle->f.bytes = plen;
	fle->f.packets = 1;
	fle->f.tcp_flags = tcp_flags;

	fle->f.first = fle->f.last = time_uptime;

	/*
	 * First we do a route table lookup on the destination address, so we
	 * can fill in out_ifx, dst_mask, nexthop, and (in future releases)
	 * dst_as.
	 */
	if ((flags & NG_NETFLOW_CONF_NODSTLOOKUP) == 0) {
		bzero(&sin, sizeof(sin));
		sin.sin_len = sizeof(struct sockaddr_in);
		sin.sin_family = AF_INET;
		sin.sin_addr = fle->f.r.r_dst;

		rt_gateway.sdl_len = sizeof(rt_gateway);
		sin_mask.sin_len = sizeof(struct sockaddr_in);
		bzero(&info, sizeof(info));

		info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway;
		info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&sin_mask;

		if (rib_lookup_info(r->fib, (struct sockaddr *)&sin, NHR_REF, 0,
		    &info) == 0) {
			fle->f.fle_o_ifx = info.rti_ifp->if_index;

			if (info.rti_flags & RTF_GATEWAY &&
			    rt_gateway.sdl_family == AF_INET)
				fle->f.next_hop =
				    ((struct sockaddr_in *)&rt_gateway)->sin_addr;

			if (info.rti_addrs & RTA_NETMASK)
				fle->f.dst_mask =
				    bitcount32(sin_mask.sin_addr.s_addr);
			else if (info.rti_flags & RTF_HOST)
				/* Give up. We can't determine mask :( */
				fle->f.dst_mask = 32;

			rib_free_info(&info);
		}
	}

	/* Do route lookup on source address, to fill in src_mask. */
	if ((flags & NG_NETFLOW_CONF_NOSRCLOOKUP) == 0) {
		bzero(&sin, sizeof(sin));
		sin.sin_len = sizeof(struct sockaddr_in);
		sin.sin_family = AF_INET;
		sin.sin_addr = fle->f.r.r_src;

		sin_mask.sin_len = sizeof(struct sockaddr_in);
		bzero(&info, sizeof(info));

		info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&sin_mask;

		if (rib_lookup_info(r->fib, (struct sockaddr *)&sin, 0, 0,
		    &info) == 0) {
			if (info.rti_addrs & RTA_NETMASK)
				fle->f.src_mask =
				    bitcount32(sin_mask.sin_addr.s_addr);
			else if (info.rti_flags & RTF_HOST)
				/* Give up. We can't determine mask :( */
				fle->f.src_mask = 32;
		}
	}

	/* Push the new flow to the end of the hash chain. */
	TAILQ_INSERT_TAIL(&hsh->head, fle, fle_hash);

	return (0);
}
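
/*
 * The IPv6 variant below mirrors hash_insert(): the same route lookups are
 * done through rib_lookup_info(), but prefix lengths are derived from the
 * returned netmask with ip6_masklen() instead of bitcount32().
 */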

#ifdef INET6
static int
hash6_insert(priv_p priv, struct flow_hash_entry *hsh6, struct flow6_rec *r,
    int plen, uint8_t flags, uint8_t tcp_flags)
{
	struct flow6_entry *fle6;
	struct sockaddr_in6 sin6, sin6_mask;
	struct sockaddr_dl rt_gateway;
	struct rt_addrinfo info;

	mtx_assert(&hsh6->mtx, MA_OWNED);

	fle6 = uma_zalloc_arg(priv->zone6, priv, M_NOWAIT);
	if (fle6 == NULL) {
		priv->nfinfo_alloc_failed++;
		return (ENOMEM);
	}

	/*
	 * Now fle6 is totally ours. It is detached from all lists,
	 * we can safely edit it.
	 */

	fle6->f.version = IP6VERSION;
	bcopy(r, &fle6->f.r, sizeof(struct flow6_rec));
	fle6->f.bytes = plen;
	fle6->f.packets = 1;
	fle6->f.tcp_flags = tcp_flags;

	fle6->f.first = fle6->f.last = time_uptime;

	/*
	 * First we do a route table lookup on the destination address, so we
	 * can fill in out_ifx, dst_mask, nexthop, and (in future releases)
	 * dst_as.
	 */
	if ((flags & NG_NETFLOW_CONF_NODSTLOOKUP) == 0) {
		bzero(&sin6, sizeof(struct sockaddr_in6));
		sin6.sin6_len = sizeof(struct sockaddr_in6);
		sin6.sin6_family = AF_INET6;
		sin6.sin6_addr = r->dst.r_dst6;

		rt_gateway.sdl_len = sizeof(rt_gateway);
		sin6_mask.sin6_len = sizeof(struct sockaddr_in6);
		bzero(&info, sizeof(info));

		info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway;
		info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&sin6_mask;

		if (rib_lookup_info(r->fib, (struct sockaddr *)&sin6, NHR_REF,
		    0, &info) == 0) {
			fle6->f.fle_o_ifx = info.rti_ifp->if_index;

			if (info.rti_flags & RTF_GATEWAY &&
			    rt_gateway.sdl_family == AF_INET6)
				fle6->f.n.next_hop6 =
				    ((struct sockaddr_in6 *)&rt_gateway)->sin6_addr;

			fle6->f.dst_mask =
			    ip6_masklen(&sin6_mask.sin6_addr, &info);

			rib_free_info(&info);
		}
	}

	if ((flags & NG_NETFLOW_CONF_NOSRCLOOKUP) == 0) {
		/* Do route lookup on source address, to fill in src_mask. */
		bzero(&sin6, sizeof(struct sockaddr_in6));
		sin6.sin6_len = sizeof(struct sockaddr_in6);
		sin6.sin6_family = AF_INET6;
		sin6.sin6_addr = r->src.r_src6;

		sin6_mask.sin6_len = sizeof(struct sockaddr_in6);
		bzero(&info, sizeof(info));

		info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&sin6_mask;

		if (rib_lookup_info(r->fib, (struct sockaddr *)&sin6, 0, 0,
		    &info) == 0)
			fle6->f.src_mask =
			    ip6_masklen(&sin6_mask.sin6_addr, &info);
	}

	/* Push the new flow to the end of the hash chain. */
	TAILQ_INSERT_TAIL(&hsh6->head, (struct flow_entry *)fle6, fle_hash);

	return (0);
}
#endif
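
/*
 * Rough lifecycle of the cache: ng_netflow_cache_init() allocates the
 * zones and hash tables, ng_netflow_flow_add()/ng_netflow_flow6_add()
 * run for every captured packet, ng_netflow_expire() runs periodically
 * from a callout, and ng_netflow_cache_flush() exports whatever is left
 * and frees everything when the node is closed.
 */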

/*
 * Non-static functions called from ng_netflow.c
 */

/* Allocate memory and set up flow cache */
void
ng_netflow_cache_init(priv_p priv)
{
	struct flow_hash_entry *hsh;
	int i;

	/* Initialize cache UMA zone. */
	priv->zone = uma_zcreate("NetFlow IPv4 cache",
	    sizeof(struct flow_entry), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_CACHE, 0);
	uma_zone_set_max(priv->zone, CACHESIZE);
#ifdef INET6
	priv->zone6 = uma_zcreate("NetFlow IPv6 cache",
	    sizeof(struct flow6_entry), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_CACHE, 0);
	uma_zone_set_max(priv->zone6, CACHESIZE);
#endif

	/* Allocate hash. */
	priv->hash = malloc(NBUCKETS * sizeof(struct flow_hash_entry),
	    M_NETFLOW_HASH, M_WAITOK | M_ZERO);

	/* Initialize hash. */
	for (i = 0, hsh = priv->hash; i < NBUCKETS; i++, hsh++) {
		mtx_init(&hsh->mtx, "hash mutex", NULL, MTX_DEF);
		TAILQ_INIT(&hsh->head);
	}

#ifdef INET6
	/* Allocate hash. */
	priv->hash6 = malloc(NBUCKETS * sizeof(struct flow_hash_entry),
	    M_NETFLOW_HASH, M_WAITOK | M_ZERO);

	/* Initialize hash. */
	for (i = 0, hsh = priv->hash6; i < NBUCKETS; i++, hsh++) {
		mtx_init(&hsh->mtx, "hash mutex", NULL, MTX_DEF);
		TAILQ_INIT(&hsh->head);
	}
#endif

	priv->nfinfo_bytes = counter_u64_alloc(M_WAITOK);
	priv->nfinfo_packets = counter_u64_alloc(M_WAITOK);
	priv->nfinfo_bytes6 = counter_u64_alloc(M_WAITOK);
	priv->nfinfo_packets6 = counter_u64_alloc(M_WAITOK);
	priv->nfinfo_sbytes = counter_u64_alloc(M_WAITOK);
	priv->nfinfo_spackets = counter_u64_alloc(M_WAITOK);
	priv->nfinfo_sbytes6 = counter_u64_alloc(M_WAITOK);
	priv->nfinfo_spackets6 = counter_u64_alloc(M_WAITOK);
	priv->nfinfo_act_exp = counter_u64_alloc(M_WAITOK);
	priv->nfinfo_inact_exp = counter_u64_alloc(M_WAITOK);

	ng_netflow_v9_cache_init(priv);
	CTR0(KTR_NET, "ng_netflow startup()");
}
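
/*
 * Registration of the per-FIB export state is lock-free: the new
 * fib_export structure is published with an atomic compare-and-swap,
 * and if another context won the race, the local copy is simply torn
 * down again.
 */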

/* Initialize new FIB table for v5 and v9 */
int
ng_netflow_fib_init(priv_p priv, int fib)
{
	fib_export_p fe = priv_to_fib(priv, fib);

	CTR1(KTR_NET, "ng_netflow(): fib init: %d", fib);

	if (fe != NULL)
		return (0);

	if ((fe = malloc(sizeof(struct fib_export), M_NETGRAPH,
	    M_NOWAIT | M_ZERO)) == NULL)
		return (ENOMEM);

	mtx_init(&fe->export_mtx, "export dgram lock", NULL, MTX_DEF);
	mtx_init(&fe->export9_mtx, "export9 dgram lock", NULL, MTX_DEF);
	fe->fib = fib;
	fe->domain_id = fib;

	if (atomic_cmpset_ptr((volatile uintptr_t *)&priv->fib_data[fib],
	    (uintptr_t)NULL, (uintptr_t)fe) == 0) {
		/* FIB already set up by another ISR. */
		CTR3(KTR_NET, "ng_netflow(): fib init: %d setup %p but got %p",
		    fib, fe, priv_to_fib(priv, fib));
		mtx_destroy(&fe->export_mtx);
		mtx_destroy(&fe->export9_mtx);
		free(fe, M_NETGRAPH);
	} else {
		/* Increase counter for statistics */
		CTR3(KTR_NET, "ng_netflow(): fib %d setup to %p (%p)",
		    fib, fe, priv_to_fib(priv, fib));
		priv->nfinfo_alloc_fibs++;
	}

	return (0);
}

/* Free all flow cache memory. Called from node close method. */
void
ng_netflow_cache_flush(priv_p priv)
{
	struct flow_entry *fle, *fle1;
	struct flow_hash_entry *hsh;
	struct netflow_export_item exp;
	fib_export_p fe;
	int i;

	bzero(&exp, sizeof(exp));

	/*
	 * We are going to free probably billable data.
	 * Expire everything before freeing it.
	 * No locking is required since the callout is already drained.
	 */
	for (hsh = priv->hash, i = 0; i < NBUCKETS; hsh++, i++)
		TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) {
			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
			fe = priv_to_fib(priv, fle->f.r.fib);
			expire_flow(priv, fe, fle, NG_QUEUE);
		}
#ifdef INET6
	for (hsh = priv->hash6, i = 0; i < NBUCKETS; hsh++, i++)
		TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) {
			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
			fe = priv_to_fib(priv, fle->f.r.fib);
			expire_flow(priv, fe, fle, NG_QUEUE);
		}
#endif

	uma_zdestroy(priv->zone);
	/* Destroy hash mutexes. */
	for (i = 0, hsh = priv->hash; i < NBUCKETS; i++, hsh++)
		mtx_destroy(&hsh->mtx);

	/* Free hash memory. */
	if (priv->hash != NULL)
		free(priv->hash, M_NETFLOW_HASH);
#ifdef INET6
	uma_zdestroy(priv->zone6);
	/* Destroy hash mutexes. */
	for (i = 0, hsh = priv->hash6; i < NBUCKETS; i++, hsh++)
		mtx_destroy(&hsh->mtx);

	/* Free hash memory. */
	if (priv->hash6 != NULL)
		free(priv->hash6, M_NETFLOW_HASH);
#endif

	for (i = 0; i < priv->maxfibs; i++) {
		if ((fe = priv_to_fib(priv, i)) == NULL)
			continue;

		if (fe->exp.item != NULL)
			export_send(priv, fe, fe->exp.item, NG_QUEUE);

		if (fe->exp.item9 != NULL)
			export9_send(priv, fe, fe->exp.item9,
			    fe->exp.item9_opt, NG_QUEUE);

		mtx_destroy(&fe->export_mtx);
		mtx_destroy(&fe->export9_mtx);
		free(fe, M_NETGRAPH);
	}

	counter_u64_free(priv->nfinfo_bytes);
	counter_u64_free(priv->nfinfo_packets);
	counter_u64_free(priv->nfinfo_bytes6);
	counter_u64_free(priv->nfinfo_packets6);
	counter_u64_free(priv->nfinfo_sbytes);
	counter_u64_free(priv->nfinfo_spackets);
	counter_u64_free(priv->nfinfo_sbytes6);
	counter_u64_free(priv->nfinfo_spackets6);
	counter_u64_free(priv->nfinfo_act_exp);
	counter_u64_free(priv->nfinfo_inact_exp);

	ng_netflow_v9_cache_flush(priv);
}
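
/*
 * Per-packet fast path: build a flow_rec key from the packet headers,
 * hash it to a bucket, and walk that bucket's list from the tail (most
 * recently used entries first), expiring stale entries along the way.
 * A match updates the existing flow; otherwise a new entry is inserted
 * with hash_insert().
 */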

/* Insert a packet into the flow cache. */
int
ng_netflow_flow_add(priv_p priv, fib_export_p fe, struct ip *ip,
    caddr_t upper_ptr, uint8_t upper_proto, uint8_t flags,
    unsigned int src_if_index)
{
	struct flow_entry *fle, *fle1;
	struct flow_hash_entry *hsh;
	struct flow_rec r;
	int hlen, plen;
	int error = 0;
	uint16_t eproto;
	uint8_t tcp_flags = 0;

	bzero(&r, sizeof(r));

	if (ip->ip_v != IPVERSION)
		return (EINVAL);

	hlen = ip->ip_hl << 2;
	if (hlen < sizeof(struct ip))
		return (EINVAL);

	eproto = ETHERTYPE_IP;
	/* Assume L4 template by default */
	r.flow_type = NETFLOW_V9_FLOW_V4_L4;

	r.r_src = ip->ip_src;
	r.r_dst = ip->ip_dst;
	r.fib = fe->fib;

	plen = ntohs(ip->ip_len);

	r.r_ip_p = ip->ip_p;
	r.r_tos = ip->ip_tos;

	r.r_i_ifx = src_if_index;

	/*
	 * XXX NOTE: only the first fragment of a fragmented TCP, UDP or
	 * ICMP packet will be recorded with proper s_port and d_port.
	 * Following fragments will be recorded simply as IP packets with
	 * ip_proto = ip->ip_p and s_port, d_port set to zero.
	 * I know it looks like a bug. But I don't want to re-implement
	 * IP packet reassembly here. Anyway, the (in)famous trafd works
	 * this way - and nobody has complained yet :)
	 */
	if ((ip->ip_off & htons(IP_OFFMASK)) == 0)
		switch (r.r_ip_p) {
		case IPPROTO_TCP:
		{
			struct tcphdr *tcp;

			tcp = (struct tcphdr *)((caddr_t)ip + hlen);
			r.r_sport = tcp->th_sport;
			r.r_dport = tcp->th_dport;
			tcp_flags = tcp->th_flags;
			break;
		}
		case IPPROTO_UDP:
			r.r_ports = *(uint32_t *)((caddr_t)ip + hlen);
			break;
		}

	counter_u64_add(priv->nfinfo_packets, 1);
	counter_u64_add(priv->nfinfo_bytes, plen);

	/* Find hash slot. */
	hsh = &priv->hash[ip_hash(&r)];

	mtx_lock(&hsh->mtx);

	/*
	 * Go through the hash and find our entry. If we encounter an
	 * entry that should be expired, purge it. We do a reverse
	 * search since most active entries are first, and most
	 * searches are done on most active entries.
	 */
	TAILQ_FOREACH_REVERSE_SAFE(fle, &hsh->head, fhead, fle_hash, fle1) {
		if (bcmp(&r, &fle->f.r, sizeof(struct flow_rec)) == 0)
			break;
		if ((INACTIVE(fle) && SMALL(fle)) || AGED(fle)) {
			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
			expire_flow(priv, priv_to_fib(priv, fle->f.r.fib),
			    fle, NG_QUEUE);
			counter_u64_add(priv->nfinfo_act_exp, 1);
		}
	}

	if (fle) {			/* An existing entry. */

		fle->f.bytes += plen;
		fle->f.packets++;
		fle->f.tcp_flags |= tcp_flags;
		fle->f.last = time_uptime;

		/*
		 * We have the following reasons to expire a flow actively:
		 * - it hit the active timeout
		 * - a TCP connection closed
		 * - it is going to overflow the counter
		 */
		if (tcp_flags & TH_FIN || tcp_flags & TH_RST || AGED(fle) ||
		    (fle->f.bytes >= (CNTR_MAX - IF_MAXMTU))) {
			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
			expire_flow(priv, priv_to_fib(priv, fle->f.r.fib),
			    fle, NG_QUEUE);
			counter_u64_add(priv->nfinfo_act_exp, 1);
		} else {
			/*
			 * It is the newest, move it to the tail,
			 * if it isn't there already. Next search will
			 * locate it quicker.
			 */
			if (fle != TAILQ_LAST(&hsh->head, fhead)) {
				TAILQ_REMOVE(&hsh->head, fle, fle_hash);
				TAILQ_INSERT_TAIL(&hsh->head, fle, fle_hash);
			}
		}
	} else				/* A new flow entry. */
		error = hash_insert(priv, hsh, &r, plen, flags, tcp_flags);

	mtx_unlock(&hsh->mtx);

	return (error);
}
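
/*
 * The IPv6 path below differs from the IPv4 one in that the upper-layer
 * protocol and a pointer to its header arrive pre-parsed in upper_proto
 * and upper_ptr (presumably extracted from the extension header chain by
 * the caller in ng_netflow.c), and fragments are detected via the
 * NG_NETFLOW_IS_FRAG flag rather than by looking at the IP header.
 */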

#ifdef INET6
/* Insert an IPv6 packet into the flow cache. */
int
ng_netflow_flow6_add(priv_p priv, fib_export_p fe, struct ip6_hdr *ip6,
    caddr_t upper_ptr, uint8_t upper_proto, uint8_t flags,
    unsigned int src_if_index)
{
	struct flow_entry *fle = NULL, *fle1;
	struct flow6_entry *fle6;
	struct flow_hash_entry *hsh;
	struct flow6_rec r;
	int plen;
	int error = 0;
	uint8_t tcp_flags = 0;

	/* check version */
	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION)
		return (EINVAL);

	bzero(&r, sizeof(r));

	r.src.r_src6 = ip6->ip6_src;
	r.dst.r_dst6 = ip6->ip6_dst;
	r.fib = fe->fib;

	/* Assume L4 template by default */
	r.flow_type = NETFLOW_V9_FLOW_V6_L4;

	plen = ntohs(ip6->ip6_plen) + sizeof(struct ip6_hdr);

#if 0
	/* XXX: set DSCP/CoS value */
	r.r_tos = ip->ip_tos;
#endif
	if ((flags & NG_NETFLOW_IS_FRAG) == 0) {
		switch (upper_proto) {
		case IPPROTO_TCP:
		{
			struct tcphdr *tcp;

			tcp = (struct tcphdr *)upper_ptr;
			r.r_ports = *(uint32_t *)upper_ptr;
			tcp_flags = tcp->th_flags;
			break;
		}
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
			r.r_ports = *(uint32_t *)upper_ptr;
			break;
		}
	}

	r.r_ip_p = upper_proto;
	r.r_i_ifx = src_if_index;

	counter_u64_add(priv->nfinfo_packets6, 1);
	counter_u64_add(priv->nfinfo_bytes6, plen);

	/* Find hash slot. */
	hsh = &priv->hash6[ip6_hash(&r)];

	mtx_lock(&hsh->mtx);

	/*
	 * Go through the hash and find our entry. If we encounter an
	 * entry that should be expired, purge it. We do a reverse
	 * search since most active entries are first, and most
	 * searches are done on most active entries.
	 */
	TAILQ_FOREACH_REVERSE_SAFE(fle, &hsh->head, fhead, fle_hash, fle1) {
		if (fle->f.version != IP6VERSION)
			continue;
		fle6 = (struct flow6_entry *)fle;
		if (bcmp(&r, &fle6->f.r, sizeof(struct flow6_rec)) == 0)
			break;
		if ((INACTIVE(fle6) && SMALL(fle6)) || AGED(fle6)) {
			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
			expire_flow(priv, priv_to_fib(priv, fle->f.r.fib), fle,
			    NG_QUEUE);
			counter_u64_add(priv->nfinfo_act_exp, 1);
		}
	}

	if (fle != NULL) {		/* An existing entry. */
		fle6 = (struct flow6_entry *)fle;

		fle6->f.bytes += plen;
		fle6->f.packets++;
		fle6->f.tcp_flags |= tcp_flags;
		fle6->f.last = time_uptime;

		/*
		 * We have the following reasons to expire a flow actively:
		 * - it hit the active timeout
		 * - a TCP connection closed
		 * - it is going to overflow the counter
		 */
		if (tcp_flags & TH_FIN || tcp_flags & TH_RST || AGED(fle6) ||
		    (fle6->f.bytes >= (CNTR_MAX - IF_MAXMTU))) {
			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
			expire_flow(priv, priv_to_fib(priv, fle->f.r.fib), fle,
			    NG_QUEUE);
			counter_u64_add(priv->nfinfo_act_exp, 1);
		} else {
			/*
			 * It is the newest, move it to the tail,
			 * if it isn't there already. Next search will
			 * locate it quicker.
			 */
			if (fle != TAILQ_LAST(&hsh->head, fhead)) {
				TAILQ_REMOVE(&hsh->head, fle, fle_hash);
				TAILQ_INSERT_TAIL(&hsh->head, fle, fle_hash);
			}
		}
	} else				/* A new flow entry. */
		error = hash6_insert(priv, hsh, &r, plen, flags, tcp_flags);

	mtx_unlock(&hsh->mtx);

	return (error);
}
#endif
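
/*
 * ng_netflow_flow_show() is a paging interface: userland is expected to
 * feed the hash_id/list_id values returned in the previous response back
 * into the next request to resume the walk; a response with both set to
 * zero means the whole cache has been covered.
 */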

/*
 * Return records from cache to userland.
 *
 * TODO: matching particular IP should be done in kernel, here.
 */
int
ng_netflow_flow_show(priv_p priv, struct ngnf_show_header *req,
    struct ngnf_show_header *resp)
{
	struct flow_hash_entry *hsh;
	struct flow_entry *fle;
	struct flow_entry_data *data = (struct flow_entry_data *)(resp + 1);
#ifdef INET6
	struct flow6_entry_data *data6 = (struct flow6_entry_data *)(resp + 1);
#endif
	int i, max;

	i = req->hash_id;
	if (i > NBUCKETS - 1)
		return (EINVAL);

#ifdef INET6
	if (req->version == 6) {
		resp->version = 6;
		hsh = priv->hash6 + i;
		max = NREC6_AT_ONCE;
	} else
#endif
	if (req->version == 4) {
		resp->version = 4;
		hsh = priv->hash + i;
		max = NREC_AT_ONCE;
	} else
		return (EINVAL);

	/*
	 * We will transfer no more than NREC_AT_ONCE records. More data
	 * will come in the next message.
	 * We send the current hash index and the current record number in
	 * the list to userland, and userland should return them back to us.
	 * Then, we will restart with the new entry.
	 *
	 * The resulting cache snapshot can be inaccurate if flow expiration
	 * is taking place on a hash item between userland data requests for
	 * this hash item id.
	 */
	resp->nentries = 0;
	for (; i < NBUCKETS; hsh++, i++) {
		int list_id;

		if (mtx_trylock(&hsh->mtx) == 0) {
			/*
			 * Requested hash index is not available;
			 * leave the decision to skip or re-request the data
			 * to userland.
			 */
			resp->hash_id = i;
			resp->list_id = 0;
			return (0);
		}

		list_id = 0;
		TAILQ_FOREACH(fle, &hsh->head, fle_hash) {
			if (hsh->mtx.mtx_lock & MTX_CONTESTED) {
				resp->hash_id = i;
				resp->list_id = list_id;
				mtx_unlock(&hsh->mtx);
				return (0);
			}

			list_id++;
			/* Search for particular record in list. */
			if (req->list_id > 0) {
				if (list_id < req->list_id)
					continue;

				/* Requested list position found. */
				req->list_id = 0;
			}
#ifdef INET6
			if (req->version == 6) {
				struct flow6_entry *fle6;

				fle6 = (struct flow6_entry *)fle;
				bcopy(&fle6->f, data6 + resp->nentries,
				    sizeof(fle6->f));
			} else
#endif
				bcopy(&fle->f, data + resp->nentries,
				    sizeof(fle->f));
			resp->nentries++;
			if (resp->nentries == max) {
				resp->hash_id = i;
				/*
				 * If it was the last item in the list
				 * we simply skip to the next hash_id.
				 */
				resp->list_id = list_id + 1;
				mtx_unlock(&hsh->mtx);
				return (0);
			}
		}
		mtx_unlock(&hsh->mtx);
	}

	resp->hash_id = resp->list_id = 0;

	return (0);
}

/* We have a full datagram in privdata. Send it to the export hook. */
static int
export_send(priv_p priv, fib_export_p fe, item_p item, int flags)
{
	struct mbuf *m = NGI_M(item);
	struct netflow_v5_export_dgram *dgram = mtod(m,
	    struct netflow_v5_export_dgram *);
	struct netflow_v5_header *header = &dgram->header;
	struct timespec ts;
	int error = 0;

	/* Fill mbuf header. */
	m->m_len = m->m_pkthdr.len = sizeof(struct netflow_v5_record) *
	    header->count + sizeof(struct netflow_v5_header);

	/* Fill export header. */
	header->sys_uptime = htonl(MILLIUPTIME(time_uptime));
	getnanotime(&ts);
	header->unix_secs = htonl(ts.tv_sec);
	header->unix_nsecs = htonl(ts.tv_nsec);
	header->engine_type = 0;
	header->engine_id = fe->domain_id;
	header->pad = 0;
	header->flow_seq = htonl(atomic_fetchadd_32(&fe->flow_seq,
	    header->count));
	header->count = htons(header->count);

	if (priv->export != NULL)
		NG_FWD_ITEM_HOOK_FLAGS(error, item, priv->export, flags);
	else
		NG_FREE_ITEM(item);

	return (error);
}

/* Add export record to dgram. */
static int
export_add(item_p item, struct flow_entry *fle)
{
	struct netflow_v5_export_dgram *dgram = mtod(NGI_M(item),
	    struct netflow_v5_export_dgram *);
	struct netflow_v5_header *header = &dgram->header;
	struct netflow_v5_record *rec;

	rec = &dgram->r[header->count];
	header->count++;

	KASSERT(header->count <= NETFLOW_V5_MAX_RECORDS,
	    ("ng_netflow: export too big"));

	/* Fill in export record. */
	rec->src_addr = fle->f.r.r_src.s_addr;
	rec->dst_addr = fle->f.r.r_dst.s_addr;
	rec->next_hop = fle->f.next_hop.s_addr;
	rec->i_ifx = htons(fle->f.fle_i_ifx);
	rec->o_ifx = htons(fle->f.fle_o_ifx);
	rec->packets = htonl(fle->f.packets);
	rec->octets = htonl(fle->f.bytes);
	rec->first = htonl(MILLIUPTIME(fle->f.first));
	rec->last = htonl(MILLIUPTIME(fle->f.last));
	rec->s_port = fle->f.r.r_sport;
	rec->d_port = fle->f.r.r_dport;
	rec->flags = fle->f.tcp_flags;
	rec->prot = fle->f.r.r_ip_p;
	rec->tos = fle->f.r.r_tos;
	rec->dst_mask = fle->f.dst_mask;
	rec->src_mask = fle->f.src_mask;
	rec->pad1 = 0;
	rec->pad2 = 0;

	/* Unsupported fields. */
	rec->src_as = rec->dst_as = 0;

	if (header->count == NETFLOW_V5_MAX_RECORDS)
		return (1);	/* end of datagram */
	else
		return (0);
}
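
/*
 * The expiry callout below reschedules itself once per second.  It uses
 * mtx_trylock() so it never blocks the per-packet path: contested buckets
 * are simply skipped and picked up on a later run.  While the cache holds
 * no more than NBUCKETS * 2 flows, only aged flows and small inactive
 * flows are expired; above that threshold any inactive flow is expired
 * to keep the hash chains short.
 */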

/* Periodic flow expiry run. */
void
ng_netflow_expire(void *arg)
{
	struct flow_entry *fle, *fle1;
	struct flow_hash_entry *hsh;
	priv_p priv = (priv_p)arg;
	int used, i;

	/*
	 * Go through the whole cache.
	 */
	used = uma_zone_get_cur(priv->zone);
	for (hsh = priv->hash, i = 0; i < NBUCKETS; hsh++, i++) {
		/*
		 * Skip entries that are already being worked on.
		 */
		if (mtx_trylock(&hsh->mtx) == 0)
			continue;

		TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) {
			/*
			 * Interrupt thread wants this entry!
			 * Quick! Quick! Bail out!
			 */
			if (hsh->mtx.mtx_lock & MTX_CONTESTED)
				break;

			/*
			 * Don't expire aggressively while the predicted
			 * hash collision ratio is small.
			 */
			if (used <= (NBUCKETS*2) && !INACTIVE(fle))
				break;

			if ((INACTIVE(fle) && (SMALL(fle) ||
			    (used > (NBUCKETS*2)))) || AGED(fle)) {
				TAILQ_REMOVE(&hsh->head, fle, fle_hash);
				expire_flow(priv, priv_to_fib(priv,
				    fle->f.r.fib), fle, NG_NOFLAGS);
				used--;
				counter_u64_add(priv->nfinfo_inact_exp, 1);
			}
		}
		mtx_unlock(&hsh->mtx);
	}

#ifdef INET6
	used = uma_zone_get_cur(priv->zone6);
	for (hsh = priv->hash6, i = 0; i < NBUCKETS; hsh++, i++) {
		struct flow6_entry *fle6;

		/*
		 * Skip entries that are already being worked on.
		 */
		if (mtx_trylock(&hsh->mtx) == 0)
			continue;

		TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) {
			fle6 = (struct flow6_entry *)fle;
			/*
			 * Interrupt thread wants this entry!
			 * Quick! Quick! Bail out!
			 */
			if (hsh->mtx.mtx_lock & MTX_CONTESTED)
				break;

			/*
			 * Don't expire aggressively while the predicted
			 * hash collision ratio is small.
			 */
			if (used <= (NBUCKETS*2) && !INACTIVE(fle6))
				break;

			if ((INACTIVE(fle6) && (SMALL(fle6) ||
			    (used > (NBUCKETS*2)))) || AGED(fle6)) {
				TAILQ_REMOVE(&hsh->head, fle, fle_hash);
				expire_flow(priv, priv_to_fib(priv,
				    fle->f.r.fib), fle, NG_NOFLAGS);
				used--;
				counter_u64_add(priv->nfinfo_inact_exp, 1);
			}
		}
		mtx_unlock(&hsh->mtx);
	}
#endif

	/* Schedule next expire. */
	callout_reset(&priv->exp_callout, (1*hz), &ng_netflow_expire,
	    (void *)priv);
}