/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2017-2025 Yandex LLC
 * Copyright (c) 2017-2025 Andrey V. Elsukov <ae@FreeBSD.org>
 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipfw.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/pcpu.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>

#include <netinet/ip6.h>	/* IN6_ARE_ADDR_EQUAL */
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#endif

#include <netpfil/ipfw/ip_fw_private.h>

#include <machine/in_cksum.h>	/* XXX for in_cksum */

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

/*
 * Description of dynamic states.
 *
 * Dynamic states are stored in lists accessed through hash tables
 * whose size is curr_dyn_buckets. This value can be modified through
 * the sysctl variable dyn_buckets.
 *
 * Currently there are four tables: dyn_ipv4, dyn_ipv6, dyn_ipv4_parent,
 * and dyn_ipv6_parent.
 *
 * When a packet is received, its address fields are hashed, then matched
 * against the entries in the corresponding list by addr_type.
 * Dynamic states can be used for different purposes:
 *  + stateful rules;
 *  + enforcing limits on the number of sessions;
 *  + in-kernel NAT (not implemented yet)
 *
 * The lifetime of dynamic states is regulated by dyn_*_lifetime,
 * measured in seconds and depending on the flags.
 *
 * The total number of dynamic states is equal to UMA zone items count.
 * The max number of dynamic states is dyn_max. When we reach
 * the maximum number of states we do not create any more. This is
 * done to avoid consuming too much memory, but also too much
 * time when searching on each packet (ideally, we should try instead
 * to put a limit on the length of the list on each bucket...).
 *
 * Each state holds a pointer to the parent ipfw rule so we know what
 * action to perform. Dynamic rules are removed when the parent rule is
 * deleted.
 *
 * There are some limitations with dynamic rules -- we do not
 * obey the 'randomized match', and we do not do multiple
 * passes through the firewall. XXX check the latter!!!
 */

/* By default use jenkins hash function */
#define	IPFIREWALL_JENKINSHASH

#define	DYN_COUNTER_INC(d, dir, pktlen)	do {	\
	(d)->pcnt_ ## dir++;			\
	(d)->bcnt_ ## dir += pktlen;		\
	} while (0)

#define	DYN_REFERENCED	0x01
/*
 * The DYN_REFERENCED flag is used to show that a state keeps a reference
 * to a named object, and this reference should be released when the state
 * becomes expired.
 */

struct dyn_data {
	void		*parent;	/* pointer to parent rule */
	uint32_t	chain_id;	/* cached ruleset id */
	uint32_t	f_pos;		/* cached rule index */

	uint32_t	hashval;	/* hash value used for hash resize */
	uint16_t	fibnum;		/* fib used to send keepalives */
	uint8_t		_pad;
	uint8_t		flags;		/* internal flags */
	uint32_t	rulenum;	/* parent rule number */
	uint32_t	ruleid;		/* parent rule id */

	uint32_t	state;		/* TCP session state and flags */
	uint32_t	ack_fwd;	/* most recent ACKs in forward */
	uint32_t	ack_rev;	/* and reverse direction (used */
					/* to generate keepalives) */
	uint32_t	sync;		/* synchronization time */
	uint32_t	expire;		/* expire time */

	uint64_t	pcnt_fwd;	/* packets counter in forward */
	uint64_t	bcnt_fwd;	/* bytes counter in forward */
	uint64_t	pcnt_rev;	/* packets counter in reverse */
	uint64_t	bcnt_rev;	/* bytes counter in reverse */
};

#define	DPARENT_COUNT_DEC(p)	do {	\
	MPASS(p->count > 0);		\
	ck_pr_dec_32(&(p)->count);	\
} while (0)
#define	DPARENT_COUNT_INC(p)	ck_pr_inc_32(&(p)->count)
#define	DPARENT_COUNT(p)	ck_pr_load_32(&(p)->count)
struct dyn_parent {
	void		*parent;	/* pointer to parent rule */
	uint32_t	count;		/* number of linked states */
	uint32_t	rulenum;	/* parent rule number */
	uint32_t	ruleid;		/* parent rule id */
	uint32_t	hashval;	/* hash value used for hash resize */
	uint32_t	expire;		/* expire time */
};

struct dyn_ipv4_state {
	uint8_t		type;		/* State type */
	uint8_t		proto;		/* UL Protocol */
	uint16_t	spare;
	uint32_t	kidx;		/* named object index */
	uint16_t	sport, dport;	/* ULP source and destination ports */
	in_addr_t	src, dst;	/* IPv4 source and destination */

	union {
		struct dyn_data	*data;
		struct dyn_parent *limit;
	};
	CK_SLIST_ENTRY(dyn_ipv4_state)	entry;
	SLIST_ENTRY(dyn_ipv4_state)	expired;
};
CK_SLIST_HEAD(dyn_ipv4ck_slist, dyn_ipv4_state);
VNET_DEFINE_STATIC(struct dyn_ipv4ck_slist *, dyn_ipv4);
VNET_DEFINE_STATIC(struct dyn_ipv4ck_slist *, dyn_ipv4_parent);

SLIST_HEAD(dyn_ipv4_slist, dyn_ipv4_state);
VNET_DEFINE_STATIC(struct dyn_ipv4_slist, dyn_expired_ipv4);
#define	V_dyn_ipv4		VNET(dyn_ipv4)
#define	V_dyn_ipv4_parent	VNET(dyn_ipv4_parent)
#define	V_dyn_expired_ipv4	VNET(dyn_expired_ipv4)
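/*
 * Illustrative note (not part of the original comments): the anonymous
 * data/limit union in the state structures is discriminated by the
 * 'type' field.  States created by the O_KEEP_STATE/O_LIMIT opcodes use
 * the 'data' member, while parent states of type O_LIMIT_PARENT use the
 * 'limit' member.
 */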
#ifdef INET6
struct dyn_ipv6_state {
	uint8_t		type;		/* State type */
	uint8_t		proto;		/* UL Protocol */
	uint16_t	kidx;		/* named object index */
	uint16_t	sport, dport;	/* ULP source and destination ports */
	struct in6_addr	src, dst;	/* IPv6 source and destination */
	uint32_t	zoneid;		/* IPv6 scope zone id */
	union {
		struct dyn_data	*data;
		struct dyn_parent *limit;
	};
	CK_SLIST_ENTRY(dyn_ipv6_state)	entry;
	SLIST_ENTRY(dyn_ipv6_state)	expired;
};
CK_SLIST_HEAD(dyn_ipv6ck_slist, dyn_ipv6_state);
VNET_DEFINE_STATIC(struct dyn_ipv6ck_slist *, dyn_ipv6);
VNET_DEFINE_STATIC(struct dyn_ipv6ck_slist *, dyn_ipv6_parent);

SLIST_HEAD(dyn_ipv6_slist, dyn_ipv6_state);
VNET_DEFINE_STATIC(struct dyn_ipv6_slist, dyn_expired_ipv6);
#define	V_dyn_ipv6		VNET(dyn_ipv6)
#define	V_dyn_ipv6_parent	VNET(dyn_ipv6_parent)
#define	V_dyn_expired_ipv6	VNET(dyn_expired_ipv6)
#endif /* INET6 */

/*
 * A per-CPU pointer indicates that the specified state is currently in use
 * and must not be reclaimed by the expiration callout.
 */
static void **dyn_hp_cache;
DPCPU_DEFINE_STATIC(void *, dyn_hp);
#define	DYNSTATE_GET(cpu)	ck_pr_load_ptr(DPCPU_ID_PTR((cpu), dyn_hp))
#define	DYNSTATE_PROTECT(v)	ck_pr_store_ptr(DPCPU_PTR(dyn_hp), (v))
#define	DYNSTATE_RELEASE()	DYNSTATE_PROTECT(NULL)
#define	DYNSTATE_CRITICAL_ENTER()	critical_enter()
#define	DYNSTATE_CRITICAL_EXIT()	do {	\
	DYNSTATE_RELEASE();			\
	critical_exit();			\
} while (0);

/*
 * We keep two version numbers: one is updated when a new entry is added
 * to the list, the second when an entry is deleted from it.
 * Versions are updated under the bucket lock.
 *
 * The bucket "add" version number is used to detect that, in the time
 * between a state lookup (i.e. ipfw_dyn_lookup_state()) and the following
 * state creation (i.e. ipfw_dyn_install_state()), another concurrent
 * thread did not install some state in this bucket. Using this info we
 * can avoid an additional state lookup, because we are sure that we will
 * not install the state twice.
 *
 * Also, by tracking the bucket "del" version during lookup, we can be
 * sure that a state entry was not unlinked and freed between the moment
 * we read the state pointer and the moment we protect it with the hazard
 * pointer.
 *
 * An entry unlinked from a CK list remains unchanged until it is freed.
 * Unlinked entries are linked into expired lists using the "expired"
 * field.
 */

/*
 * dyn_expire_lock is used to protect access to dyn_expired_xxx lists.
 * dyn_bucket_lock is used to get write access to lists in specific bucket.
 * Currently one dyn_bucket_lock is used for all ipv4, ipv4_parent, ipv6,
 * and ipv6_parent lists.
 */
VNET_DEFINE_STATIC(struct mtx, dyn_expire_lock);
VNET_DEFINE_STATIC(struct mtx *, dyn_bucket_lock);
#define	V_dyn_expire_lock	VNET(dyn_expire_lock)
#define	V_dyn_bucket_lock	VNET(dyn_bucket_lock)

/*
 * Bucket's add/delete generation versions.
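 *
 * An illustrative sketch of the protocol described above: a reader
 * samples the "add" version, fails to find a matching state and decides
 * to create one; the writer side then re-checks the version under the
 * bucket lock and, if it has changed, repeats the lookup before
 * inserting, so a duplicate state is never installed.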
 */
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_add);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_del);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_parent_add);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_parent_del);
#define	V_dyn_ipv4_add		VNET(dyn_ipv4_add)
#define	V_dyn_ipv4_del		VNET(dyn_ipv4_del)
#define	V_dyn_ipv4_parent_add	VNET(dyn_ipv4_parent_add)
#define	V_dyn_ipv4_parent_del	VNET(dyn_ipv4_parent_del)

#ifdef INET6
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_add);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_del);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_parent_add);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_parent_del);
#define	V_dyn_ipv6_add		VNET(dyn_ipv6_add)
#define	V_dyn_ipv6_del		VNET(dyn_ipv6_del)
#define	V_dyn_ipv6_parent_add	VNET(dyn_ipv6_parent_add)
#define	V_dyn_ipv6_parent_del	VNET(dyn_ipv6_parent_del)
#endif /* INET6 */

#define	DYN_BUCKET(h, b)	((h) & (b - 1))
#define	DYN_BUCKET_VERSION(b, v)	ck_pr_load_32(&V_dyn_ ## v[(b)])
#define	DYN_BUCKET_VERSION_BUMP(b, v)	ck_pr_inc_32(&V_dyn_ ## v[(b)])

#define	DYN_BUCKET_LOCK_INIT(lock, b)	\
    mtx_init(&lock[(b)], "IPFW dynamic bucket", NULL, MTX_DEF)
#define	DYN_BUCKET_LOCK_DESTROY(lock, b)	mtx_destroy(&lock[(b)])
#define	DYN_BUCKET_LOCK(b)	mtx_lock(&V_dyn_bucket_lock[(b)])
#define	DYN_BUCKET_UNLOCK(b)	mtx_unlock(&V_dyn_bucket_lock[(b)])
#define	DYN_BUCKET_ASSERT(b)	mtx_assert(&V_dyn_bucket_lock[(b)], MA_OWNED)

#define	DYN_EXPIRED_LOCK_INIT()		\
    mtx_init(&V_dyn_expire_lock, "IPFW expired states list", NULL, MTX_DEF)
#define	DYN_EXPIRED_LOCK_DESTROY()	mtx_destroy(&V_dyn_expire_lock)
#define	DYN_EXPIRED_LOCK()		mtx_lock(&V_dyn_expire_lock)
#define	DYN_EXPIRED_UNLOCK()		mtx_unlock(&V_dyn_expire_lock)

VNET_DEFINE_STATIC(uint32_t, dyn_buckets_max);
VNET_DEFINE_STATIC(uint32_t, curr_dyn_buckets);
VNET_DEFINE_STATIC(struct callout, dyn_timeout);
#define	V_dyn_buckets_max	VNET(dyn_buckets_max)
#define	V_curr_dyn_buckets	VNET(curr_dyn_buckets)
#define	V_dyn_timeout		VNET(dyn_timeout)

/* Maximum length of states chain in a bucket */
VNET_DEFINE_STATIC(uint32_t, curr_max_length);
#define	V_curr_max_length	VNET(curr_max_length)

VNET_DEFINE_STATIC(uint32_t, dyn_keep_states);
#define	V_dyn_keep_states	VNET(dyn_keep_states)

VNET_DEFINE_STATIC(uma_zone_t, dyn_data_zone);
VNET_DEFINE_STATIC(uma_zone_t, dyn_parent_zone);
VNET_DEFINE_STATIC(uma_zone_t, dyn_ipv4_zone);
#ifdef INET6
VNET_DEFINE_STATIC(uma_zone_t, dyn_ipv6_zone);
#define	V_dyn_ipv6_zone		VNET(dyn_ipv6_zone)
#endif /* INET6 */
#define	V_dyn_data_zone		VNET(dyn_data_zone)
#define	V_dyn_parent_zone	VNET(dyn_parent_zone)
#define	V_dyn_ipv4_zone		VNET(dyn_ipv4_zone)

/*
 * Timeouts for various events in handling dynamic rules.
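 *
 * Roughly (see dyn_update_tcp_state() below): dyn_syn_lifetime applies
 * while a TCP session is opening, dyn_ack_lifetime once ACKs have been
 * seen in both directions, and dyn_fin_lifetime/dyn_rst_lifetime once
 * the session is closed or reset; dyn_udp_lifetime and
 * dyn_short_lifetime cover UDP and the remaining protocols.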
 */
VNET_DEFINE_STATIC(uint32_t, dyn_ack_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_syn_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_fin_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_rst_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_udp_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_short_lifetime);

#define	V_dyn_ack_lifetime	VNET(dyn_ack_lifetime)
#define	V_dyn_syn_lifetime	VNET(dyn_syn_lifetime)
#define	V_dyn_fin_lifetime	VNET(dyn_fin_lifetime)
#define	V_dyn_rst_lifetime	VNET(dyn_rst_lifetime)
#define	V_dyn_udp_lifetime	VNET(dyn_udp_lifetime)
#define	V_dyn_short_lifetime	VNET(dyn_short_lifetime)

/*
 * Keepalives are sent if dyn_keepalive is set. They are sent every
 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
 * seconds of the lifetime of a rule.
 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
 * than dyn_keepalive_period.
 */
VNET_DEFINE_STATIC(uint32_t, dyn_keepalive_interval);
VNET_DEFINE_STATIC(uint32_t, dyn_keepalive_period);
VNET_DEFINE_STATIC(uint32_t, dyn_keepalive);
VNET_DEFINE_STATIC(time_t, dyn_keepalive_last);

#define	V_dyn_keepalive_interval	VNET(dyn_keepalive_interval)
#define	V_dyn_keepalive_period	VNET(dyn_keepalive_period)
#define	V_dyn_keepalive		VNET(dyn_keepalive)
#define	V_dyn_keepalive_last	VNET(dyn_keepalive_last)

VNET_DEFINE_STATIC(uint32_t, dyn_max);		/* max # of dynamic states */
VNET_DEFINE_STATIC(uint32_t, dyn_count);	/* number of states */
VNET_DEFINE_STATIC(uint32_t, dyn_parent_max);	/* max # of parent states */
VNET_DEFINE_STATIC(uint32_t, dyn_parent_count);	/* number of parent states */

#define	V_dyn_max		VNET(dyn_max)
#define	V_dyn_count		VNET(dyn_count)
#define	V_dyn_parent_max	VNET(dyn_parent_max)
#define	V_dyn_parent_count	VNET(dyn_parent_count)

#define	DYN_COUNT_DEC(name)	do {		\
	MPASS((V_ ## name) > 0);		\
	ck_pr_dec_32(&(V_ ## name));		\
} while (0)
#define	DYN_COUNT_INC(name)	ck_pr_inc_32(&(V_ ## name))
#define	DYN_COUNT(name)		ck_pr_load_32(&(V_ ## name))

static time_t last_log;	/* Log ratelimiting */

/*
 * Get/set maximum number of dynamic states in given VNET instance.
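 *
 * For example, from userland (illustrative):
 *   # sysctl net.inet.ip.fw.dyn_max=65536
 * The handler below also propagates the new limit to the UMA zone.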
 */
static int
sysctl_dyn_max(SYSCTL_HANDLER_ARGS)
{
	uint32_t nstates;
	int error;

	nstates = V_dyn_max;
	error = sysctl_handle_32(oidp, &nstates, 0, req);
	/* Read operation or some error */
	if ((error != 0) || (req->newptr == NULL))
		return (error);

	V_dyn_max = nstates;
	uma_zone_set_max(V_dyn_data_zone, V_dyn_max);
	return (0);
}

static int
sysctl_dyn_parent_max(SYSCTL_HANDLER_ARGS)
{
	uint32_t nstates;
	int error;

	nstates = V_dyn_parent_max;
	error = sysctl_handle_32(oidp, &nstates, 0, req);
	/* Read operation or some error */
	if ((error != 0) || (req->newptr == NULL))
		return (error);

	V_dyn_parent_max = nstates;
	uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max);
	return (0);
}

static int
sysctl_dyn_buckets(SYSCTL_HANDLER_ARGS)
{
	uint32_t nbuckets;
	int error;

	nbuckets = V_dyn_buckets_max;
	error = sysctl_handle_32(oidp, &nbuckets, 0, req);
	/* Read operation or some error */
	if ((error != 0) || (req->newptr == NULL))
		return (error);

	if (nbuckets > 256)
		V_dyn_buckets_max = 1 << fls(nbuckets - 1);
	else
		return (EINVAL);
	return (0);
}

SYSCTL_DECL(_net_inet_ip_fw);

SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_count,
    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
    "Current number of dynamic states.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_parent_count,
    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_parent_count), 0,
    "Current number of parent states.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
    "Current number of buckets for states hash table.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_max_length,
    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_max_length), 0,
    "Current maximum length of states chains in hash buckets.");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    0, 0, sysctl_dyn_buckets, "IU",
    "Max number of buckets for dynamic states hash table.");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    0, 0, sysctl_dyn_max, "IU",
    "Max number of dynamic states.");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_parent_max,
    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    0, 0, sysctl_dyn_parent_max, "IU",
    "Max number of parent dynamic states.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
    "Lifetime of dynamic states for TCP ACK.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
    "Lifetime of dynamic states for TCP SYN.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
    "Lifetime of dynamic states for TCP FIN.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
    "Lifetime of dynamic states for TCP RST.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
    "Lifetime of dynamic states for UDP.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
    "Lifetime of dynamic states for other situations.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
    "Enable keepalives for dynamic states.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keep_states,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0,
    "Do not flush dynamic states on rule deletion");

#ifdef IPFIREWALL_DYNDEBUG
#define	DYN_DEBUG(fmt, ...)	do {			\
	printf("%s: " fmt "\n", __func__, __VA_ARGS__);	\
} while (0)
#else
#define	DYN_DEBUG(fmt, ...)
#endif /* !IPFIREWALL_DYNDEBUG */

#ifdef INET6
/* Functions to work with IPv6 states */
static struct dyn_ipv6_state *dyn_lookup_ipv6_state(
    const struct ipfw_flow_id *, uint32_t, const void *,
    struct ipfw_dyn_info *, int);
static int dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *,
    uint32_t, const void *, int, uint32_t, uint32_t);
static struct dyn_ipv6_state *dyn_alloc_ipv6_state(
    const struct ipfw_flow_id *, uint32_t, uint32_t, uint8_t);
static int dyn_add_ipv6_state(void *, uint32_t, uint32_t,
    const struct ipfw_flow_id *, uint32_t, const void *, int, uint32_t,
    struct ipfw_dyn_info *, uint16_t, uint32_t, uint8_t);
static void dyn_export_ipv6_state(const struct dyn_ipv6_state *,
    ipfw_dyn_rule *);

static uint32_t dyn_getscopeid(const struct ip_fw_args *);
static void dyn_make_keepalive_ipv6(struct mbuf *, const struct in6_addr *,
    const struct in6_addr *, uint32_t, uint32_t, uint32_t, uint16_t,
    uint16_t);
static void dyn_enqueue_keepalive_ipv6(struct mbufq *,
    const struct dyn_ipv6_state *);
static void dyn_send_keepalive_ipv6(struct ip_fw_chain *);

static struct dyn_ipv6_state *dyn_lookup_ipv6_parent(
    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint32_t,
    uint32_t);
static struct dyn_ipv6_state *dyn_lookup_ipv6_parent_locked(
    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint32_t,
    uint32_t);
static struct dyn_ipv6_state *dyn_add_ipv6_parent(void *, uint32_t, uint32_t,
    const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t, uint32_t);
#endif /* INET6 */

/* Functions to work with limit states */
static void *dyn_get_parent_state(const struct ipfw_flow_id *, uint32_t,
    struct ip_fw *, uint32_t, uint32_t, uint32_t);
static struct dyn_ipv4_state *dyn_lookup_ipv4_parent(
    const struct ipfw_flow_id *, const void *, uint32_t, uint32_t, uint32_t);
static struct dyn_ipv4_state *dyn_lookup_ipv4_parent_locked(
    const struct ipfw_flow_id *, const void *, uint32_t, uint32_t, uint32_t);
static struct dyn_parent *dyn_alloc_parent(void *, uint32_t, uint32_t,
    uint32_t);
static struct dyn_ipv4_state *dyn_add_ipv4_parent(void *, uint32_t, uint32_t,
    const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t);

static void dyn_tick(void *);
static void dyn_expire_states(struct ip_fw_chain *, ipfw_range_tlv *);
static void dyn_free_states(struct ip_fw_chain *);
static void dyn_export_parent(const struct dyn_parent *, uint32_t, uint8_t,
    ipfw_dyn_rule *);
static void dyn_export_data(const struct dyn_data *, uint32_t, uint8_t,
    uint8_t, ipfw_dyn_rule *);
static uint32_t dyn_update_tcp_state(struct dyn_data *,
    const struct ipfw_flow_id *, const struct tcphdr *, int);
static void dyn_update_proto_state(struct dyn_data *,
    const struct ipfw_flow_id *, const void *, int, int);

/* Functions to work with IPv4 states */
struct dyn_ipv4_state *dyn_lookup_ipv4_state(const struct ipfw_flow_id *,
    const void *, struct ipfw_dyn_info *, int);
static int dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *,
    const void *, int, uint32_t, uint32_t);
static struct dyn_ipv4_state *dyn_alloc_ipv4_state(
    const struct ipfw_flow_id *, uint32_t, uint8_t);
static int dyn_add_ipv4_state(void *, uint32_t, uint32_t,
    const struct ipfw_flow_id *, const void *, int, uint32_t,
    struct ipfw_dyn_info *, uint16_t, uint32_t, uint8_t);
static void dyn_export_ipv4_state(const struct dyn_ipv4_state *,
    ipfw_dyn_rule *);

/*
 * Named states support.
 */
static char *default_state_name = "default";
struct dyn_state_obj {
	struct named_object	no;
	char			name[64];
};

/*
 * Classifier callback.
 * Return 0 if the opcode contains an object that should be referenced
 * or rewritten.
 */
static int
dyn_classify(ipfw_insn *cmd0, uint32_t *puidx, uint8_t *ptype)
{
	ipfw_insn_kidx *cmd;

	if (F_LEN(cmd0) < 2)
		return (EINVAL);

	/*
	 * NOTE: ipfw_insn_kidx and ipfw_insn_limit have overlapping kidx
	 * fields, so we can use one type to access the kidx field.
	 */
	cmd = insntod(cmd0, kidx);
	DYN_DEBUG("opcode %u, kidx %u", cmd0->opcode, cmd->kidx);
	/* Don't rewrite "check-state any" */
	if (cmd->kidx == 0 &&
	    cmd0->opcode == O_CHECK_STATE)
		return (1);

	*puidx = cmd->kidx;
	*ptype = 0;
	return (0);
}

static void
dyn_update(ipfw_insn *cmd0, uint32_t idx)
{

	insntod(cmd0, kidx)->kidx = idx;
	DYN_DEBUG("opcode %u, kidx %u", cmd0->opcode, idx);
}

static int
dyn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
    struct named_object **pno)
{
	ipfw_obj_ntlv *ntlv;
	const char *name;

	DYN_DEBUG("uidx %u", ti->uidx);
	if (ti->uidx != 0) {
		if (ti->tlvs == NULL)
			return (EINVAL);
		/* Search ntlv in the buffer provided by user */
		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
		    IPFW_TLV_STATE_NAME);
		if (ntlv == NULL)
			return (EINVAL);
		name = ntlv->name;
	} else
		name = default_state_name;
	/*
	 * Search for a named object with the corresponding name.
	 * Since state objects are global, ignore the set value
	 * and use zero instead.
	 */
	*pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0,
	    IPFW_TLV_STATE_NAME, name);
	/*
	 * We always return success here.
	 * The caller will check *pno and mark the object as unresolved,
	 * then it will automatically create the "default" object.
	 */
	return (0);
}

static struct named_object *
dyn_findbykidx(struct ip_fw_chain *ch, uint32_t idx)
{

	DYN_DEBUG("kidx %u", idx);
	return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx));
}

static int
dyn_create(struct ip_fw_chain *ch, struct tid_info *ti,
    uint32_t *pkidx)
{
	struct namedobj_instance *ni;
	struct dyn_state_obj *obj;
	struct named_object *no;
	ipfw_obj_ntlv *ntlv;
	char *name;

	DYN_DEBUG("uidx %u", ti->uidx);
	if (ti->uidx != 0) {
		if (ti->tlvs == NULL)
			return (EINVAL);
		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
		    IPFW_TLV_STATE_NAME);
		if (ntlv == NULL)
			return (EINVAL);
		name = ntlv->name;
	} else
		name = default_state_name;

	ni = CHAIN_TO_SRV(ch);
	obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO);
	obj->no.name = obj->name;
	obj->no.etlv = IPFW_TLV_STATE_NAME;
	strlcpy(obj->name, name, sizeof(obj->name));

	IPFW_UH_WLOCK(ch);
	no = ipfw_objhash_lookup_name_type(ni, 0,
	    IPFW_TLV_STATE_NAME, name);
	if (no != NULL) {
		/*
		 * The object is already created.
		 * Just return its kidx and bump the refcount.
		 */
		*pkidx = no->kidx;
		no->refcnt++;
		IPFW_UH_WUNLOCK(ch);
		free(obj, M_IPFW);
		DYN_DEBUG("\tfound kidx %u for name '%s'", *pkidx, no->name);
		return (0);
	}
	if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) {
		DYN_DEBUG("\talloc_idx failed for %s", name);
		IPFW_UH_WUNLOCK(ch);
		free(obj, M_IPFW);
		return (ENOSPC);
	}
	ipfw_objhash_add(ni, &obj->no);
	SRV_OBJECT(ch, obj->no.kidx) = obj;
	obj->no.refcnt++;
	*pkidx = obj->no.kidx;
	IPFW_UH_WUNLOCK(ch);
	DYN_DEBUG("\tcreated kidx %u for name '%s'", *pkidx, name);
	return (0);
}

static void
dyn_destroy(struct ip_fw_chain *ch, struct named_object *no)
{
	struct dyn_state_obj *obj;

	IPFW_UH_WLOCK_ASSERT(ch);

	KASSERT(no->etlv == IPFW_TLV_STATE_NAME,
	    ("%s: wrong object type %u", __func__, no->etlv));
	KASSERT(no->refcnt == 1,
	    ("Destroying object '%s' (type %u, idx %u) with refcnt %u",
	    no->name, no->etlv, no->kidx, no->refcnt));
	DYN_DEBUG("kidx %u", no->kidx);
	obj = SRV_OBJECT(ch, no->kidx);
	SRV_OBJECT(ch, no->kidx) = NULL;
	ipfw_objhash_del(CHAIN_TO_SRV(ch), no);
	ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), no->kidx);

	free(obj, M_IPFW);
}

static struct opcode_obj_rewrite dyn_opcodes[] = {
	{
		O_KEEP_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_CHECK_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_PROBE_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_LIMIT, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
};

/*
 * IMPORTANT: the hash function for dynamic rules must be commutative
 * in source and destination (ip,port), because rules are bidirectional
 * and we want to find both directions of a flow in the same bucket.
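 *
 * For example, a packet 10.0.0.1:1025 -> 10.0.0.2:80 and its reply
 * 10.0.0.2:80 -> 10.0.0.1:1025 must hash to the same value; the
 * Jenkins variant below achieves this by canonically ordering the
 * tuple with addrcmp4()/addrcmp6() before hashing.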
 */
#ifndef IPFIREWALL_JENKINSHASH
static __inline uint32_t
hash_packet(const struct ipfw_flow_id *id)
{
	uint32_t i;

#ifdef INET6
	if (IS_IP6_FLOW_ID(id))
		i = ntohl((id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
		    (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
		    (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
		    (id->src_ip6.__u6_addr.__u6_addr32[3]));
	else
#endif /* INET6 */
		i = (id->dst_ip) ^ (id->src_ip);
	i ^= (id->dst_port) ^ (id->src_port);
	return (i);
}

static __inline uint32_t
hash_parent(const struct ipfw_flow_id *id, const void *rule)
{

	return (hash_packet(id) ^ ((uintptr_t)rule));
}

#else /* IPFIREWALL_JENKINSHASH */

VNET_DEFINE_STATIC(uint32_t, dyn_hashseed);
#define	V_dyn_hashseed	VNET(dyn_hashseed)

static __inline int
addrcmp4(const struct ipfw_flow_id *id)
{

	if (id->src_ip < id->dst_ip)
		return (0);
	if (id->src_ip > id->dst_ip)
		return (1);
	if (id->src_port <= id->dst_port)
		return (0);
	return (1);
}

#ifdef INET6
static __inline int
addrcmp6(const struct ipfw_flow_id *id)
{
	int ret;

	ret = memcmp(&id->src_ip6, &id->dst_ip6, sizeof(struct in6_addr));
	if (ret < 0)
		return (0);
	if (ret > 0)
		return (1);
	if (id->src_port <= id->dst_port)
		return (0);
	return (1);
}

static __inline uint32_t
hash_packet6(const struct ipfw_flow_id *id)
{
	struct tuple6 {
		struct in6_addr	addr[2];
		uint16_t	port[2];
	} t6;

	if (addrcmp6(id) == 0) {
		t6.addr[0] = id->src_ip6;
		t6.addr[1] = id->dst_ip6;
		t6.port[0] = id->src_port;
		t6.port[1] = id->dst_port;
	} else {
		t6.addr[0] = id->dst_ip6;
		t6.addr[1] = id->src_ip6;
		t6.port[0] = id->dst_port;
		t6.port[1] = id->src_port;
	}
	return (jenkins_hash32((const uint32_t *)&t6,
	    sizeof(t6) / sizeof(uint32_t), V_dyn_hashseed));
}
#endif

static __inline uint32_t
hash_packet(const struct ipfw_flow_id *id)
{
	struct tuple4 {
		in_addr_t	addr[2];
		uint16_t	port[2];
	} t4;

	if (IS_IP4_FLOW_ID(id)) {
		/* All fields are in host byte order */
		if (addrcmp4(id) == 0) {
			t4.addr[0] = id->src_ip;
			t4.addr[1] = id->dst_ip;
			t4.port[0] = id->src_port;
			t4.port[1] = id->dst_port;
		} else {
			t4.addr[0] = id->dst_ip;
			t4.addr[1] = id->src_ip;
			t4.port[0] = id->dst_port;
			t4.port[1] = id->src_port;
		}
		return (jenkins_hash32((const uint32_t *)&t4,
		    sizeof(t4) / sizeof(uint32_t), V_dyn_hashseed));
	} else
#ifdef INET6
	if (IS_IP6_FLOW_ID(id))
		return (hash_packet6(id));
#endif
	return (0);
}

static __inline uint32_t
hash_parent(const struct ipfw_flow_id *id, const void *rule)
{

	return (jenkins_hash32((const uint32_t *)&rule,
	    sizeof(rule) / sizeof(uint32_t), hash_packet(id)));
}
#endif /* IPFIREWALL_JENKINSHASH */

/*
 * Print customizable flow id description via log(9) facility.
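 *
 * The resulting line has the form (illustrative):
 *   ipfw: <prefix> type <dyn_type> <src> <sport> -> <dst> <dport>,
 *   <dyn_count> <postfix>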
 */
static void
print_dyn_rule_flags(const struct ipfw_flow_id *id, int dyn_type,
    int log_flags, char *prefix, char *postfix)
{
	struct in_addr da;
#ifdef INET6
	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
#else
	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
#endif

#ifdef INET6
	if (IS_IP6_FLOW_ID(id)) {
		ip6_sprintf(src, &id->src_ip6);
		ip6_sprintf(dst, &id->dst_ip6);
	} else
#endif
	{
		da.s_addr = htonl(id->src_ip);
		inet_ntop(AF_INET, &da, src, sizeof(src));
		da.s_addr = htonl(id->dst_ip);
		inet_ntop(AF_INET, &da, dst, sizeof(dst));
	}
	log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n",
	    prefix, dyn_type, src, id->src_port, dst,
	    id->dst_port, V_dyn_count, postfix);
}

#define	print_dyn_rule(id, dtype, prefix, postfix)	\
	print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix)

#define	TIME_LEQ(a,b)	((int)((a)-(b)) <= 0)
#define	TIME_LE(a,b)	((int)((a)-(b)) < 0)
#define	_SEQ_GE(a,b)	((int)((a)-(b)) >= 0)
#define	BOTH_SYN	(TH_SYN | (TH_SYN << 8))
#define	BOTH_FIN	(TH_FIN | (TH_FIN << 8))
#define	BOTH_RST	(TH_RST | (TH_RST << 8))
#define	TCP_FLAGS	(BOTH_SYN | BOTH_FIN | BOTH_RST)
#define	ACK_FWD		0x00010000	/* fwd ack seen */
#define	ACK_REV		0x00020000	/* rev ack seen */
#define	ACK_BOTH	(ACK_FWD | ACK_REV)

static uint32_t
dyn_update_tcp_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
    const struct tcphdr *tcp, int dir)
{
	uint32_t ack, expire;
	uint32_t state, old;
	uint8_t th_flags;

	expire = data->expire;
	old = state = data->state;
	th_flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST);
	state |= (dir == MATCH_FORWARD) ? th_flags: (th_flags << 8);
	switch (state & TCP_FLAGS) {
	case TH_SYN:			/* opening */
		expire = time_uptime + V_dyn_syn_lifetime;
		break;

	case BOTH_SYN:			/* move to established */
	case BOTH_SYN | TH_FIN:		/* one side tries to close */
	case BOTH_SYN | (TH_FIN << 8):
		if (tcp == NULL)
			break;
		ack = ntohl(tcp->th_ack);
		if (dir == MATCH_FORWARD) {
			if (data->ack_fwd == 0 ||
			    _SEQ_GE(ack, data->ack_fwd)) {
				state |= ACK_FWD;
				if (data->ack_fwd != ack)
					ck_pr_store_32(&data->ack_fwd, ack);
			}
		} else {
			if (data->ack_rev == 0 ||
			    _SEQ_GE(ack, data->ack_rev)) {
				state |= ACK_REV;
				if (data->ack_rev != ack)
					ck_pr_store_32(&data->ack_rev, ack);
			}
		}
		if ((state & ACK_BOTH) == ACK_BOTH) {
			/*
			 * Set expire time to V_dyn_ack_lifetime only if
			 * we got ACKs for both directions.
			 * We use XOR here to avoid possible state
			 * overwriting in a concurrent thread.
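			 * (An atomic XOR of ACK_BOTH flips exactly these
			 * two bits, while a plain store of the whole word
			 * could clobber flag updates made concurrently.)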
			 */
			expire = time_uptime + V_dyn_ack_lifetime;
			ck_pr_xor_32(&data->state, ACK_BOTH);
		} else if ((data->state & ACK_BOTH) != (state & ACK_BOTH))
			ck_pr_or_32(&data->state, state & ACK_BOTH);
		break;

	case BOTH_SYN | BOTH_FIN:	/* both sides closed */
		if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
			V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
		expire = time_uptime + V_dyn_fin_lifetime;
		break;

	default:
		if (V_dyn_keepalive != 0 &&
		    V_dyn_rst_lifetime >= V_dyn_keepalive_period)
			V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
		expire = time_uptime + V_dyn_rst_lifetime;
	}
	/* Save TCP state if it was changed */
	if ((state & TCP_FLAGS) != (old & TCP_FLAGS))
		ck_pr_or_32(&data->state, state & TCP_FLAGS);
	return (expire);
}

/*
 * Update ULP specific state.
 * For TCP we keep sequence numbers and flags. For other protocols
 * currently we update only the expire time. Packets and bytes counters
 * are also updated here.
 */
static void
dyn_update_proto_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
    const void *ulp, int pktlen, int dir)
{
	uint32_t expire;

	/* NOTE: we are in critical section here. */
	switch (pkt->proto) {
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
		expire = time_uptime + V_dyn_udp_lifetime;
		break;
	case IPPROTO_TCP:
		expire = dyn_update_tcp_state(data, pkt, ulp, dir);
		break;
	default:
		expire = time_uptime + V_dyn_short_lifetime;
	}
	/*
	 * The expiration timer has per-second granularity, so there is no
	 * need to update it every time the state is matched.
	 */
	if (data->expire != expire)
		ck_pr_store_32(&data->expire, expire);

	if (dir == MATCH_FORWARD)
		DYN_COUNTER_INC(data, fwd, pktlen);
	else
		DYN_COUNTER_INC(data, rev, pktlen);
}

/*
 * Lookup IPv4 state.
 * Must be called in critical section.
 */
struct dyn_ipv4_state *
dyn_lookup_ipv4_state(const struct ipfw_flow_id *pkt, const void *ulp,
    struct ipfw_dyn_info *info, int pktlen)
{
	struct dyn_ipv4_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
	info->version = DYN_BUCKET_VERSION(bucket, ipv4_add);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv4_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv4_del))
			goto restart;
		if (s->proto != pkt->proto)
			continue;
		if (info->kidx != 0 && s->kidx != info->kidx)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			info->direction = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
			info->direction = MATCH_REVERSE;
			break;
		}
	}

	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
		    info->direction);
	return (s);
}

/*
 * Lookup IPv4 state.
 * A simplified version is used to check that a matching state doesn't
 * exist.
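 * It is called with the bucket lock held, so the hazard-pointer and
 * version checks used in the lockless lookup above are not needed here.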
 */
static int
dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *pkt,
    const void *ulp, int pktlen, uint32_t bucket, uint32_t kidx)
{
	struct dyn_ipv4_state *s;
	int dir;

	dir = MATCH_NONE;
	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
		if (s->proto != pkt->proto ||
		    s->kidx != kidx)
			continue;
		if (s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			dir = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
			dir = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
	return (s != NULL);
}

struct dyn_ipv4_state *
dyn_lookup_ipv4_parent(const struct ipfw_flow_id *pkt, const void *rule,
    uint32_t ruleid, uint32_t rulenum, uint32_t hashval)
{
	struct dyn_ipv4_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv4_parent_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_del))
			goto restart;
		/*
		 * NOTE: we do not need to check kidx, because a parent rule
		 * cannot create states with a different kidx.
		 * And a parent state is always created for the forward
		 * direction.
		 */
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			if (s->limit->expire != time_uptime +
			    V_dyn_short_lifetime)
				ck_pr_store_32(&s->limit->expire,
				    time_uptime + V_dyn_short_lifetime);
			break;
		}
	}
	return (s);
}

static struct dyn_ipv4_state *
dyn_lookup_ipv4_parent_locked(const struct ipfw_flow_id *pkt,
    const void *rule, uint32_t ruleid, uint32_t rulenum, uint32_t bucket)
{
	struct dyn_ipv4_state *s;

	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) {
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip)
			break;
	}
	return (s);
}

#ifdef INET6
static uint32_t
dyn_getscopeid(const struct ip_fw_args *args)
{

	/*
	 * If the source or destination address is a scoped (link-local)
	 * address, we need to determine the scope zone id to resolve
	 * address scope ambiguity.
	 */
	if (IN6_IS_ADDR_LINKLOCAL(&args->f_id.src_ip6) ||
	    IN6_IS_ADDR_LINKLOCAL(&args->f_id.dst_ip6))
		return (in6_getscopezone(args->ifp, IPV6_ADDR_SCOPE_LINKLOCAL));

	return (0);
}

/*
 * Lookup IPv6 state.
 * Must be called in critical section.
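 * (The critical section keeps the hazard pointer set by
 * DYNSTATE_PROTECT() valid, so the expiration callout will not free a
 * state this lookup is currently examining.)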
 */
static struct dyn_ipv6_state *
dyn_lookup_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, struct ipfw_dyn_info *info, int pktlen)
{
	struct dyn_ipv6_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
	info->version = DYN_BUCKET_VERSION(bucket, ipv6_add);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv6_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv6_del))
			goto restart;
		if (s->proto != pkt->proto || s->zoneid != zoneid)
			continue;
		if (info->kidx != 0 && s->kidx != info->kidx)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			info->direction = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
			info->direction = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
		    info->direction);
	return (s);
}

/*
 * Lookup IPv6 state.
 * A simplified version is used to check that a matching state doesn't
 * exist.
 */
static int
dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, int pktlen, uint32_t bucket, uint32_t kidx)
{
	struct dyn_ipv6_state *s;
	int dir;

	dir = MATCH_NONE;
	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
		if (s->proto != pkt->proto || s->kidx != kidx ||
		    s->zoneid != zoneid)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			dir = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
			dir = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
	return (s != NULL);
}

static struct dyn_ipv6_state *
dyn_lookup_ipv6_parent(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *rule, uint32_t ruleid, uint32_t rulenum, uint32_t hashval)
{
	struct dyn_ipv6_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv6_parent_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_del))
			goto restart;
		/*
		 * NOTE: we do not need to check kidx, because a parent rule
		 * cannot create states with a different kidx.
		 * Also, a parent state is always created for the forward
		 * direction.
		 */
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			if (s->limit->expire != time_uptime +
			    V_dyn_short_lifetime)
				ck_pr_store_32(&s->limit->expire,
				    time_uptime + V_dyn_short_lifetime);
			break;
		}
	}
	return (s);
}

static struct dyn_ipv6_state *
dyn_lookup_ipv6_parent_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *rule, uint32_t ruleid, uint32_t rulenum, uint32_t bucket)
{
	struct dyn_ipv6_state *s;

	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6))
			break;
	}
	return (s);
}

#endif /* INET6 */

/*
 * Lookup dynamic state.
 *  pkt - filled by ipfw_chk() ipfw_flow_id;
 *  ulp - determined by ipfw_chk() upper level protocol header;
 *  dyn_info - info about matched state to return back;
 * Returns pointer to state's parent rule and dyn_info. If there is
 * no state, NULL is returned.
 * On match, ipfw_dyn_lookup_state() updates the state's counters.
 */
struct ip_fw *
ipfw_dyn_lookup_state(const struct ip_fw_args *args, const void *ulp,
    int pktlen, const ipfw_insn *cmd, struct ipfw_dyn_info *info)
{
	struct dyn_data *data;
	struct ip_fw *rule;

	IPFW_RLOCK_ASSERT(&V_layer3_chain);
	MPASS(F_LEN(cmd) >= F_INSN_SIZE(ipfw_insn_kidx));

	data = NULL;
	rule = NULL;
	info->kidx = insntoc(cmd, kidx)->kidx;
	info->direction = MATCH_NONE;
	info->hashval = hash_packet(&args->f_id);

	DYNSTATE_CRITICAL_ENTER();
	if (IS_IP4_FLOW_ID(&args->f_id)) {
		struct dyn_ipv4_state *s;

		s = dyn_lookup_ipv4_state(&args->f_id, ulp, info, pktlen);
		if (s != NULL) {
			/*
			 * Dynamic states are created using the same 5-tuple,
			 * so it is assumed that the parent rule for an
			 * O_LIMIT state has the same address family.
			 */
			data = s->data;
			if (s->type == O_LIMIT) {
				s = data->parent;
				rule = s->limit->parent;
			} else
				rule = data->parent;
		}
	}
#ifdef INET6
	else if (IS_IP6_FLOW_ID(&args->f_id)) {
		struct dyn_ipv6_state *s;

		s = dyn_lookup_ipv6_state(&args->f_id, dyn_getscopeid(args),
		    ulp, info, pktlen);
		if (s != NULL) {
			data = s->data;
			if (s->type == O_LIMIT) {
				s = data->parent;
				rule = s->limit->parent;
			} else
				rule = data->parent;
		}
	}
#endif
	if (data != NULL) {
		/*
		 * If the cached chain id is the same, we can avoid a rule
		 * index lookup. Otherwise do the lookup and update chain_id
		 * and f_pos. It is safe even if there is a concurrent thread
		 * that wants to update the same state, because chain->id can
		 * be changed only under IPFW_WLOCK().
		 */
		if (data->chain_id != V_layer3_chain.id) {
			data->f_pos = ipfw_find_rule(&V_layer3_chain,
			    data->rulenum, data->ruleid);
			/*
			 * Check that the found state has not been orphaned.
			 * While chain->id is being changed, the parent
			 * rule can be deleted. If the found rule doesn't
			 * match the parent pointer, consider this
			 * result as MATCH_NONE and return NULL.
			 *
			 * This will lead to the creation of a new similar
			 * state that will be added into the head of this
			 * bucket. And the state that we currently have
			 * matched should be deleted by dyn_expire_states().
			 *
			 * In case dyn_keep_states is enabled, return a
			 * pointer to the deleted rule and an f_pos value
			 * corresponding to the penultimate rule.
			 * When V_dyn_keep_states is enabled, states that
			 * become orphaned get the DYN_REFERENCED flag and
			 * the rule is kept around, so we can return it. But
			 * since it is not in the rules map, we need to
			 * return an f_pos value such that, if the search
			 * continues after the state handling, the next rule
			 * will be the last one - the default rule.
			 */
			if (V_layer3_chain.map[data->f_pos] == rule) {
				data->chain_id = V_layer3_chain.id;
			} else if (V_dyn_keep_states != 0) {
				/*
				 * The original rule pointer is still usable.
				 * So, we return it, but f_pos needs to be
				 * changed to point to the penultimate rule.
				 */
				MPASS(V_layer3_chain.n_rules > 1);
				data->chain_id = V_layer3_chain.id;
				data->f_pos = V_layer3_chain.n_rules - 2;
			} else {
				rule = NULL;
				info->direction = MATCH_NONE;
				DYN_DEBUG("rule %p [%u, %u] is considered "
				    "invalid in data %p", rule, data->ruleid,
				    data->rulenum, data);
				/* info->f_pos doesn't matter here. */
			}
		}
		info->f_pos = data->f_pos;
	}
	DYNSTATE_CRITICAL_EXIT();
#if 0
	/*
	 * Return MATCH_NONE if the parent rule is in a disabled set.
	 * This will lead to the creation of a new similar state that
	 * will be added into the head of this bucket.
	 *
	 * XXXAE: we need to be able to update the state's set when the
	 * parent rule set is changed.
	 */
	if (rule != NULL && (V_set_disable & (1 << rule->set))) {
		rule = NULL;
		info->direction = MATCH_NONE;
	}
#endif
	return (rule);
}

static struct dyn_parent *
dyn_alloc_parent(void *parent, uint32_t ruleid, uint32_t rulenum,
    uint32_t hashval)
{
	struct dyn_parent *limit;

	limit = uma_zalloc(V_dyn_parent_zone, M_NOWAIT | M_ZERO);
	if (limit == NULL) {
		if (last_log != time_uptime) {
			last_log = time_uptime;
			log(LOG_DEBUG,
			    "ipfw: Cannot allocate parent dynamic state, "
			    "consider increasing "
			    "net.inet.ip.fw.dyn_parent_max\n");
		}
		return (NULL);
	}

	limit->parent = parent;
	limit->ruleid = ruleid;
	limit->rulenum = rulenum;
	limit->hashval = hashval;
	limit->expire = time_uptime + V_dyn_short_lifetime;
	return (limit);
}

static struct dyn_data *
dyn_alloc_dyndata(void *parent, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
    uint32_t hashval, uint16_t fibnum)
{
	struct dyn_data *data;

	data = uma_zalloc(V_dyn_data_zone, M_NOWAIT | M_ZERO);
	if (data == NULL) {
		if (last_log != time_uptime) {
			last_log = time_uptime;
			log(LOG_DEBUG,
			    "ipfw: Cannot allocate dynamic state, "
			    "consider increasing net.inet.ip.fw.dyn_max\n");
		}
		return (NULL);
	}

	data->parent = parent;
	data->ruleid = ruleid;
	data->rulenum = rulenum;
	data->fibnum = fibnum;
	data->hashval = hashval;
	data->expire = time_uptime + V_dyn_syn_lifetime;
	dyn_update_proto_state(data, pkt, ulp, pktlen, MATCH_FORWARD);
	return (data);
}

static struct dyn_ipv4_state *
dyn_alloc_ipv4_state(const struct ipfw_flow_id *pkt, uint32_t kidx,
    uint8_t type)
{
	struct dyn_ipv4_state *s;

	s = uma_zalloc(V_dyn_ipv4_zone, M_NOWAIT | M_ZERO);
	if (s == NULL)
		return (NULL);

	s->type = type;
	s->kidx = kidx;
	s->proto = pkt->proto;
	s->sport = pkt->src_port;
	s->dport = pkt->dst_port;
	s->src = pkt->src_ip;
	s->dst = pkt->dst_ip;
	return (s);
}

/*
 * Add IPv4 parent state.
 * Returns a pointer to the parent state. When it is not NULL, we are in
 * a critical section and the pointer is protected by a hazard pointer.
 * When some error occurs, it returns NULL and an exit from the critical
 * section is not needed.
 */
static struct dyn_ipv4_state *
dyn_add_ipv4_parent(void *rule, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, uint32_t hashval, uint32_t version,
    uint32_t kidx)
{
	struct dyn_ipv4_state *s;
	struct dyn_parent *limit;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		s = dyn_lookup_ipv4_parent_locked(pkt, rule, ruleid,
		    rulenum, bucket);
		if (s != NULL) {
			/*
			 * A concurrent thread has already created this
			 * state. Just return it.
			 */
			DYNSTATE_CRITICAL_ENTER();
			DYNSTATE_PROTECT(s);
			DYN_BUCKET_UNLOCK(bucket);
			return (s);
		}
	}

	limit = dyn_alloc_parent(rule, ruleid, rulenum, hashval);
	if (limit == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (NULL);
	}

	s = dyn_alloc_ipv4_state(pkt, kidx, O_LIMIT_PARENT);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_parent_zone, limit);
		return (NULL);
	}

	s->limit = limit;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4_parent[bucket], s, entry);
	DYN_COUNT_INC(dyn_parent_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_parent_add);
	DYNSTATE_CRITICAL_ENTER();
	DYNSTATE_PROTECT(s);
	DYN_BUCKET_UNLOCK(bucket);
	return (s);
}

static int
dyn_add_ipv4_state(void *parent, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
    uint32_t hashval, struct ipfw_dyn_info *info, uint16_t fibnum,
    uint32_t kidx, uint8_t type)
{
	struct dyn_ipv4_state *s;
	void *data;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (info->direction == MATCH_UNKNOWN ||
	    info->kidx != kidx ||
	    info->hashval != hashval ||
	    info->version != DYN_BUCKET_VERSION(bucket, ipv4_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		if (dyn_lookup_ipv4_state_locked(pkt, ulp, pktlen,
		    bucket, kidx) != 0) {
			DYN_BUCKET_UNLOCK(bucket);
			return (EEXIST);
		}
	}

	data = dyn_alloc_dyndata(parent, ruleid, rulenum, pkt, ulp,
	    pktlen, hashval, fibnum);
	if (data == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (ENOMEM);
	}

	s = dyn_alloc_ipv4_state(pkt, kidx, type);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_data_zone, data);
		return (ENOMEM);
	}

	s->data = data;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4[bucket], s, entry);
	DYN_COUNT_INC(dyn_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_add);
	DYN_BUCKET_UNLOCK(bucket);
	return (0);
}

#ifdef INET6
static struct dyn_ipv6_state *
dyn_alloc_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    uint32_t kidx, uint8_t type)
{
	struct dyn_ipv6_state *s;

	s = uma_zalloc(V_dyn_ipv6_zone, M_NOWAIT | M_ZERO);
	if (s == NULL)
		return (NULL);

	s->type = type;
	s->kidx = kidx;
	s->zoneid = zoneid;
	s->proto = pkt->proto;
	s->sport = pkt->src_port;
	s->dport = pkt->dst_port;
	s->src = pkt->src_ip6;
	s->dst = pkt->dst_ip6;
	return (s);
}

/*
 * Add IPv6 parent state.
 * Returns a pointer to the parent state. When it is not NULL, we are in
 * a critical section and the pointer is protected by a hazard pointer.
 * When some error occurs, it returns NULL and an exit from the critical
 * section is not needed.
 */
static struct dyn_ipv6_state *
dyn_add_ipv6_parent(void *rule, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, uint32_t zoneid, uint32_t hashval,
    uint32_t version, uint32_t kidx)
{
	struct dyn_ipv6_state *s;
	struct dyn_parent *limit;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		s = dyn_lookup_ipv6_parent_locked(pkt, zoneid, rule, ruleid,
		    rulenum, bucket);
		if (s != NULL) {
			/*
			 * A concurrent thread has already created this
			 * state. Just return it.
			 */
			DYNSTATE_CRITICAL_ENTER();
			DYNSTATE_PROTECT(s);
			DYN_BUCKET_UNLOCK(bucket);
			return (s);
		}
	}

	limit = dyn_alloc_parent(rule, ruleid, rulenum, hashval);
	if (limit == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (NULL);
	}

	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, O_LIMIT_PARENT);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_parent_zone, limit);
		return (NULL);
	}

	s->limit = limit;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6_parent[bucket], s, entry);
	DYN_COUNT_INC(dyn_parent_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_parent_add);
	DYNSTATE_CRITICAL_ENTER();
	DYNSTATE_PROTECT(s);
	DYN_BUCKET_UNLOCK(bucket);
	return (s);
}

static int
dyn_add_ipv6_state(void *parent, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, uint32_t zoneid, const void *ulp,
    int pktlen, uint32_t hashval, struct ipfw_dyn_info *info,
    uint16_t fibnum, uint32_t kidx, uint8_t type)
{
	struct dyn_ipv6_state *s;
	struct dyn_data *data;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (info->direction == MATCH_UNKNOWN ||
	    info->kidx != kidx ||
	    info->hashval != hashval ||
	    info->version != DYN_BUCKET_VERSION(bucket, ipv6_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
static int
dyn_add_ipv6_state(void *parent, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, uint32_t zoneid, const void *ulp,
    int pktlen, uint32_t hashval, struct ipfw_dyn_info *info,
    uint16_t fibnum, uint32_t kidx, uint8_t type)
{
	struct dyn_ipv6_state *s;
	struct dyn_data *data;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (info->direction == MATCH_UNKNOWN ||
	    info->kidx != kidx ||
	    info->hashval != hashval ||
	    info->version != DYN_BUCKET_VERSION(bucket, ipv6_add)) {
		/*
		 * The bucket version has changed since the last lookup;
		 * look up again to make sure the state does not exist.
		 */
		if (dyn_lookup_ipv6_state_locked(pkt, zoneid, ulp, pktlen,
		    bucket, kidx) != 0) {
			DYN_BUCKET_UNLOCK(bucket);
			return (EEXIST);
		}
	}

	data = dyn_alloc_dyndata(parent, ruleid, rulenum, pkt, ulp,
	    pktlen, hashval, fibnum);
	if (data == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (ENOMEM);
	}

	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, type);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_data_zone, data);
		return (ENOMEM);
	}

	s->data = data;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6[bucket], s, entry);
	DYN_COUNT_INC(dyn_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_add);
	DYN_BUCKET_UNLOCK(bucket);
	return (0);
}
#endif /* INET6 */

static void *
dyn_get_parent_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    struct ip_fw *rule, uint32_t hashval, uint32_t limit, uint32_t kidx)
{
	char sbuf[24];
	struct dyn_parent *p;
	void *ret;
	uint32_t bucket, version;

	p = NULL;
	ret = NULL;
	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYNSTATE_CRITICAL_ENTER();
	if (IS_IP4_FLOW_ID(pkt)) {
		struct dyn_ipv4_state *s;

		version = DYN_BUCKET_VERSION(bucket, ipv4_parent_add);
		s = dyn_lookup_ipv4_parent(pkt, rule, rule->id,
		    rule->rulenum, bucket);
		if (s == NULL) {
			/*
			 * Exit from the critical section because
			 * dyn_add_ipv4_parent() will acquire the bucket
			 * lock.
			 */
			DYNSTATE_CRITICAL_EXIT();

			s = dyn_add_ipv4_parent(rule, rule->id,
			    rule->rulenum, pkt, hashval, version, kidx);
			if (s == NULL)
				return (NULL);
			/* Now we are in the critical section again. */
		}
		ret = s;
		p = s->limit;
	}
#ifdef INET6
	else if (IS_IP6_FLOW_ID(pkt)) {
		struct dyn_ipv6_state *s;

		version = DYN_BUCKET_VERSION(bucket, ipv6_parent_add);
		s = dyn_lookup_ipv6_parent(pkt, zoneid, rule, rule->id,
		    rule->rulenum, bucket);
		if (s == NULL) {
			/*
			 * Exit from the critical section because
			 * dyn_add_ipv6_parent() can acquire the bucket
			 * mutex.
			 */
			DYNSTATE_CRITICAL_EXIT();

			s = dyn_add_ipv6_parent(rule, rule->id,
			    rule->rulenum, pkt, zoneid, hashval, version,
			    kidx);
			if (s == NULL)
				return (NULL);
			/* Now we are in the critical section again. */
		}
		ret = s;
		p = s->limit;
	}
#endif
	else {
		DYNSTATE_CRITICAL_EXIT();
		return (NULL);
	}

	/* Check the limit */
	if (DPARENT_COUNT(p) >= limit) {
		DYNSTATE_CRITICAL_EXIT();
		if (V_fw_verbose && last_log != time_uptime) {
			last_log = time_uptime;
			snprintf(sbuf, sizeof(sbuf), "%u drop session",
			    rule->rulenum);
			print_dyn_rule_flags(pkt, O_LIMIT,
			    LOG_SECURITY | LOG_DEBUG, sbuf,
			    "too many entries");
		}
		return (NULL);
	}

	/* Take new session into account. */
	DPARENT_COUNT_INC(p);
	/*
	 * We must exit from the critical section because the following
	 * code can acquire the bucket mutex.
	 * We rely on the 'count' field: the state will not expire while
	 * it has child states, i.e. while the 'count' field is not zero.
	 * Return the state pointer; child states will use it as their
	 * parent.
	 */
	DYNSTATE_CRITICAL_EXIT();
	return (ret);
}

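/*
 * Note on the lifetime rule above: once DPARENT_COUNT_INC() has run,
 * the parent state cannot be reclaimed even though we leave the
 * critical section, because both the expiration and the free paths
 * skip O_LIMIT_PARENT entries whose 'count' is not zero.  The caller
 * is expected to balance the accounting (hypothetical helper name,
 * illustrative only):
 *
 *	parent = dyn_get_parent_state(...);	// 'count' incremented
 *	if (parent == NULL)
 *		return (EACCES);		// limit reached
 *	if (create_child_state(parent) != 0)	// e.g. dyn_add_ipv4_state()
 *		DPARENT_COUNT_DEC(
 *		    ((struct dyn_ipv4_state *)parent)->limit);
 *
 * i.e. every successful call must end up either with a child state
 * that decrements 'count' on removal, or with an explicit revert.
 */
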
static int
dyn_install_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    uint16_t fibnum, const void *ulp, int pktlen, struct ip_fw *rule,
    struct ipfw_dyn_info *info, uint32_t limit, uint16_t limit_mask,
    uint32_t kidx, uint8_t type)
{
	struct ipfw_flow_id id;
	uint32_t hashval, parent_hashval, ruleid, rulenum;
	int ret;

	MPASS(type == O_LIMIT || type == O_KEEP_STATE);

	ruleid = rule->id;
	rulenum = rule->rulenum;
	if (type == O_LIMIT) {
		/* Create masked flow id and calculate bucket */
		id.addr_type = pkt->addr_type;
		id.proto = pkt->proto;
		id.fib = fibnum; /* unused */
		id.src_port = (limit_mask & DYN_SRC_PORT) ?
		    pkt->src_port: 0;
		id.dst_port = (limit_mask & DYN_DST_PORT) ?
		    pkt->dst_port: 0;
		if (IS_IP4_FLOW_ID(pkt)) {
			id.src_ip = (limit_mask & DYN_SRC_ADDR) ?
			    pkt->src_ip: 0;
			id.dst_ip = (limit_mask & DYN_DST_ADDR) ?
			    pkt->dst_ip: 0;
		}
#ifdef INET6
		else if (IS_IP6_FLOW_ID(pkt)) {
			if (limit_mask & DYN_SRC_ADDR)
				id.src_ip6 = pkt->src_ip6;
			else
				memset(&id.src_ip6, 0, sizeof(id.src_ip6));
			if (limit_mask & DYN_DST_ADDR)
				id.dst_ip6 = pkt->dst_ip6;
			else
				memset(&id.dst_ip6, 0, sizeof(id.dst_ip6));
		}
#endif
		else
			return (EAFNOSUPPORT);

		parent_hashval = hash_parent(&id, rule);
		rule = dyn_get_parent_state(&id, zoneid, rule, parent_hashval,
		    limit, kidx);
		if (rule == NULL) {
#if 0
			if (V_fw_verbose && last_log != time_uptime) {
				last_log = time_uptime;
				snprintf(sbuf, sizeof(sbuf),
				    "%u drop session", rule->rulenum);
				print_dyn_rule_flags(pkt, O_LIMIT,
				    LOG_SECURITY | LOG_DEBUG, sbuf,
				    "too many entries");
			}
#endif
			return (EACCES);
		}
		/*
		 * The limit is not reached, create a new state.
		 * Now 'rule' points to the parent state.
		 */
	}

	hashval = hash_packet(pkt);
	if (IS_IP4_FLOW_ID(pkt))
		ret = dyn_add_ipv4_state(rule, ruleid, rulenum, pkt,
		    ulp, pktlen, hashval, info, fibnum, kidx, type);
#ifdef INET6
	else if (IS_IP6_FLOW_ID(pkt))
		ret = dyn_add_ipv6_state(rule, ruleid, rulenum, pkt,
		    zoneid, ulp, pktlen, hashval, info, fibnum, kidx, type);
#endif /* INET6 */
	else
		ret = EAFNOSUPPORT;

	if (type == O_LIMIT) {
		if (ret != 0) {
			/*
			 * We failed to create a child state for the O_LIMIT
			 * opcode.  Since we already counted it in the
			 * parent, we must revert the counter.  'rule' points
			 * to the parent state, use it to get the dyn_parent.
			 *
			 * XXXAE: it should be safe to use the 'rule' pointer
			 * without an extra lookup, the parent state is
			 * referenced and should not be freed.
			 */
			if (IS_IP4_FLOW_ID(&id))
				DPARENT_COUNT_DEC(
				    ((struct dyn_ipv4_state *)rule)->limit);
#ifdef INET6
			else if (IS_IP6_FLOW_ID(&id))
				DPARENT_COUNT_DEC(
				    ((struct dyn_ipv6_state *)rule)->limit);
#endif
		}
	}
	/*
	 * EEXIST means that a concurrent thread has created this
	 * state.  Consider this a success.
	 *
	 * XXXAE: should we invalidate 'info' content here?
	 */
	if (ret == EEXIST)
		return (0);
	return (ret);
}

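/*
 * Example of the masked flow id built above: for a rule like
 * "allow tcp from any to any setup limit src-addr 10" only
 * DYN_SRC_ADDR is set in limit_mask, so a packet
 * 192.0.2.1:12345 -> 198.51.100.2:80 produces the parent key
 *
 *	id.src_ip   = 192.0.2.1		(kept)
 *	id.dst_ip   = 0			(masked out)
 *	id.src_port = 0			(masked out)
 *	id.dst_port = 0			(masked out)
 *
 * so all sessions from 192.0.2.1 share one O_LIMIT_PARENT state, and
 * its 'count' is compared against the limit of 10 connections.
 */
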
/*
 * Install dynamic state.
 *  chain - ipfw's instance;
 *  rule - the parent rule that installs the state;
 *  cmd - opcode that installs the state;
 *  args - ipfw arguments;
 *  ulp - upper level protocol header;
 *  pktlen - packet length;
 *  info - dynamic state lookup info;
 *  tablearg - tablearg id.
 *
 * Returns non-zero value (failure) if state is not installed because
 * of errors or because session limitations are enforced.
 */
int
ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
    const ipfw_insn_limit *cmd, const struct ip_fw_args *args,
    const void *ulp, int pktlen, struct ipfw_dyn_info *info,
    uint32_t tablearg)
{
	uint32_t limit;
	uint16_t limit_mask;

	if (cmd->o.opcode == O_LIMIT) {
		limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit);
		limit_mask = cmd->limit_mask;
	} else {
		limit = 0;
		limit_mask = 0;
	}
	/*
	 * NOTE: we assume that the kidx field of struct ipfw_insn_kidx
	 * is located at the same offset as the kidx field of
	 * ipfw_insn_limit.
	 */
	return (dyn_install_state(&args->f_id,
#ifdef INET6
	    IS_IP6_FLOW_ID(&args->f_id) ? dyn_getscopeid(args):
#endif
	    0, M_GETFIB(args->m), ulp, pktlen, rule, info, limit,
	    limit_mask, cmd->kidx, cmd->o.opcode));
}

/*
 * Free the state entries that are safe to remove from the expired
 * lists.
 */
static void
dyn_free_states(struct ip_fw_chain *chain)
{
	struct dyn_ipv4_state *s4, *s4n;
#ifdef INET6
	struct dyn_ipv6_state *s6, *s6n;
#endif
	int cached_count, i;

	/*
	 * We keep pointers to objects that are in use on each CPU
	 * in the per-cpu dyn_hp pointer.  When an object is about to be
	 * removed, it is first unlinked from the corresponding list,
	 * which bumps the dyn_bucket_xxx_delver version.  Unlinked
	 * objects are placed on the corresponding dyn_expired_xxx list.
	 * A reader that is going to dereference an object pointer checks
	 * the dyn_bucket_xxx_delver version before and after storing the
	 * pointer into dyn_hp.  If the version is the same, the object
	 * is protected from freeing and it is safe to dereference.
	 * Otherwise the reader restarts the list iteration from the
	 * beginning, where the object, being unlinked by now, is no
	 * longer reachable.
	 *
	 * Copy the dyn_hp pointer of each CPU into the dyn_hp_cache
	 * array.  It does not matter that some pointers may change while
	 * we are copying: we only need to check that objects removed in
	 * the previous pass are not in use.  If a dyn_hp pointer does
	 * not contain such an object at the time we copy it, it cannot
	 * appear there later, because the object is already unlinked.
	 * And for newly stored pointers we will not free objects that
	 * are only going to be unlinked in this pass.
	 */
	cached_count = 0;
	CPU_FOREACH(i) {
		dyn_hp_cache[cached_count] = DYNSTATE_GET(i);
		if (dyn_hp_cache[cached_count] != NULL)
			cached_count++;
	}

	/*
	 * Free expired states that are safe to free.
	 * Check each entry from the previous pass in the dyn_expired_xxx
	 * list; if the pointer to the object is in the dyn_hp_cache
	 * array, keep it until the next pass.  Otherwise it is safe to
	 * free the object.
	 *
	 * XXXAE: optimize this to use SLIST_REMOVE_AFTER.
	 */
#define	DYN_FREE_STATES(s, next, name)		do {			\
	s = SLIST_FIRST(&V_dyn_expired_ ## name);			\
	while (s != NULL) {						\
		next = SLIST_NEXT(s, expired);				\
		for (i = 0; i < cached_count; i++)			\
			if (dyn_hp_cache[i] == s)			\
				break;					\
		if (i == cached_count) {				\
			if (s->type == O_LIMIT_PARENT &&		\
			    s->limit->count != 0) {			\
				s = next;				\
				continue;				\
			}						\
			SLIST_REMOVE(&V_dyn_expired_ ## name,		\
			    s, dyn_ ## name ## _state, expired);	\
			if (s->type == O_LIMIT_PARENT)			\
				uma_zfree(V_dyn_parent_zone, s->limit);	\
			else						\
				uma_zfree(V_dyn_data_zone, s->data);	\
			uma_zfree(V_dyn_ ## name ## _zone, s);		\
		}							\
		s = next;						\
	}								\
} while (0)

	/*
	 * Protect access to the expired lists with DYN_EXPIRED_LOCK.
	 * Userland can invoke ipfw_expire_dyn_states() to delete
	 * specific states, and this leads to modification of the
	 * expired lists.
	 *
	 * XXXAE: do we need DYN_EXPIRED_LOCK?  We could just use
	 * IPFW_UH_WLOCK to protect access to these lists.
	 */
	DYN_EXPIRED_LOCK();
	DYN_FREE_STATES(s4, s4n, ipv4);
#ifdef INET6
	DYN_FREE_STATES(s6, s6n, ipv6);
#endif
	DYN_EXPIRED_UNLOCK();
#undef	DYN_FREE_STATES
}

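/*
 * The scheme described above is a single-slot hazard pointer.  A
 * reader-side sketch of the protocol (hypothetical names, illustrative
 * only):
 *
 *	restart:
 *		ver = read_del_version(bucket);
 *		s = find(bucket, key);		// lockless CK_SLIST walk
 *		publish_hazard_ptr(s);		// this CPU's dyn_hp slot
 *		if (ver != read_del_version(bucket)) {
 *			publish_hazard_ptr(NULL);	// deletion raced
 *			goto restart;
 *		}
 *		... dereference s safely ...
 *		publish_hazard_ptr(NULL);
 *
 * The reclaimer (this function) frees an unlinked object only when no
 * CPU's hazard pointer slot holds its address anymore.
 */
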
/*
 * Returns:
 *  0 when the state is not matched by the specified range;
 *  1 when the state is matched by the specified range;
 *  2 when the state is matched by the specified range and deletion of
 *    dynamic states was requested.
 */
static int
dyn_match_range(uint32_t rulenum, uint8_t set, const ipfw_range_tlv *rt)
{

	MPASS(rt != NULL);
	/* flush all states */
	if (rt->flags & IPFW_RCFLAG_ALL) {
		if (rt->flags & IPFW_RCFLAG_DYNAMIC)
			return (2); /* forced */
		return (1);
	}
	if ((rt->flags & IPFW_RCFLAG_SET) != 0 && set != rt->set)
		return (0);
	if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 &&
	    (rulenum < rt->start_rule || rulenum > rt->end_rule))
		return (0);
	if (rt->flags & IPFW_RCFLAG_DYNAMIC)
		return (2);
	return (1);
}

static void
dyn_acquire_rule(struct ip_fw_chain *ch, struct dyn_data *data,
    struct ip_fw *rule, uint32_t kidx)
{
	struct dyn_state_obj *obj;

	/*
	 * Do not acquire the reference twice.
	 * This can happen when rule deletion is executed for the same
	 * range, but with a different ruleset id.
	 */
	if (data->flags & DYN_REFERENCED)
		return;

	IPFW_UH_WLOCK_ASSERT(ch);
	MPASS(kidx != 0);

	data->flags |= DYN_REFERENCED;
	/* Reference the named object */
	obj = SRV_OBJECT(ch, kidx);
	obj->no.refcnt++;
	MPASS(obj->no.etlv == IPFW_TLV_STATE_NAME);

	/* Reference the parent rule */
	rule->refcnt++;
}

static void
dyn_release_rule(struct ip_fw_chain *ch, struct dyn_data *data,
    struct ip_fw *rule, uint32_t kidx)
{
	struct dyn_state_obj *obj;

	IPFW_UH_WLOCK_ASSERT(ch);
	MPASS(kidx != 0);

	obj = SRV_OBJECT(ch, kidx);
	if (obj->no.refcnt == 1)
		dyn_destroy(ch, &obj->no);
	else
		obj->no.refcnt--;

	if (--rule->refcnt == 1)
		ipfw_free_rule(rule);
}

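/*
 * dyn_acquire_rule() and dyn_release_rule() implement the
 * V_dyn_keep_states behavior: when a matching rule is deleted while
 * keeping of states is enabled, the state takes a reference on the
 * rule and on its named object instead of being destroyed with the
 * rule.  Such a state is exported to userland with the
 * IPFW_DYN_ORPHANED flag (see dyn_export_data() below) and keeps
 * using the cached rule until it expires, at which point
 * dyn_release_rule() drops both references.
 */
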
/*
 * We do not keep O_LIMIT_PARENT states when V_dyn_keep_states is
 * enabled.  An O_LIMIT state is created when a new connection is about
 * to be established and there is no matching state.  So, since the old
 * parent rule was deleted, we can not create new states with the old
 * parent, and therefore we can not account for new connections
 * together with the already established ones and can not do proper
 * limiting.
 */
static int
dyn_match_ipv4_state(struct ip_fw_chain *ch, struct dyn_ipv4_state *s,
    const ipfw_range_tlv *rt)
{
	struct ip_fw *rule;
	int ret;

	if (s->type == O_LIMIT_PARENT) {
		rule = s->limit->parent;
		return (dyn_match_range(s->limit->rulenum, rule->set, rt));
	}

	rule = s->data->parent;
	if (s->type == O_LIMIT)
		rule = ((struct dyn_ipv4_state *)rule)->limit->parent;

	ret = dyn_match_range(s->data->rulenum, rule->set, rt);
	if (ret == 0 || V_dyn_keep_states == 0 || ret > 1)
		return (ret);

	dyn_acquire_rule(ch, s->data, rule, s->kidx);
	return (0);
}

#ifdef INET6
static int
dyn_match_ipv6_state(struct ip_fw_chain *ch, struct dyn_ipv6_state *s,
    const ipfw_range_tlv *rt)
{
	struct ip_fw *rule;
	int ret;

	if (s->type == O_LIMIT_PARENT) {
		rule = s->limit->parent;
		return (dyn_match_range(s->limit->rulenum, rule->set, rt));
	}

	rule = s->data->parent;
	if (s->type == O_LIMIT)
		rule = ((struct dyn_ipv6_state *)rule)->limit->parent;

	ret = dyn_match_range(s->data->rulenum, rule->set, rt);
	if (ret == 0 || V_dyn_keep_states == 0 || ret > 1)
		return (ret);

	dyn_acquire_rule(ch, s->data, rule, s->kidx);
	return (0);
}
#endif

/*
 * Unlink expired entries from the states lists.
 * @rt can be used to specify the range of states for deletion.
 */
static void
dyn_expire_states(struct ip_fw_chain *ch, ipfw_range_tlv *rt)
{
	struct dyn_ipv4_slist expired_ipv4;
#ifdef INET6
	struct dyn_ipv6_slist expired_ipv6;
	struct dyn_ipv6_state *s6, *s6n, *s6p;
#endif
	struct dyn_ipv4_state *s4, *s4n, *s4p;
	void *rule;
	int bucket, removed, length, max_length;

	IPFW_UH_WLOCK_ASSERT(ch);

	/*
	 * Unlink expired states from each bucket.
	 * With the bucket lock held, iterate over the entries of each of
	 * the lists: ipv4, ipv4_parent, ipv6, and ipv6_parent.  Check
	 * the expire time, unlink the entry from its list, link it into
	 * the temporary expired_xxx list and then bump the "del" bucket
	 * version.
	 *
	 * When an entry is removed, the corresponding states counter is
	 * decremented.  If the entry has the O_LIMIT type, the parent's
	 * reference counter is decremented.
	 *
	 * NOTE: this function can be called from userspace context
	 * when the user deletes rules.  In this case all matched states
	 * will be forcibly unlinked.  O_LIMIT_PARENT states will be kept
	 * in the expired lists until their reference counter becomes
	 * zero.
	 */
#define	DYN_UNLINK_STATES(s, prev, next, exp, af, name, extra)	do {	\
	length = 0;							\
	removed = 0;							\
	prev = NULL;							\
	s = CK_SLIST_FIRST(&V_dyn_ ## name [bucket]);			\
	while (s != NULL) {						\
		next = CK_SLIST_NEXT(s, entry);				\
		if ((TIME_LEQ((s)->exp, time_uptime) && extra) ||	\
		    (rt != NULL &&					\
		     dyn_match_ ## af ## _state(ch, s, rt))) {		\
			if (prev != NULL)				\
				CK_SLIST_REMOVE_AFTER(prev, entry);	\
			else						\
				CK_SLIST_REMOVE_HEAD(			\
				    &V_dyn_ ## name [bucket], entry);	\
			removed++;					\
			SLIST_INSERT_HEAD(&expired_ ## af, s, expired);	\
			if (s->type == O_LIMIT_PARENT)			\
				DYN_COUNT_DEC(dyn_parent_count);	\
			else {						\
				DYN_COUNT_DEC(dyn_count);		\
				if (s->data->flags & DYN_REFERENCED) {	\
					rule = s->data->parent;		\
					if (s->type == O_LIMIT)		\
						rule = ((__typeof(s))	\
						    rule)->limit->parent;\
					dyn_release_rule(ch, s->data,	\
					    rule, s->kidx);		\
				}					\
				if (s->type == O_LIMIT) {		\
					s = s->data->parent;		\
					DPARENT_COUNT_DEC(s->limit);	\
				}					\
			}						\
		} else {						\
			prev = s;					\
			length++;					\
		}							\
		s = next;						\
	}								\
	if (removed != 0)						\
		DYN_BUCKET_VERSION_BUMP(bucket, name ## _del);		\
	if (length > max_length)					\
		max_length = length;					\
} while (0)

	SLIST_INIT(&expired_ipv4);
#ifdef INET6
	SLIST_INIT(&expired_ipv6);
#endif
	max_length = 0;
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_BUCKET_LOCK(bucket);
		DYN_UNLINK_STATES(s4, s4p, s4n, data->expire, ipv4, ipv4, 1);
		DYN_UNLINK_STATES(s4, s4p, s4n, limit->expire, ipv4,
		    ipv4_parent, (s4->limit->count == 0));
#ifdef INET6
		DYN_UNLINK_STATES(s6, s6p, s6n, data->expire, ipv6, ipv6, 1);
		DYN_UNLINK_STATES(s6, s6p, s6n, limit->expire, ipv6,
		    ipv6_parent, (s6->limit->count == 0));
#endif
		DYN_BUCKET_UNLOCK(bucket);
	}
	/* Update curr_max_length for statistics. */
	V_curr_max_length = max_length;
	/*
	 * Concatenate temporary lists with global expired lists.
	 */
	DYN_EXPIRED_LOCK();
	SLIST_CONCAT(&V_dyn_expired_ipv4, &expired_ipv4,
	    dyn_ipv4_state, expired);
#ifdef INET6
	SLIST_CONCAT(&V_dyn_expired_ipv6, &expired_ipv6,
	    dyn_ipv6_state, expired);
#endif
	DYN_EXPIRED_UNLOCK();
#undef	DYN_UNLINK_STATES
#undef	DYN_UNREF_STATES
}

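/*
 * TIME_LEQ() is used for all expire comparisons above so that the
 * result stays correct even if the time counter wraps.  Assuming the
 * usual signed-difference definition,
 *
 *	#define TIME_LEQ(a, b)	((int)((a) - (b)) <= 0)
 *
 * a worked example with 32-bit values: a = 0xfffffff0 (just before a
 * wrap) and b = 0x00000010 (just after it) give (int)(a - b) =
 * (int)0xffffffe0 = -32 <= 0, so 'a' is still correctly treated as
 * not later than 'b'.
 */
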
static struct mbuf *
dyn_mgethdr(int len, uint16_t fibnum)
{
	struct mbuf *m;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
#ifdef MAC
	mac_netinet_firewall_send(m);
#endif
	M_SETFIB(m, fibnum);
	m->m_data += max_linkhdr;
	m->m_flags |= M_SKIP_FIREWALL;
	m->m_len = m->m_pkthdr.len = len;
	bzero(m->m_data, len);
	return (m);
}

static void
dyn_make_keepalive_ipv4(struct mbuf *m, in_addr_t src, in_addr_t dst,
    uint32_t seq, uint32_t ack, uint16_t sport, uint16_t dport)
{
	struct tcphdr *tcp;
	struct ip *ip;

	ip = mtod(m, struct ip *);
	ip->ip_v = 4;
	ip->ip_hl = sizeof(*ip) >> 2;
	ip->ip_tos = IPTOS_LOWDELAY;
	ip->ip_len = htons(m->m_len);
	ip->ip_off |= htons(IP_DF);
	ip->ip_ttl = V_ip_defttl;
	ip->ip_p = IPPROTO_TCP;
	ip->ip_src.s_addr = htonl(src);
	ip->ip_dst.s_addr = htonl(dst);

	tcp = mtodo(m, sizeof(struct ip));
	tcp->th_sport = htons(sport);
	tcp->th_dport = htons(dport);
	tcp->th_off = sizeof(struct tcphdr) >> 2;
	tcp->th_seq = htonl(seq);
	tcp->th_ack = htonl(ack);
	tcp_set_flags(tcp, TH_ACK);
	tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htons(sizeof(struct tcphdr) + IPPROTO_TCP));

	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	m->m_pkthdr.csum_flags = CSUM_TCP;
}

static void
dyn_enqueue_keepalive_ipv4(struct mbufq *q, const struct dyn_ipv4_state *s)
{
	struct mbuf *m;

	if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) {
		m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr),
		    s->data->fibnum);
		if (m != NULL) {
			dyn_make_keepalive_ipv4(m, s->dst, s->src,
			    s->data->ack_fwd - 1, s->data->ack_rev,
			    s->dport, s->sport);
			if (mbufq_enqueue(q, m)) {
				m_freem(m);
				log(LOG_DEBUG, "ipfw: limit for IPv4 "
				    "keepalive queue is reached.\n");
				return;
			}
		}
	}

	if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) {
		m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr),
		    s->data->fibnum);
		if (m != NULL) {
			dyn_make_keepalive_ipv4(m, s->src, s->dst,
			    s->data->ack_rev - 1, s->data->ack_fwd,
			    s->sport, s->dport);
			if (mbufq_enqueue(q, m)) {
				m_freem(m);
				log(LOG_DEBUG, "ipfw: limit for IPv4 "
				    "keepalive queue is reached.\n");
				return;
			}
		}
	}
}

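/*
 * Note that each keepalive above is built with th_seq set to the last
 * seen acknowledgment number minus one (ack_fwd - 1 or ack_rev - 1).
 * This is the classic TCP keepalive probe: the segment carries a
 * sequence number the peer has already acknowledged, so a live peer
 * answers with a duplicate ACK that refreshes the state, while the
 * data stream itself is not disturbed.  The forward probe is sent
 * from (dst, dport) to (src, sport), i.e. it imitates a segment from
 * the responder to the originator, and the reverse probe goes the
 * other way.
 */
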
/*
 * Prepare and send keep-alive packets.
 */
static void
dyn_send_keepalive_ipv4(struct ip_fw_chain *chain)
{
	struct mbufq q;
	struct mbuf *m;
	struct dyn_ipv4_state *s;
	uint32_t bucket;

	mbufq_init(&q, INT_MAX);
	IPFW_UH_RLOCK(chain);
	/*
	 * It is safe to not use a hazard pointer and just do lockless
	 * access to the lists, because state entries can not be deleted
	 * while we hold IPFW_UH_RLOCK.
	 */
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
			/*
			 * Only established TCP connections that will
			 * expire within dyn_keepalive_interval.
			 */
			if (s->proto != IPPROTO_TCP ||
			    (s->data->state & BOTH_SYN) != BOTH_SYN ||
			    TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
				s->data->expire))
				continue;
			dyn_enqueue_keepalive_ipv4(&q, s);
		}
	}
	IPFW_UH_RUNLOCK(chain);
	while ((m = mbufq_dequeue(&q)) != NULL)
		ip_output(m, NULL, NULL, 0, NULL, NULL);
}

#ifdef INET6
static void
dyn_make_keepalive_ipv6(struct mbuf *m, const struct in6_addr *src,
    const struct in6_addr *dst, uint32_t zoneid, uint32_t seq, uint32_t ack,
    uint16_t sport, uint16_t dport)
{
	struct tcphdr *tcp;
	struct ip6_hdr *ip6;

	ip6 = mtod(m, struct ip6_hdr *);
	ip6->ip6_vfc |= IPV6_VERSION;
	ip6->ip6_plen = htons(sizeof(struct tcphdr));
	ip6->ip6_nxt = IPPROTO_TCP;
	ip6->ip6_hlim = IPV6_DEFHLIM;
	ip6->ip6_src = *src;
	if (IN6_IS_ADDR_LINKLOCAL(src))
		ip6->ip6_src.s6_addr16[1] = htons(zoneid & 0xffff);
	ip6->ip6_dst = *dst;
	if (IN6_IS_ADDR_LINKLOCAL(dst))
		ip6->ip6_dst.s6_addr16[1] = htons(zoneid & 0xffff);

	tcp = mtodo(m, sizeof(struct ip6_hdr));
	tcp->th_sport = htons(sport);
	tcp->th_dport = htons(dport);
	tcp->th_off = sizeof(struct tcphdr) >> 2;
	tcp->th_seq = htonl(seq);
	tcp->th_ack = htonl(ack);
	tcp_set_flags(tcp, TH_ACK);
	tcp->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr),
	    IPPROTO_TCP, 0);

	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
}

static void
dyn_enqueue_keepalive_ipv6(struct mbufq *q, const struct dyn_ipv6_state *s)
{
	struct mbuf *m;

	if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) {
		m = dyn_mgethdr(sizeof(struct ip6_hdr) +
		    sizeof(struct tcphdr), s->data->fibnum);
		if (m != NULL) {
			dyn_make_keepalive_ipv6(m, &s->dst, &s->src,
			    s->zoneid, s->data->ack_fwd - 1, s->data->ack_rev,
			    s->dport, s->sport);
			if (mbufq_enqueue(q, m)) {
				m_freem(m);
				log(LOG_DEBUG, "ipfw: limit for IPv6 "
				    "keepalive queue is reached.\n");
				return;
			}
		}
	}

	if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) {
		m = dyn_mgethdr(sizeof(struct ip6_hdr) +
		    sizeof(struct tcphdr), s->data->fibnum);
		if (m != NULL) {
			dyn_make_keepalive_ipv6(m, &s->src, &s->dst,
			    s->zoneid, s->data->ack_rev - 1, s->data->ack_fwd,
			    s->sport, s->dport);
			if (mbufq_enqueue(q, m)) {
				m_freem(m);
				log(LOG_DEBUG, "ipfw: limit for IPv6 "
				    "keepalive queue is reached.\n");
				return;
			}
		}
	}
}

static void
dyn_send_keepalive_ipv6(struct ip_fw_chain *chain)
{
	struct mbufq q;
	struct mbuf *m;
	struct dyn_ipv6_state *s;
	uint32_t bucket;

	mbufq_init(&q, INT_MAX);
	IPFW_UH_RLOCK(chain);
	/*
	 * It is safe to not use a hazard pointer and just do lockless
	 * access to the lists, because state entries can not be deleted
	 * while we hold IPFW_UH_RLOCK.
	 */
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
			/*
			 * Only established TCP connections that will
			 * expire within dyn_keepalive_interval.
			 */
			if (s->proto != IPPROTO_TCP ||
			    (s->data->state & BOTH_SYN) != BOTH_SYN ||
			    TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
				s->data->expire))
				continue;
			dyn_enqueue_keepalive_ipv6(&q, s);
		}
	}
	IPFW_UH_RUNLOCK(chain);
	while ((m = mbufq_dequeue(&q)) != NULL)
		ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
}
#endif /* INET6 */

static void
dyn_grow_hashtable(struct ip_fw_chain *chain, uint32_t new, int flags)
{
#ifdef INET6
	struct dyn_ipv6ck_slist *ipv6, *ipv6_parent;
	uint32_t *ipv6_add, *ipv6_del, *ipv6_parent_add, *ipv6_parent_del;
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4ck_slist *ipv4, *ipv4_parent;
	uint32_t *ipv4_add, *ipv4_del, *ipv4_parent_add, *ipv4_parent_del;
	struct dyn_ipv4_state *s4;
	struct mtx *bucket_lock;
	void *tmp;
	uint32_t bucket;

	MPASS(powerof2(new));
	DYN_DEBUG("grow hash size %u -> %u", V_curr_dyn_buckets, new);
	/*
	 * Allocate and initialize new lists.
	 */
	bucket_lock = malloc(new * sizeof(struct mtx), M_IPFW,
	    flags | M_ZERO);
	if (bucket_lock == NULL)
		return;

	ipv4 = ipv4_parent = NULL;
	ipv4_add = ipv4_del = ipv4_parent_add = ipv4_parent_del = NULL;
#ifdef INET6
	ipv6 = ipv6_parent = NULL;
	ipv6_add = ipv6_del = ipv6_parent_add = ipv6_parent_del = NULL;
#endif

	ipv4 = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW,
	    flags | M_ZERO);
	if (ipv4 == NULL)
		goto bad;
	ipv4_parent = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW,
	    flags | M_ZERO);
	if (ipv4_parent == NULL)
		goto bad;
	ipv4_add = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO);
	if (ipv4_add == NULL)
		goto bad;
	ipv4_del = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO);
	if (ipv4_del == NULL)
		goto bad;
	ipv4_parent_add = malloc(new * sizeof(uint32_t), M_IPFW,
	    flags | M_ZERO);
	if (ipv4_parent_add == NULL)
		goto bad;
	ipv4_parent_del = malloc(new * sizeof(uint32_t), M_IPFW,
	    flags | M_ZERO);
	if (ipv4_parent_del == NULL)
		goto bad;
#ifdef INET6
	ipv6 = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW,
	    flags | M_ZERO);
	if (ipv6 == NULL)
		goto bad;
	ipv6_parent = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW,
	    flags | M_ZERO);
	if (ipv6_parent == NULL)
		goto bad;
	ipv6_add = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO);
	if (ipv6_add == NULL)
		goto bad;
	ipv6_del = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO);
	if (ipv6_del == NULL)
		goto bad;
	ipv6_parent_add = malloc(new * sizeof(uint32_t), M_IPFW,
	    flags | M_ZERO);
	if (ipv6_parent_add == NULL)
		goto bad;
	ipv6_parent_del = malloc(new * sizeof(uint32_t), M_IPFW,
	    flags | M_ZERO);
	if (ipv6_parent_del == NULL)
		goto bad;
#endif
	for (bucket = 0; bucket < new; bucket++) {
		DYN_BUCKET_LOCK_INIT(bucket_lock, bucket);
		CK_SLIST_INIT(&ipv4[bucket]);
		CK_SLIST_INIT(&ipv4_parent[bucket]);
#ifdef INET6
		CK_SLIST_INIT(&ipv6[bucket]);
		CK_SLIST_INIT(&ipv6_parent[bucket]);
#endif
	}

#define	DYN_RELINK_STATES(s, hval, i, head, ohead)	do {		\
	while ((s = CK_SLIST_FIRST(&V_dyn_ ## ohead[i])) != NULL) {	\
		CK_SLIST_REMOVE_HEAD(&V_dyn_ ## ohead[i], entry);	\
		CK_SLIST_INSERT_HEAD(&head[DYN_BUCKET(s->hval, new)],	\
		    s, entry);						\
	}								\
} while (0)
	/*
	 * Prevent rules changing from userland.
	 */
	IPFW_UH_WLOCK(chain);
	/*
	 * Hold traffic processing until we finish resize to
	 * prevent access to states lists.
	 */
	IPFW_WLOCK(chain);
	/* Re-link all dynamic states */
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_RELINK_STATES(s4, data->hashval, bucket, ipv4, ipv4);
		DYN_RELINK_STATES(s4, limit->hashval, bucket, ipv4_parent,
		    ipv4_parent);
#ifdef INET6
		DYN_RELINK_STATES(s6, data->hashval, bucket, ipv6, ipv6);
		DYN_RELINK_STATES(s6, limit->hashval, bucket, ipv6_parent,
		    ipv6_parent);
#endif
	}

#define	DYN_SWAP_PTR(old, new, tmp)	do {		\
	tmp = old;					\
	old = new;					\
	new = tmp;					\
} while (0)
	/* Swap pointers */
	DYN_SWAP_PTR(V_dyn_bucket_lock, bucket_lock, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4, ipv4, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_parent, ipv4_parent, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_add, ipv4_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_parent_add, ipv4_parent_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_del, ipv4_del, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_parent_del, ipv4_parent_del, tmp);

#ifdef INET6
	DYN_SWAP_PTR(V_dyn_ipv6, ipv6, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_parent, ipv6_parent, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_add, ipv6_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_parent_add, ipv6_parent_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_del, ipv6_del, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_parent_del, ipv6_parent_del, tmp);
#endif
	bucket = V_curr_dyn_buckets;
	V_curr_dyn_buckets = new;

	IPFW_WUNLOCK(chain);
	IPFW_UH_WUNLOCK(chain);

	/* Release old resources */
	while (bucket-- != 0)
		DYN_BUCKET_LOCK_DESTROY(bucket_lock, bucket);
bad:
	free(bucket_lock, M_IPFW);
	free(ipv4, M_IPFW);
	free(ipv4_parent, M_IPFW);
	free(ipv4_add, M_IPFW);
	free(ipv4_parent_add, M_IPFW);
	free(ipv4_del, M_IPFW);
	free(ipv4_parent_del, M_IPFW);
#ifdef INET6
	free(ipv6, M_IPFW);
	free(ipv6_parent, M_IPFW);
	free(ipv6_add, M_IPFW);
	free(ipv6_parent_add, M_IPFW);
	free(ipv6_del, M_IPFW);
	free(ipv6_parent_del, M_IPFW);
#endif
}

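/*
 * The powerof2(new) assertion above is what makes the re-link loop
 * cheap: with a power-of-two table the bucket of a state is derived
 * from its cached hashval by masking alone.  Assuming DYN_BUCKET() is
 * the usual mask-based reduction,
 *
 *	#define DYN_BUCKET(h, b)	((h) & ((b) - 1))
 *
 * a state with hashval 0x5a3c lives in bucket 0x3c of a 256-bucket
 * table and moves to bucket 0x23c after growing to 1024 buckets; no
 * hash is recomputed during the resize.
 */
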
/*
 * This function is used to perform various maintenance
 * on dynamic hash lists.  Currently it is called every second.
 */
static void
dyn_tick(void *vnetx)
{
	struct epoch_tracker et;
	uint32_t buckets;

	CURVNET_SET((struct vnet *)vnetx);
	/*
	 * First free the states unlinked in previous passes.
	 */
	dyn_free_states(&V_layer3_chain);
	/*
	 * Now unlink other expired states.
	 * We use IPFW_UH_WLOCK to avoid a concurrent call of
	 * dyn_expire_states().  It is the only function that deletes
	 * state entries from the states lists.
	 */
	IPFW_UH_WLOCK(&V_layer3_chain);
	dyn_expire_states(&V_layer3_chain, NULL);
	IPFW_UH_WUNLOCK(&V_layer3_chain);
	/*
	 * Send keepalives if they are enabled and the time has come.
	 */
	if (V_dyn_keepalive != 0 &&
	    V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) {
		V_dyn_keepalive_last = time_uptime;
		NET_EPOCH_ENTER(et);
		dyn_send_keepalive_ipv4(&V_layer3_chain);
#ifdef INET6
		dyn_send_keepalive_ipv6(&V_layer3_chain);
#endif
		NET_EPOCH_EXIT(et);
	}
	/*
	 * Check if we need to resize the hash:
	 * if the current number of states exceeds the number of buckets
	 * in the hash, and dyn_buckets_max permits growing the number of
	 * buckets, then do it.  Grow the hash size to the minimum power
	 * of 2 which is bigger than the current states count.
	 */
	if (V_curr_dyn_buckets < V_dyn_buckets_max &&
	    (V_curr_dyn_buckets < V_dyn_count / 2 || (
	    V_curr_dyn_buckets < V_dyn_count && V_curr_max_length > 8))) {
		buckets = 1 << fls(V_dyn_count);
		if (buckets > V_dyn_buckets_max)
			buckets = V_dyn_buckets_max;
		dyn_grow_hashtable(&V_layer3_chain, buckets, M_NOWAIT);
	}

	callout_reset_on(&V_dyn_timeout, hz, dyn_tick, vnetx, 0);
	CURVNET_RESTORE();
}

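/*
 * A worked example of the resize trigger above: with 256 buckets and
 * V_dyn_count = 3000 the first condition holds (256 < 3000 / 2), so
 * the table grows to 1 << fls(3000) = 1 << 12 = 4096 buckets; fls()
 * returns the 1-based index of the most significant set bit, and
 * 3000 = 0xbb8 needs 12 bits.  The result is clamped to
 * V_dyn_buckets_max if necessary.
 */
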
void
ipfw_expire_dyn_states(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
{
	/*
	 * Do not perform any checks if we currently have no dynamic
	 * states.
	 */
	if (V_dyn_count == 0)
		return;

	IPFW_UH_WLOCK_ASSERT(chain);
	dyn_expire_states(chain, rt);
}

/*
 * Pass through all states and reset the eaction for orphaned rules.
 */
void
ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint32_t eaction_id,
    uint32_t default_id, uint32_t instance_id)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	struct ip_fw *rule;
	uint32_t bucket;

#define	DYN_RESET_EACTION(s, h, b)					\
	CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) {			\
		if ((s->data->flags & DYN_REFERENCED) == 0)		\
			continue;					\
		rule = s->data->parent;					\
		if (s->type == O_LIMIT)					\
			rule = ((__typeof(s))rule)->limit->parent;	\
		ipfw_reset_eaction(ch, rule, eaction_id,		\
		    default_id, instance_id);				\
	}

	IPFW_UH_WLOCK_ASSERT(ch);
	if (V_dyn_count == 0)
		return;
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_RESET_EACTION(s4, ipv4, bucket);
#ifdef INET6
		DYN_RESET_EACTION(s6, ipv6, bucket);
#endif
	}
}

/*
 * Returns the size of dynamic states in the legacy format.
 */
int
ipfw_dyn_len(void)
{

	return ((V_dyn_count + V_dyn_parent_count) * sizeof(ipfw_dyn_rule));
}

/*
 * Returns the number of dynamic states.
 * Marks every named object index used by dynamic states with a bit
 * in @bmask.
 * Returns the number of named objects accounted in @bmask via @nocnt.
 * Used by dump format v1 (current).
 */
uint32_t
ipfw_dyn_get_count(uint32_t *bmask, int *nocnt)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	uint32_t bucket;

#define	DYN_COUNT_OBJECTS(s, h, b)					\
	CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) {			\
		MPASS(s->kidx != 0);					\
		if (ipfw_mark_object_kidx(bmask, IPFW_TLV_STATE_NAME,	\
		    s->kidx) != 0)					\
			(*nocnt)++;					\
	}

	IPFW_UH_RLOCK_ASSERT(&V_layer3_chain);

	/* No need to pass through all the buckets. */
	*nocnt = 0;
	if (V_dyn_count + V_dyn_parent_count == 0)
		return (0);

	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_COUNT_OBJECTS(s4, ipv4, bucket);
#ifdef INET6
		DYN_COUNT_OBJECTS(s6, ipv6, bucket);
#endif
	}

	return (V_dyn_count + V_dyn_parent_count);
}

/*
 * Check if rule contains at least one dynamic opcode.
 *
 * Returns 1 if such opcode is found, 0 otherwise.
 */
int
ipfw_is_dyn_rule(struct ip_fw *rule)
{
	int cmdlen, l;
	ipfw_insn *cmd;

	l = rule->cmd_len;
	cmd = rule->cmd;
	cmdlen = 0;
	for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) {
		cmdlen = F_LEN(cmd);

		switch (cmd->opcode) {
		case O_LIMIT:
		case O_KEEP_STATE:
		case O_PROBE_STATE:
		case O_CHECK_STATE:
			return (1);
		}
	}

	return (0);
}

static void
dyn_export_parent(const struct dyn_parent *p, uint32_t kidx, uint8_t set,
    ipfw_dyn_rule *dst)
{

	dst->type = O_LIMIT_PARENT;
	dst->set = set;
	dst->kidx = kidx;
	dst->rulenum = p->rulenum;
	dst->count = DPARENT_COUNT(p);
	dst->expire = TIME_LEQ(p->expire, time_uptime) ? 0:
	    p->expire - time_uptime;
	dst->hashval = p->hashval;

	/* unused fields */
	dst->pad = 0;
	dst->pcnt = 0;
	dst->bcnt = 0;
	dst->ack_fwd = 0;
	dst->ack_rev = 0;
}

static void
dyn_export_data(const struct dyn_data *data, uint32_t kidx, uint8_t type,
    uint8_t set, ipfw_dyn_rule *dst)
{

	dst->type = type;
	dst->set = set;
	dst->kidx = kidx;
	dst->rulenum = data->rulenum;
	dst->pcnt = data->pcnt_fwd + data->pcnt_rev;
	dst->bcnt = data->bcnt_fwd + data->bcnt_rev;
	dst->expire = TIME_LEQ(data->expire, time_uptime) ? 0:
	    data->expire - time_uptime;
	dst->state = data->state;
	if (data->flags & DYN_REFERENCED)
		dst->state |= IPFW_DYN_ORPHANED;

	dst->ack_fwd = data->ack_fwd;
	dst->ack_rev = data->ack_rev;
	dst->hashval = data->hashval;
}

static void
dyn_export_ipv4_state(const struct dyn_ipv4_state *s, ipfw_dyn_rule *dst)
{
	struct ip_fw *rule;

	switch (s->type) {
	case O_LIMIT_PARENT:
		rule = s->limit->parent;
		dyn_export_parent(s->limit, s->kidx, rule->set, dst);
		break;
	default:
		rule = s->data->parent;
		if (s->type == O_LIMIT)
			rule = ((struct dyn_ipv4_state *)rule)->limit->parent;
		dyn_export_data(s->data, s->kidx, s->type, rule->set, dst);
	}

	dst->id.dst_ip = s->dst;
	dst->id.src_ip = s->src;
	dst->id.dst_port = s->dport;
	dst->id.src_port = s->sport;
	dst->id.fib = s->data->fibnum;
	dst->id.proto = s->proto;
	dst->id._flags = 0;
	dst->id.addr_type = 4;

	memset(&dst->id.dst_ip6, 0, sizeof(dst->id.dst_ip6));
	memset(&dst->id.src_ip6, 0, sizeof(dst->id.src_ip6));
	dst->id.flow_id6 = dst->id.extra = 0;
}

#ifdef INET6
static void
dyn_export_ipv6_state(const struct dyn_ipv6_state *s, ipfw_dyn_rule *dst)
{
	struct ip_fw *rule;

	switch (s->type) {
	case O_LIMIT_PARENT:
		rule = s->limit->parent;
		dyn_export_parent(s->limit, s->kidx, rule->set, dst);
		break;
	default:
		rule = s->data->parent;
		if (s->type == O_LIMIT)
			rule = ((struct dyn_ipv6_state *)rule)->limit->parent;
		dyn_export_data(s->data, s->kidx, s->type, rule->set, dst);
	}

	dst->id.src_ip6 = s->src;
	dst->id.dst_ip6 = s->dst;
	dst->id.dst_port = s->dport;
	dst->id.src_port = s->sport;
	dst->id.fib = s->data->fibnum;
	dst->id.proto = s->proto;
	dst->id._flags = 0;
	dst->id.addr_type = 6;

	dst->id.dst_ip = dst->id.src_ip = 0;
	dst->id.flow_id6 = dst->id.extra = 0;
}
#endif /* INET6 */

/*
 * Fills the buffer given by @sd with dynamic states.
 * Used by dump format v1 (current).
 *
 * Returns 0 on success.
 */
int
ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	ipfw_obj_dyntlv *dst, *last;
	ipfw_obj_ctlv *ctlv;
	uint32_t bucket;

	if (V_dyn_count == 0)
		return (0);

	/*
	 * IPFW_UH_RLOCK guarantees that neither another userland request
	 * nor the callout thread will delete entries from the states
	 * lists.
	 */
	IPFW_UH_RLOCK_ASSERT(chain);

	ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
	if (ctlv == NULL)
		return (ENOMEM);
	ctlv->head.type = IPFW_TLV_DYNSTATE_LIST;
	ctlv->objsize = sizeof(ipfw_obj_dyntlv);
	last = NULL;

#define	DYN_EXPORT_STATES(s, af, h, b)					\
	CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) {			\
		dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd,	\
		    sizeof(ipfw_obj_dyntlv));				\
		if (dst == NULL)					\
			return (ENOMEM);				\
		dyn_export_ ## af ## _state(s, &dst->state);		\
		dst->head.length = sizeof(ipfw_obj_dyntlv);		\
		dst->head.type = IPFW_TLV_DYN_ENT;			\
		last = dst;						\
	}

	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket);
		DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket);
#ifdef INET6
		DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket);
		DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket);
#endif /* INET6 */
	}

	/* mark last dynamic rule */
	if (last != NULL)
		last->head.flags = IPFW_DF_LAST; /* XXX: unused */
	return (0);
#undef DYN_EXPORT_STATES
}

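/*
 * A consumer-side sketch of the layout produced above (illustrative
 * only): the buffer starts with an ipfw_obj_ctlv header followed by a
 * packed sequence of ipfw_obj_dyntlv entries, each head.length bytes
 * long, with the final entry flagged IPFW_DF_LAST.  Assuming the whole
 * reply was read into 'buf' of 'len' bytes:
 *
 *	ipfw_obj_ctlv *ctlv = (ipfw_obj_ctlv *)buf;
 *	ipfw_obj_dyntlv *d = (ipfw_obj_dyntlv *)(ctlv + 1);
 *	char *end = buf + len;
 *	while ((char *)d + sizeof(*d) <= end) {
 *		... inspect d->state (an ipfw_dyn_rule) ...
 *		d = (ipfw_obj_dyntlv *)((char *)d + d->head.length);
 *	}
 */
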
void
ipfw_dyn_init(struct ip_fw_chain *chain)
{

#ifdef IPFIREWALL_JENKINSHASH
	V_dyn_hashseed = arc4random();
#endif
	V_dyn_max = 16384;		/* max # of states */
	V_dyn_parent_max = 4096;	/* max # of parent states */
	V_dyn_buckets_max = 8192;	/* must be power of 2 */

	V_dyn_ack_lifetime = 300;
	V_dyn_syn_lifetime = 20;
	V_dyn_fin_lifetime = 1;
	V_dyn_rst_lifetime = 1;
	V_dyn_udp_lifetime = 10;
	V_dyn_short_lifetime = 5;

	V_dyn_keepalive_interval = 20;
	V_dyn_keepalive_period = 5;
	V_dyn_keepalive = 1;		/* send keepalives */
	V_dyn_keepalive_last = time_uptime;

	V_dyn_data_zone = uma_zcreate("IPFW dynamic states data",
	    sizeof(struct dyn_data), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_dyn_data_zone, V_dyn_max);

	V_dyn_parent_zone = uma_zcreate("IPFW parent dynamic states",
	    sizeof(struct dyn_parent), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max);

	SLIST_INIT(&V_dyn_expired_ipv4);
	V_dyn_ipv4 = NULL;
	V_dyn_ipv4_parent = NULL;
	V_dyn_ipv4_zone = uma_zcreate("IPFW IPv4 dynamic states",
	    sizeof(struct dyn_ipv4_state), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

#ifdef INET6
	SLIST_INIT(&V_dyn_expired_ipv6);
	V_dyn_ipv6 = NULL;
	V_dyn_ipv6_parent = NULL;
	V_dyn_ipv6_zone = uma_zcreate("IPFW IPv6 dynamic states",
	    sizeof(struct dyn_ipv6_state), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
#endif

	/* Initialize buckets. */
	V_curr_dyn_buckets = 0;
	V_dyn_bucket_lock = NULL;
	dyn_grow_hashtable(chain, 256, M_WAITOK);

	if (IS_DEFAULT_VNET(curvnet))
		dyn_hp_cache = malloc(mp_ncpus * sizeof(void *), M_IPFW,
		    M_WAITOK | M_ZERO);

	DYN_EXPIRED_LOCK_INIT();
	callout_init(&V_dyn_timeout, 1);
	callout_reset(&V_dyn_timeout, hz, dyn_tick, curvnet);
	IPFW_ADD_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
}

void
ipfw_dyn_uninit(int pass)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	int bucket;

	if (pass == 0) {
		callout_drain(&V_dyn_timeout);
		return;
	}
	IPFW_DEL_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
	DYN_EXPIRED_LOCK_DESTROY();

#define	DYN_FREE_STATES_FORCED(CK, s, af, name, en)	do {		\
	while ((s = CK ## SLIST_FIRST(&V_dyn_ ## name)) != NULL) {	\
		CK ## SLIST_REMOVE_HEAD(&V_dyn_ ## name, en);		\
		if (s->type == O_LIMIT_PARENT)				\
			uma_zfree(V_dyn_parent_zone, s->limit);		\
		else							\
			uma_zfree(V_dyn_data_zone, s->data);		\
		uma_zfree(V_dyn_ ## af ## _zone, s);			\
	}								\
} while (0)
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_BUCKET_LOCK_DESTROY(V_dyn_bucket_lock, bucket);

		DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4[bucket], entry);
		DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4_parent[bucket],
		    entry);
#ifdef INET6
		DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6[bucket], entry);
		DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6_parent[bucket],
		    entry);
#endif /* INET6 */
	}
	DYN_FREE_STATES_FORCED(, s4, ipv4, expired_ipv4, expired);
#ifdef INET6
	DYN_FREE_STATES_FORCED(, s6, ipv6, expired_ipv6, expired);
#endif
#undef DYN_FREE_STATES_FORCED

	uma_zdestroy(V_dyn_ipv4_zone);
	uma_zdestroy(V_dyn_data_zone);
	uma_zdestroy(V_dyn_parent_zone);
#ifdef INET6
	uma_zdestroy(V_dyn_ipv6_zone);
	free(V_dyn_ipv6, M_IPFW);
	free(V_dyn_ipv6_parent, M_IPFW);
	free(V_dyn_ipv6_add, M_IPFW);
	free(V_dyn_ipv6_parent_add, M_IPFW);
	free(V_dyn_ipv6_del, M_IPFW);
	free(V_dyn_ipv6_parent_del, M_IPFW);
#endif
	free(V_dyn_bucket_lock, M_IPFW);
	free(V_dyn_ipv4, M_IPFW);
	free(V_dyn_ipv4_parent, M_IPFW);
	free(V_dyn_ipv4_add, M_IPFW);
	free(V_dyn_ipv4_parent_add, M_IPFW);
	free(V_dyn_ipv4_del, M_IPFW);
	free(V_dyn_ipv4_parent_del, M_IPFW);
	if (IS_DEFAULT_VNET(curvnet))
		free(dyn_hp_cache, M_IPFW);
}