/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2017-2025 Yandex LLC
 * Copyright (c) 2017-2025 Andrey V. Elsukov <ae@FreeBSD.org>
 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipfw.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/pcpu.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>

#include <netinet/ip6.h>	/* IN6_ARE_ADDR_EQUAL */
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#endif

#include <netpfil/ipfw/ip_fw_private.h>

#include <machine/in_cksum.h>	/* XXX for in_cksum */

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

/*
 * Description of dynamic states.
 *
 * Dynamic states are stored in lists accessed through hash tables
 * whose size is curr_dyn_buckets. This value can be modified through
 * the sysctl variable dyn_buckets.
 *
 * Currently there are four tables: dyn_ipv4, dyn_ipv6, dyn_ipv4_parent,
 * and dyn_ipv6_parent.
 *
 * When a packet is received, its address fields are hashed, then matched
 * against the entries in the corresponding list by addr_type.
 * Dynamic states can be used for different purposes:
 *  + stateful rules;
 *  + enforcing limits on the number of sessions;
 *  + in-kernel NAT (not implemented yet)
 *
 * The lifetime of dynamic states is regulated by dyn_*_lifetime,
 * measured in seconds and depending on the flags.
 *
 * The total number of dynamic states is equal to the UMA zone items count.
 * The maximum number of dynamic states is dyn_max. When we reach
 * the maximum number of states we do not create any more. This is
 * done to avoid consuming too much memory, but also too much
 * time when searching on each packet (ideally, we should try instead
 * to put a limit on the length of the list on each bucket...).
 *
 * Each state holds a pointer to the parent ipfw rule so we know what
 * action to perform. Dynamic rules are removed when the parent rule is
 * deleted.
 *
 * There are some limitations with dynamic rules -- we do not
 * obey the 'randomized match', and we do not do multiple
 * passes through the firewall. XXX check the latter!!!
 */
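/*
 * Illustrative sketch (not part of the build): how a single state entry
 * matches both directions of a flow, which is why one bucket holds both.
 * The struct and helper below are simplified stand-ins for
 * dyn_ipv4_state and the real lookup; MATCH_* are the constants used
 * throughout this file.
 */
#if 0
struct flow4 {
	uint32_t	src, dst;
	uint16_t	sport, dport;
};

static int
flow4_match(const struct flow4 *s, const struct flow4 *pkt)
{
	/* Same 5-tuple as installed: forward direction. */
	if (s->src == pkt->src && s->dst == pkt->dst &&
	    s->sport == pkt->sport && s->dport == pkt->dport)
		return (MATCH_FORWARD);
	/* Mirrored 5-tuple: reverse direction, same state entry. */
	if (s->src == pkt->dst && s->dst == pkt->src &&
	    s->sport == pkt->dport && s->dport == pkt->sport)
		return (MATCH_REVERSE);
	return (MATCH_NONE);
}
#endif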
/* By default use jenkins hash function */
#define	IPFIREWALL_JENKINSHASH

/*
 * Update the packets/bytes counters for the given direction (fwd or rev);
 * 'dir' is token-pasted into the field names pcnt_<dir>/bcnt_<dir>.
 */
#define	DYN_COUNTER_INC(d, dir, pktlen)	do {	\
	(d)->pcnt_ ## dir++;			\
	(d)->bcnt_ ## dir += pktlen;		\
	} while (0)

#define	DYN_REFERENCED		0x01
/*
 * The DYN_REFERENCED flag indicates that a state keeps a reference to a
 * named object, and this reference should be released when the state
 * expires.
 */

struct dyn_data {
	void		*parent;	/* pointer to parent rule */
	uint32_t	chain_id;	/* cached ruleset id */
	uint32_t	f_pos;		/* cached rule index */

	uint32_t	hashval;	/* hash value used for hash resize */
	uint16_t	fibnum;		/* fib used to send keepalives */
	uint8_t		_pad;
	uint8_t		flags;		/* internal flags */
	uint32_t	rulenum;	/* parent rule number */
	uint32_t	ruleid;		/* parent rule id */

	uint32_t	state;		/* TCP session state and flags */
	uint32_t	ack_fwd;	/* most recent ACKs in forward */
	uint32_t	ack_rev;	/* and reverse direction (used */
					/* to generate keepalives) */
	uint32_t	sync;		/* synchronization time */
	uint32_t	expire;		/* expire time */

	uint64_t	pcnt_fwd;	/* packets counter in forward */
	uint64_t	bcnt_fwd;	/* bytes counter in forward */
	uint64_t	pcnt_rev;	/* packets counter in reverse */
	uint64_t	bcnt_rev;	/* bytes counter in reverse */
};

#define	DPARENT_COUNT_DEC(p)	do {		\
	MPASS(p->count > 0);			\
	ck_pr_dec_32(&(p)->count);		\
} while (0)
#define	DPARENT_COUNT_INC(p)	ck_pr_inc_32(&(p)->count)
#define	DPARENT_COUNT(p)	ck_pr_load_32(&(p)->count)
struct dyn_parent {
	void		*parent;	/* pointer to parent rule */
	uint32_t	count;		/* number of linked states */
	uint32_t	rulenum;	/* parent rule number */
	uint32_t	ruleid;		/* parent rule id */
	uint32_t	hashval;	/* hash value used for hash resize */
	uint32_t	expire;		/* expire time */
};

struct dyn_ipv4_state {
	uint8_t		type;		/* State type */
	uint8_t		proto;		/* UL Protocol */
	uint16_t	spare;
	uint32_t	kidx;		/* named object index */
	uint16_t	sport, dport;	/* ULP source and destination ports */
	in_addr_t	src, dst;	/* IPv4 source and destination */

	union {
		struct dyn_data		*data;
		struct dyn_parent	*limit;
	};
	CK_SLIST_ENTRY(dyn_ipv4_state)	entry;
	SLIST_ENTRY(dyn_ipv4_state)	expired;
};
CK_SLIST_HEAD(dyn_ipv4ck_slist, dyn_ipv4_state);
VNET_DEFINE_STATIC(struct dyn_ipv4ck_slist *, dyn_ipv4);
VNET_DEFINE_STATIC(struct dyn_ipv4ck_slist *, dyn_ipv4_parent);

SLIST_HEAD(dyn_ipv4_slist, dyn_ipv4_state);
VNET_DEFINE_STATIC(struct dyn_ipv4_slist, dyn_expired_ipv4);
#define	V_dyn_ipv4		VNET(dyn_ipv4)
#define	V_dyn_ipv4_parent	VNET(dyn_ipv4_parent)
#define	V_dyn_expired_ipv4	VNET(dyn_expired_ipv4)

#ifdef INET6
struct dyn_ipv6_state {
	uint8_t		type;		/* State type */
	uint8_t		proto;		/* UL Protocol */
	uint16_t	kidx;		/* named object index */
	uint16_t	sport, dport;	/* ULP source and destination ports */
	struct in6_addr	src, dst;	/* IPv6 source and destination */
	uint32_t	zoneid;		/* IPv6 scope zone id */
	union {
		struct dyn_data		*data;
		struct dyn_parent	*limit;
	};
	CK_SLIST_ENTRY(dyn_ipv6_state)	entry;
	SLIST_ENTRY(dyn_ipv6_state)	expired;
};
CK_SLIST_HEAD(dyn_ipv6ck_slist, dyn_ipv6_state);
VNET_DEFINE_STATIC(struct dyn_ipv6ck_slist *, dyn_ipv6);
VNET_DEFINE_STATIC(struct dyn_ipv6ck_slist *, dyn_ipv6_parent);

SLIST_HEAD(dyn_ipv6_slist, dyn_ipv6_state);
VNET_DEFINE_STATIC(struct dyn_ipv6_slist, dyn_expired_ipv6);
#define	V_dyn_ipv6		VNET(dyn_ipv6)
#define	V_dyn_ipv6_parent	VNET(dyn_ipv6_parent)
#define	V_dyn_expired_ipv6	VNET(dyn_expired_ipv6)
#endif /* INET6 */

/*
 * A per-CPU pointer indicates that the specified state is currently in
 * use and must not be reclaimed by the expiration callout.
 */
static void **dyn_hp_cache;
DPCPU_DEFINE_STATIC(void *, dyn_hp);
#define	DYNSTATE_GET(cpu)	ck_pr_load_ptr(DPCPU_ID_PTR((cpu), dyn_hp))
#define	DYNSTATE_PROTECT(v)	ck_pr_store_ptr(DPCPU_PTR(dyn_hp), (v))
#define	DYNSTATE_RELEASE()	DYNSTATE_PROTECT(NULL)
#define	DYNSTATE_CRITICAL_ENTER()	critical_enter()
#define	DYNSTATE_CRITICAL_EXIT()	do {	\
	DYNSTATE_RELEASE();			\
	critical_exit();			\
} while (0);

/*
 * We keep two version numbers per bucket: one is updated when a new
 * entry is added to the list, the second when an entry is deleted from
 * the list. Versions are updated under the bucket lock.
 *
 * The bucket "add" version number is used to detect that, in the time
 * between a state lookup (i.e. ipfw_dyn_lookup_state()) and the
 * following state creation (i.e. ipfw_dyn_install_state()), no
 * concurrent thread installed a state in this bucket. Using this
 * information we can avoid an additional state lookup, because we are
 * sure that we will not install the state twice.
 *
 * Also, by tracking the bucket "del" version during lookup, we can be
 * sure that a state entry was not unlinked and freed between the time
 * we read the state pointer and the time we protect it with a hazard
 * pointer.
 *
 * An entry unlinked from a CK list remains unchanged until it is freed.
 * Unlinked entries are linked into expired lists using the "expired"
 * field.
 */
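/*
 * Illustrative sketch (not part of the build) of the lookup/install
 * protocol described above: capture the "add" version before the
 * unlocked lookup; at install time, if the version moved, repeat the
 * lookup under the bucket lock before inserting. The variable names
 * here are simplified stand-ins for the real helpers defined below.
 */
#if 0
	version = DYN_BUCKET_VERSION(bucket, ipv4_add);	/* before lookup */
	/* ... unlocked lookup finds no matching state ... */
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv4_add)) {
		/* Someone changed the bucket; re-check under the lock. */
		if (dyn_lookup_ipv4_state_locked(pkt, ulp, pktlen,
		    bucket, kidx) != 0) {
			DYN_BUCKET_UNLOCK(bucket);
			return (EEXIST);
		}
	}
	/* Safe to insert exactly once. */
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4[bucket], s, entry);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_add);
	DYN_BUCKET_UNLOCK(bucket);
#endif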
/*
 * dyn_expire_lock is used to protect access to dyn_expired_xxx lists.
 * dyn_bucket_lock is used to get write access to lists in a specific
 * bucket. Currently one dyn_bucket_lock is used for all ipv4,
 * ipv4_parent, ipv6, and ipv6_parent lists.
 */
VNET_DEFINE_STATIC(struct mtx, dyn_expire_lock);
VNET_DEFINE_STATIC(struct mtx *, dyn_bucket_lock);
#define	V_dyn_expire_lock	VNET(dyn_expire_lock)
#define	V_dyn_bucket_lock	VNET(dyn_bucket_lock)

/*
 * Bucket's add/delete generation versions.
 */
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_add);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_del);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_parent_add);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_parent_del);
#define	V_dyn_ipv4_add		VNET(dyn_ipv4_add)
#define	V_dyn_ipv4_del		VNET(dyn_ipv4_del)
#define	V_dyn_ipv4_parent_add	VNET(dyn_ipv4_parent_add)
#define	V_dyn_ipv4_parent_del	VNET(dyn_ipv4_parent_del)

#ifdef INET6
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_add);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_del);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_parent_add);
VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_parent_del);
#define	V_dyn_ipv6_add		VNET(dyn_ipv6_add)
#define	V_dyn_ipv6_del		VNET(dyn_ipv6_del)
#define	V_dyn_ipv6_parent_add	VNET(dyn_ipv6_parent_add)
#define	V_dyn_ipv6_parent_del	VNET(dyn_ipv6_parent_del)
#endif /* INET6 */

#define	DYN_BUCKET(h, b)		((h) & (b - 1))
#define	DYN_BUCKET_VERSION(b, v)	ck_pr_load_32(&V_dyn_ ## v[(b)])
#define	DYN_BUCKET_VERSION_BUMP(b, v)	ck_pr_inc_32(&V_dyn_ ## v[(b)])

#define	DYN_BUCKET_LOCK_INIT(lock, b)	\
    mtx_init(&lock[(b)], "IPFW dynamic bucket", NULL, MTX_DEF)
#define	DYN_BUCKET_LOCK_DESTROY(lock, b)	mtx_destroy(&lock[(b)])
#define	DYN_BUCKET_LOCK(b)	mtx_lock(&V_dyn_bucket_lock[(b)])
#define	DYN_BUCKET_UNLOCK(b)	mtx_unlock(&V_dyn_bucket_lock[(b)])
#define	DYN_BUCKET_ASSERT(b)	mtx_assert(&V_dyn_bucket_lock[(b)], MA_OWNED)

#define	DYN_EXPIRED_LOCK_INIT()		\
    mtx_init(&V_dyn_expire_lock, "IPFW expired states list", NULL, MTX_DEF)
#define	DYN_EXPIRED_LOCK_DESTROY()	mtx_destroy(&V_dyn_expire_lock)
#define	DYN_EXPIRED_LOCK()		mtx_lock(&V_dyn_expire_lock)
#define	DYN_EXPIRED_UNLOCK()		mtx_unlock(&V_dyn_expire_lock)

VNET_DEFINE_STATIC(uint32_t, dyn_buckets_max);
VNET_DEFINE_STATIC(uint32_t, curr_dyn_buckets);
VNET_DEFINE_STATIC(struct callout, dyn_timeout);
#define	V_dyn_buckets_max	VNET(dyn_buckets_max)
#define	V_curr_dyn_buckets	VNET(curr_dyn_buckets)
#define	V_dyn_timeout		VNET(dyn_timeout)

/* Maximum length of states chain in a bucket */
VNET_DEFINE_STATIC(uint32_t, curr_max_length);
#define	V_curr_max_length	VNET(curr_max_length)

VNET_DEFINE_STATIC(uint32_t, dyn_keep_states);
#define	V_dyn_keep_states	VNET(dyn_keep_states)

VNET_DEFINE_STATIC(uma_zone_t, dyn_data_zone);
VNET_DEFINE_STATIC(uma_zone_t, dyn_parent_zone);
VNET_DEFINE_STATIC(uma_zone_t, dyn_ipv4_zone);
#ifdef INET6
VNET_DEFINE_STATIC(uma_zone_t, dyn_ipv6_zone);
#define	V_dyn_ipv6_zone		VNET(dyn_ipv6_zone)
#endif /* INET6 */
#define	V_dyn_data_zone		VNET(dyn_data_zone)
#define	V_dyn_parent_zone	VNET(dyn_parent_zone)
#define	V_dyn_ipv4_zone		VNET(dyn_ipv4_zone)

/*
 * Timeouts for various events in handling dynamic rules.
 */
VNET_DEFINE_STATIC(uint32_t, dyn_ack_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_syn_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_fin_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_rst_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_udp_lifetime);
VNET_DEFINE_STATIC(uint32_t, dyn_short_lifetime);

#define	V_dyn_ack_lifetime	VNET(dyn_ack_lifetime)
#define	V_dyn_syn_lifetime	VNET(dyn_syn_lifetime)
#define	V_dyn_fin_lifetime	VNET(dyn_fin_lifetime)
#define	V_dyn_rst_lifetime	VNET(dyn_rst_lifetime)
#define	V_dyn_udp_lifetime	VNET(dyn_udp_lifetime)
#define	V_dyn_short_lifetime	VNET(dyn_short_lifetime)

/*
 * Keepalives are sent if dyn_keepalive is set. They are sent every
 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
 * seconds of the lifetime of a rule.
 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
 * than dyn_keepalive_period.
 */
VNET_DEFINE_STATIC(uint32_t, dyn_keepalive_interval);
VNET_DEFINE_STATIC(uint32_t, dyn_keepalive_period);
VNET_DEFINE_STATIC(uint32_t, dyn_keepalive);
VNET_DEFINE_STATIC(time_t, dyn_keepalive_last);

#define	V_dyn_keepalive_interval	VNET(dyn_keepalive_interval)
#define	V_dyn_keepalive_period		VNET(dyn_keepalive_period)
#define	V_dyn_keepalive			VNET(dyn_keepalive)
#define	V_dyn_keepalive_last		VNET(dyn_keepalive_last)

VNET_DEFINE_STATIC(uint32_t, dyn_max);		/* max # of dynamic states */
VNET_DEFINE_STATIC(uint32_t, dyn_count);	/* number of states */
VNET_DEFINE_STATIC(uint32_t, dyn_parent_max);	/* max # of parent states */
VNET_DEFINE_STATIC(uint32_t, dyn_parent_count);	/* number of parent states */

#define	V_dyn_max		VNET(dyn_max)
#define	V_dyn_count		VNET(dyn_count)
#define	V_dyn_parent_max	VNET(dyn_parent_max)
#define	V_dyn_parent_count	VNET(dyn_parent_count)

#define	DYN_COUNT_DEC(name)	do {			\
	MPASS((V_ ## name) > 0);			\
	ck_pr_dec_32(&(V_ ## name));			\
} while (0)
#define	DYN_COUNT_INC(name)	ck_pr_inc_32(&(V_ ## name))
#define	DYN_COUNT(name)		ck_pr_load_32(&(V_ ## name))

static time_t last_log;	/* Log ratelimiting */

/*
 * Get/set maximum number of dynamic states in the given VNET instance.
 */
static int
sysctl_dyn_max(SYSCTL_HANDLER_ARGS)
{
	uint32_t nstates;
	int error;

	nstates = V_dyn_max;
	error = sysctl_handle_32(oidp, &nstates, 0, req);
	/* Read operation or some error */
	if ((error != 0) || (req->newptr == NULL))
		return (error);

	V_dyn_max = nstates;
	uma_zone_set_max(V_dyn_data_zone, V_dyn_max);
	return (0);
}

static int
sysctl_dyn_parent_max(SYSCTL_HANDLER_ARGS)
{
	uint32_t nstates;
	int error;

	nstates = V_dyn_parent_max;
	error = sysctl_handle_32(oidp, &nstates, 0, req);
	/* Read operation or some error */
	if ((error != 0) || (req->newptr == NULL))
		return (error);

	V_dyn_parent_max = nstates;
	uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max);
	return (0);
}

static int
sysctl_dyn_buckets(SYSCTL_HANDLER_ARGS)
{
	uint32_t nbuckets;
	int error;

	nbuckets = V_dyn_buckets_max;
	error = sysctl_handle_32(oidp, &nbuckets, 0, req);
	/* Read operation or some error */
	if ((error != 0) || (req->newptr == NULL))
		return (error);

	if (nbuckets > 256)
		V_dyn_buckets_max = 1 << fls(nbuckets - 1);
	else
		return (EINVAL);
	return (0);
}
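/*
 * Worked example (not part of the build): DYN_BUCKET() masks the hash
 * with (buckets - 1), so the bucket count must be a power of two.
 * sysctl_dyn_buckets() above guarantees this by rounding up with fls();
 * requests of 256 or less are rejected with EINVAL.
 */
#if 0
	MPASS((1 << fls(1000 - 1)) == 1024);	/* 1000 is rounded up */
	MPASS((1 << fls(1024 - 1)) == 1024);	/* power of two unchanged */
#endif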
SYSCTL_DECL(_net_inet_ip_fw);

SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_count,
    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
    "Current number of dynamic states.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_parent_count,
    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_parent_count), 0,
    "Current number of parent states.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
    "Current number of buckets for states hash table.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_max_length,
    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_max_length), 0,
    "Current maximum length of states chains in hash buckets.");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    0, 0, sysctl_dyn_buckets, "IU",
    "Max number of buckets for dynamic states hash table.");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    0, 0, sysctl_dyn_max, "IU",
    "Max number of dynamic states.");
SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_parent_max,
    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    0, 0, sysctl_dyn_parent_max, "IU",
    "Max number of parent dynamic states.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
    "Lifetime of dynamic states for TCP ACK.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
    "Lifetime of dynamic states for TCP SYN.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
    "Lifetime of dynamic states for TCP FIN.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
    "Lifetime of dynamic states for TCP RST.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
    "Lifetime of dynamic states for UDP.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
    "Lifetime of dynamic states for other situations.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
    "Enable keepalives for dynamic states.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keep_states,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0,
    "Do not flush dynamic states on rule deletion");

#ifdef IPFIREWALL_DYNDEBUG
#define	DYN_DEBUG(fmt, ...)	do {			\
	printf("%s: " fmt "\n", __func__, __VA_ARGS__);	\
} while (0)
#else
#define	DYN_DEBUG(fmt, ...)
#endif /* !IPFIREWALL_DYNDEBUG */

#ifdef INET6
/* Functions to work with IPv6 states */
static struct dyn_ipv6_state *dyn_lookup_ipv6_state(
    const struct ipfw_flow_id *, uint32_t, const void *,
    struct ipfw_dyn_info *, int);
static int dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *,
    uint32_t, const void *, int, uint32_t, uint32_t);
static struct dyn_ipv6_state *dyn_alloc_ipv6_state(
    const struct ipfw_flow_id *, uint32_t, uint32_t, uint8_t);
static int dyn_add_ipv6_state(void *, uint32_t, uint32_t,
    const struct ipfw_flow_id *, uint32_t, const void *, int, uint32_t,
    struct ipfw_dyn_info *, uint16_t, uint32_t, uint8_t);
static void dyn_export_ipv6_state(const struct dyn_ipv6_state *,
    ipfw_dyn_rule *);

static uint32_t dyn_getscopeid(const struct ip_fw_args *);
static void dyn_make_keepalive_ipv6(struct mbuf *, const struct in6_addr *,
    const struct in6_addr *, uint32_t, uint32_t, uint32_t, uint16_t,
    uint16_t);
static void dyn_enqueue_keepalive_ipv6(struct mbufq *,
    const struct dyn_ipv6_state *);
static void dyn_send_keepalive_ipv6(struct ip_fw_chain *);

static struct dyn_ipv6_state *dyn_lookup_ipv6_parent(
    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint32_t,
    uint32_t);
static struct dyn_ipv6_state *dyn_lookup_ipv6_parent_locked(
    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint32_t,
    uint32_t);
static struct dyn_ipv6_state *dyn_add_ipv6_parent(void *, uint32_t, uint32_t,
    const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t, uint32_t);
#endif /* INET6 */

/* Functions to work with limit states */
static void *dyn_get_parent_state(const struct ipfw_flow_id *, uint32_t,
    struct ip_fw *, uint32_t, uint32_t, uint32_t);
static struct dyn_ipv4_state *dyn_lookup_ipv4_parent(
    const struct ipfw_flow_id *, const void *, uint32_t, uint32_t, uint32_t);
static struct dyn_ipv4_state *dyn_lookup_ipv4_parent_locked(
    const struct ipfw_flow_id *, const void *, uint32_t, uint32_t, uint32_t);
static struct dyn_parent *dyn_alloc_parent(void *, uint32_t, uint32_t,
    uint32_t);
static struct dyn_ipv4_state *dyn_add_ipv4_parent(void *, uint32_t, uint32_t,
    const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t);

static void dyn_tick(void *);
static void dyn_expire_states(struct ip_fw_chain *, ipfw_range_tlv *);
static void dyn_free_states(struct ip_fw_chain *);
static void dyn_export_parent(const struct dyn_parent *, uint32_t, uint8_t,
    ipfw_dyn_rule *);
static void dyn_export_data(const struct dyn_data *, uint32_t, uint8_t,
    uint8_t, ipfw_dyn_rule *);
static uint32_t dyn_update_tcp_state(struct dyn_data *,
    const struct ipfw_flow_id *, const struct tcphdr *, int);
static void dyn_update_proto_state(struct dyn_data *,
    const struct ipfw_flow_id *, const void *, int, int);
/* Functions to work with IPv4 states */
struct dyn_ipv4_state *dyn_lookup_ipv4_state(const struct ipfw_flow_id *,
    const void *, struct ipfw_dyn_info *, int);
static int dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *,
    const void *, int, uint32_t, uint32_t);
static struct dyn_ipv4_state *dyn_alloc_ipv4_state(
    const struct ipfw_flow_id *, uint32_t, uint8_t);
static int dyn_add_ipv4_state(void *, uint32_t, uint32_t,
    const struct ipfw_flow_id *, const void *, int, uint32_t,
    struct ipfw_dyn_info *, uint16_t, uint32_t, uint8_t);
static void dyn_export_ipv4_state(const struct dyn_ipv4_state *,
    ipfw_dyn_rule *);

/*
 * Named states support.
 */
static char *default_state_name = "default";
struct dyn_state_obj {
	struct named_object	no;
	char			name[64];
};

/*
 * Classifier callback.
 * Return 0 if the opcode contains an object that should be referenced
 * or rewritten.
 */
static int
dyn_classify(ipfw_insn *cmd0, uint32_t *puidx, uint8_t *ptype)
{
	ipfw_insn_kidx *cmd;

	if (F_LEN(cmd0) < 2)
		return (EINVAL);

	/*
	 * NOTE: ipfw_insn_kidx and ipfw_insn_limit have overlapping kidx
	 * fields, so we can use one type to access the kidx field.
	 */
	cmd = insntod(cmd0, kidx);
	DYN_DEBUG("opcode %u, kidx %u", cmd0->opcode, cmd->kidx);
	/* Don't rewrite "check-state any" */
	if (cmd->kidx == 0 &&
	    cmd0->opcode == O_CHECK_STATE)
		return (1);

	*puidx = cmd->kidx;
	*ptype = 0;
	return (0);
}

static void
dyn_update(ipfw_insn *cmd0, uint32_t idx)
{

	insntod(cmd0, kidx)->kidx = idx;
	DYN_DEBUG("opcode %u, kidx %u", cmd0->opcode, idx);
}

static int
dyn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
    struct named_object **pno)
{
	ipfw_obj_ntlv *ntlv;
	const char *name;

	DYN_DEBUG("uidx %u", ti->uidx);
	if (ti->uidx != 0) {
		if (ti->tlvs == NULL)
			return (EINVAL);
		/* Search for the ntlv in the buffer provided by the user */
		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
		    IPFW_TLV_STATE_NAME);
		if (ntlv == NULL)
			return (EINVAL);
		name = ntlv->name;
	} else
		name = default_state_name;
	/*
	 * Search for a named object with the corresponding name.
	 * Since state objects are global, ignore the set value
	 * and use zero instead.
	 */
	*pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0,
	    IPFW_TLV_STATE_NAME, name);
	/*
	 * We always return success here.
	 * The caller will check *pno and mark the object as unresolved,
	 * then it will automatically create the "default" object.
	 */
	return (0);
}
static struct named_object *
dyn_findbykidx(struct ip_fw_chain *ch, uint32_t idx)
{

	DYN_DEBUG("kidx %u", idx);
	return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx));
}

static int
dyn_create(struct ip_fw_chain *ch, struct tid_info *ti,
    uint32_t *pkidx)
{
	struct namedobj_instance *ni;
	struct dyn_state_obj *obj;
	struct named_object *no;
	ipfw_obj_ntlv *ntlv;
	char *name;

	DYN_DEBUG("uidx %u", ti->uidx);
	if (ti->uidx != 0) {
		if (ti->tlvs == NULL)
			return (EINVAL);
		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
		    IPFW_TLV_STATE_NAME);
		if (ntlv == NULL)
			return (EINVAL);
		name = ntlv->name;
	} else
		name = default_state_name;

	ni = CHAIN_TO_SRV(ch);
	obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO);
	obj->no.name = obj->name;
	obj->no.etlv = IPFW_TLV_STATE_NAME;
	strlcpy(obj->name, name, sizeof(obj->name));

	IPFW_UH_WLOCK(ch);
	no = ipfw_objhash_lookup_name_type(ni, 0,
	    IPFW_TLV_STATE_NAME, name);
	if (no != NULL) {
		/*
		 * Object is already created.
		 * Just return its kidx and bump refcount.
		 */
		*pkidx = no->kidx;
		no->refcnt++;
		IPFW_UH_WUNLOCK(ch);
		free(obj, M_IPFW);
		DYN_DEBUG("\tfound kidx %u for name '%s'", *pkidx, no->name);
		return (0);
	}
	if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) {
		DYN_DEBUG("\talloc_idx failed for %s", name);
		IPFW_UH_WUNLOCK(ch);
		free(obj, M_IPFW);
		return (ENOSPC);
	}
	ipfw_objhash_add(ni, &obj->no);
	SRV_OBJECT(ch, obj->no.kidx) = obj;
	obj->no.refcnt++;
	*pkidx = obj->no.kidx;
	IPFW_UH_WUNLOCK(ch);
	DYN_DEBUG("\tcreated kidx %u for name '%s'", *pkidx, name);
	return (0);
}

static void
dyn_destroy(struct ip_fw_chain *ch, struct named_object *no)
{
	struct dyn_state_obj *obj;

	IPFW_UH_WLOCK_ASSERT(ch);

	KASSERT(no->etlv == IPFW_TLV_STATE_NAME,
	    ("%s: wrong object type %u", __func__, no->etlv));
	KASSERT(no->refcnt == 1,
	    ("Destroying object '%s' (type %u, idx %u) with refcnt %u",
	    no->name, no->etlv, no->kidx, no->refcnt));
	DYN_DEBUG("kidx %u", no->kidx);
	obj = SRV_OBJECT(ch, no->kidx);
	SRV_OBJECT(ch, no->kidx) = NULL;
	ipfw_objhash_del(CHAIN_TO_SRV(ch), no);
	ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), no->kidx);

	free(obj, M_IPFW);
}

static struct opcode_obj_rewrite dyn_opcodes[] = {
	{
		O_KEEP_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_CHECK_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_PROBE_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_LIMIT, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
};

/*
 * IMPORTANT: the hash function for dynamic rules must be commutative
 * in source and destination (ip,port), because rules are bidirectional
 * and we want to find both in the same bucket.
 */
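/*
 * Illustrative check (not part of the build): both hash_packet()
 * variants below are commutative. The XOR-based variant is symmetric
 * by construction; the Jenkins variant achieves the same property by
 * ordering the two (address, port) pairs canonically before hashing,
 * so forward and reverse flow ids hash to the same bucket. The values
 * a, b, p, q stand in for arbitrary addresses and ports.
 */
#if 0
	struct ipfw_flow_id fwd = { .addr_type = 4, .src_ip = a,
	    .dst_ip = b, .src_port = p, .dst_port = q };
	struct ipfw_flow_id rev = { .addr_type = 4, .src_ip = b,
	    .dst_ip = a, .src_port = q, .dst_port = p };
	MPASS(hash_packet(&fwd) == hash_packet(&rev));
#endif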
#ifndef IPFIREWALL_JENKINSHASH
static __inline uint32_t
hash_packet(const struct ipfw_flow_id *id)
{
	uint32_t i;

#ifdef INET6
	if (IS_IP6_FLOW_ID(id))
		i = ntohl((id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
		    (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
		    (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
		    (id->src_ip6.__u6_addr.__u6_addr32[3]));
	else
#endif /* INET6 */
		i = (id->dst_ip) ^ (id->src_ip);
	i ^= (id->dst_port) ^ (id->src_port);
	return (i);
}

static __inline uint32_t
hash_parent(const struct ipfw_flow_id *id, const void *rule)
{

	return (hash_packet(id) ^ ((uintptr_t)rule));
}

#else /* IPFIREWALL_JENKINSHASH */

VNET_DEFINE_STATIC(uint32_t, dyn_hashseed);
#define	V_dyn_hashseed	VNET(dyn_hashseed)

static __inline int
addrcmp4(const struct ipfw_flow_id *id)
{

	if (id->src_ip < id->dst_ip)
		return (0);
	if (id->src_ip > id->dst_ip)
		return (1);
	if (id->src_port <= id->dst_port)
		return (0);
	return (1);
}

#ifdef INET6
static __inline int
addrcmp6(const struct ipfw_flow_id *id)
{
	int ret;

	ret = memcmp(&id->src_ip6, &id->dst_ip6, sizeof(struct in6_addr));
	if (ret < 0)
		return (0);
	if (ret > 0)
		return (1);
	if (id->src_port <= id->dst_port)
		return (0);
	return (1);
}

static __inline uint32_t
hash_packet6(const struct ipfw_flow_id *id)
{
	struct tuple6 {
		struct in6_addr	addr[2];
		uint16_t	port[2];
	} t6;

	if (addrcmp6(id) == 0) {
		t6.addr[0] = id->src_ip6;
		t6.addr[1] = id->dst_ip6;
		t6.port[0] = id->src_port;
		t6.port[1] = id->dst_port;
	} else {
		t6.addr[0] = id->dst_ip6;
		t6.addr[1] = id->src_ip6;
		t6.port[0] = id->dst_port;
		t6.port[1] = id->src_port;
	}
	return (jenkins_hash32((const uint32_t *)&t6,
	    sizeof(t6) / sizeof(uint32_t), V_dyn_hashseed));
}
#endif

static __inline uint32_t
hash_packet(const struct ipfw_flow_id *id)
{
	struct tuple4 {
		in_addr_t	addr[2];
		uint16_t	port[2];
	} t4;

	if (IS_IP4_FLOW_ID(id)) {
		/* All fields are in host byte order */
		if (addrcmp4(id) == 0) {
			t4.addr[0] = id->src_ip;
			t4.addr[1] = id->dst_ip;
			t4.port[0] = id->src_port;
			t4.port[1] = id->dst_port;
		} else {
			t4.addr[0] = id->dst_ip;
			t4.addr[1] = id->src_ip;
			t4.port[0] = id->dst_port;
			t4.port[1] = id->src_port;
		}
		return (jenkins_hash32((const uint32_t *)&t4,
		    sizeof(t4) / sizeof(uint32_t), V_dyn_hashseed));
	} else
#ifdef INET6
	if (IS_IP6_FLOW_ID(id))
		return (hash_packet6(id));
#endif
	return (0);
}

static __inline uint32_t
hash_parent(const struct ipfw_flow_id *id, const void *rule)
{

	return (jenkins_hash32((const uint32_t *)&rule,
	    sizeof(rule) / sizeof(uint32_t), hash_packet(id)));
}
#endif /* IPFIREWALL_JENKINSHASH */
/*
 * Print customizable flow id description via log(9) facility.
 */
static void
print_dyn_rule_flags(const struct ipfw_flow_id *id, int dyn_type,
    int log_flags, char *prefix, char *postfix)
{
	struct in_addr da;
#ifdef INET6
	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
#else
	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
#endif

#ifdef INET6
	if (IS_IP6_FLOW_ID(id)) {
		ip6_sprintf(src, &id->src_ip6);
		ip6_sprintf(dst, &id->dst_ip6);
	} else
#endif
	{
		da.s_addr = htonl(id->src_ip);
		inet_ntop(AF_INET, &da, src, sizeof(src));
		da.s_addr = htonl(id->dst_ip);
		inet_ntop(AF_INET, &da, dst, sizeof(dst));
	}
	log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n",
	    prefix, dyn_type, src, id->src_port, dst,
	    id->dst_port, V_dyn_count, postfix);
}

#define	print_dyn_rule(id, dtype, prefix, postfix)	\
	print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix)

#define	TIME_LEQ(a,b)	((int)((a)-(b)) <= 0)
#define	TIME_LE(a,b)	((int)((a)-(b)) < 0)
#define	_SEQ_GE(a,b)	((int)((a)-(b)) >= 0)
#define	BOTH_SYN	(TH_SYN | (TH_SYN << 8))
#define	BOTH_FIN	(TH_FIN | (TH_FIN << 8))
#define	BOTH_RST	(TH_RST | (TH_RST << 8))
#define	TCP_FLAGS	(BOTH_SYN | BOTH_FIN | BOTH_RST)
#define	ACK_FWD		0x00010000	/* fwd ack seen */
#define	ACK_REV		0x00020000	/* rev ack seen */
#define	ACK_BOTH	(ACK_FWD | ACK_REV)
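/*
 * Worked example (not part of the build): TIME_LEQ() and _SEQ_GE()
 * compare times and TCP sequence numbers in modular (wraparound-safe)
 * arithmetic by casting the unsigned difference to a signed int. With
 * a = 0xfffffff0 and b = 0x00000010, (int)(a - b) is negative, so
 * TIME_LEQ(a, b) is true: a is "before" b even though it is
 * numerically larger.
 */
#if 0
	MPASS(TIME_LEQ(0xfffffff0U, 0x00000010U)); /* wrapped, still earlier */
	MPASS(_SEQ_GE(0x00000010U, 0xfffffff0U));  /* the mirror comparison */
#endif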
static uint32_t
dyn_update_tcp_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
    const struct tcphdr *tcp, int dir)
{
	uint32_t ack, expire;
	uint32_t state, old;
	uint8_t th_flags;

	expire = data->expire;
	old = state = data->state;
	th_flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST);
	state |= (dir == MATCH_FORWARD) ? th_flags: (th_flags << 8);
	switch (state & TCP_FLAGS) {
	case TH_SYN:			/* opening */
		expire = time_uptime + V_dyn_syn_lifetime;
		break;

	case BOTH_SYN:			/* move to established */
	case BOTH_SYN | TH_FIN:		/* one side tries to close */
	case BOTH_SYN | (TH_FIN << 8):
		if (tcp == NULL)
			break;
		ack = ntohl(tcp->th_ack);
		if (dir == MATCH_FORWARD) {
			if (data->ack_fwd == 0 ||
			    _SEQ_GE(ack, data->ack_fwd)) {
				state |= ACK_FWD;
				if (data->ack_fwd != ack)
					ck_pr_store_32(&data->ack_fwd, ack);
			}
		} else {
			if (data->ack_rev == 0 ||
			    _SEQ_GE(ack, data->ack_rev)) {
				state |= ACK_REV;
				if (data->ack_rev != ack)
					ck_pr_store_32(&data->ack_rev, ack);
			}
		}
		if ((state & ACK_BOTH) == ACK_BOTH) {
			/*
			 * Set expire time to V_dyn_ack_lifetime only if
			 * we got ACKs for both directions.
			 * We use XOR here to avoid possible state
			 * overwriting by a concurrent thread.
			 */
			expire = time_uptime + V_dyn_ack_lifetime;
			ck_pr_xor_32(&data->state, ACK_BOTH);
		} else if ((data->state & ACK_BOTH) != (state & ACK_BOTH))
			ck_pr_or_32(&data->state, state & ACK_BOTH);
		break;

	case BOTH_SYN | BOTH_FIN:	/* both sides closed */
		if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
			V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
		expire = time_uptime + V_dyn_fin_lifetime;
		break;

	default:
		if (V_dyn_keepalive != 0 &&
		    V_dyn_rst_lifetime >= V_dyn_keepalive_period)
			V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
		expire = time_uptime + V_dyn_rst_lifetime;
	}
	/* Save TCP state if it was changed */
	if ((state & TCP_FLAGS) != (old & TCP_FLAGS))
		ck_pr_or_32(&data->state, state & TCP_FLAGS);
	return (expire);
}

/*
 * Update ULP specific state.
 * For TCP we keep sequence numbers and flags. For other protocols
 * currently we update only the expire time. Packets and bytes counters
 * are also updated here.
 */
static void
dyn_update_proto_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
    const void *ulp, int pktlen, int dir)
{
	uint32_t expire;

	/* NOTE: we are in a critical section here. */
	switch (pkt->proto) {
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
		expire = time_uptime + V_dyn_udp_lifetime;
		break;
	case IPPROTO_TCP:
		expire = dyn_update_tcp_state(data, pkt, ulp, dir);
		break;
	default:
		expire = time_uptime + V_dyn_short_lifetime;
	}
	/*
	 * The expiration timer has per-second granularity, no need to
	 * update it every time the state is matched.
	 */
	if (data->expire != expire)
		ck_pr_store_32(&data->expire, expire);

	if (dir == MATCH_FORWARD)
		DYN_COUNTER_INC(data, fwd, pktlen);
	else
		DYN_COUNTER_INC(data, rev, pktlen);
}

/*
 * Lookup IPv4 state.
 * Must be called in a critical section.
 */
struct dyn_ipv4_state *
dyn_lookup_ipv4_state(const struct ipfw_flow_id *pkt, const void *ulp,
    struct ipfw_dyn_info *info, int pktlen)
{
	struct dyn_ipv4_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
	info->version = DYN_BUCKET_VERSION(bucket, ipv4_add);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv4_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv4_del))
			goto restart;
		if (s->proto != pkt->proto)
			continue;
		if (info->kidx != 0 && s->kidx != info->kidx)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			info->direction = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
			info->direction = MATCH_REVERSE;
			break;
		}
	}

	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
		    info->direction);
	return (s);
}

/*
 * Lookup IPv4 state.
 * A simplified version is used to check that a matching state doesn't
 * exist.
 */
static int
dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *pkt,
    const void *ulp, int pktlen, uint32_t bucket, uint32_t kidx)
{
	struct dyn_ipv4_state *s;
	int dir;

	dir = MATCH_NONE;
	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
		if (s->proto != pkt->proto ||
		    s->kidx != kidx)
			continue;
		if (s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			dir = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
			dir = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
	return (s != NULL);
}

struct dyn_ipv4_state *
dyn_lookup_ipv4_parent(const struct ipfw_flow_id *pkt, const void *rule,
    uint32_t ruleid, uint32_t rulenum, uint32_t hashval)
{
	struct dyn_ipv4_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv4_parent_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_del))
			goto restart;
		/*
		 * NOTE: we do not need to check kidx, because a parent rule
		 * cannot create states with different kidx.
		 * And a parent state is always created for the forward
		 * direction.
		 */
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			if (s->limit->expire != time_uptime +
			    V_dyn_short_lifetime)
				ck_pr_store_32(&s->limit->expire,
				    time_uptime + V_dyn_short_lifetime);
			break;
		}
	}
	return (s);
}

static struct dyn_ipv4_state *
dyn_lookup_ipv4_parent_locked(const struct ipfw_flow_id *pkt,
    const void *rule, uint32_t ruleid, uint32_t rulenum, uint32_t bucket)
{
	struct dyn_ipv4_state *s;

	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) {
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip)
			break;
	}
	return (s);
}

#ifdef INET6
static uint32_t
dyn_getscopeid(const struct ip_fw_args *args)
{

	/*
	 * If the source or destination address is a scoped address, we
	 * need to determine the scope zone id to resolve address scope
	 * ambiguity.
	 */
	if (IN6_IS_ADDR_LINKLOCAL(&args->f_id.src_ip6) ||
	    IN6_IS_ADDR_LINKLOCAL(&args->f_id.dst_ip6))
		return (in6_getscopezone(args->ifp, IPV6_ADDR_SCOPE_LINKLOCAL));

	return (0);
}

/*
 * Lookup IPv6 state.
 * Must be called in a critical section.
 */
static struct dyn_ipv6_state *
dyn_lookup_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, struct ipfw_dyn_info *info, int pktlen)
{
	struct dyn_ipv6_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
	info->version = DYN_BUCKET_VERSION(bucket, ipv6_add);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv6_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv6_del))
			goto restart;
		if (s->proto != pkt->proto || s->zoneid != zoneid)
			continue;
		if (info->kidx != 0 && s->kidx != info->kidx)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			info->direction = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
			info->direction = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
		    info->direction);
	return (s);
}

/*
 * Lookup IPv6 state.
 * A simplified version is used to check that a matching state doesn't
 * exist.
 */
static int
dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, int pktlen, uint32_t bucket, uint32_t kidx)
{
	struct dyn_ipv6_state *s;
	int dir;

	dir = MATCH_NONE;
	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
		if (s->proto != pkt->proto || s->kidx != kidx ||
		    s->zoneid != zoneid)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			dir = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
			dir = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
	return (s != NULL);
}

static struct dyn_ipv6_state *
dyn_lookup_ipv6_parent(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *rule, uint32_t ruleid, uint32_t rulenum, uint32_t hashval)
{
	struct dyn_ipv6_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv6_parent_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_del))
			goto restart;
		/*
		 * NOTE: we do not need to check kidx, because a parent rule
		 * cannot create states with different kidx.
		 * Also, a parent state is always created for the forward
		 * direction.
		 */
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			if (s->limit->expire != time_uptime +
			    V_dyn_short_lifetime)
				ck_pr_store_32(&s->limit->expire,
				    time_uptime + V_dyn_short_lifetime);
			break;
		}
	}
	return (s);
}

static struct dyn_ipv6_state *
dyn_lookup_ipv6_parent_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *rule, uint32_t ruleid, uint32_t rulenum, uint32_t bucket)
{
	struct dyn_ipv6_state *s;

	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6))
			break;
	}
	return (s);
}

#endif /* INET6 */

static int
dyn_handle_orphaned(struct ip_fw *old_rule, struct dyn_data *data)
{
	struct ip_fw *rule;
	const ipfw_insn *cmd, *old_cmd;

	old_cmd = ACTION_PTR(old_rule);
	switch (old_cmd->opcode) {
	case O_SETMARK:
	case O_SKIPTO:
		/*
		 * The rule pointer was changed. For the O_SKIPTO action it
		 * can be dangerous to keep using the old rule. If the new
		 * rule has the same action and the same destination number,
		 * then use this dynamic state. Otherwise it is better to
		 * create a new one.
		 */
		rule = V_layer3_chain.map[data->f_pos];
		cmd = ACTION_PTR(rule);
		if (cmd->opcode != old_cmd->opcode ||
		    cmd->len != old_cmd->len || cmd->arg1 != old_cmd->arg1 ||
		    insntoc(cmd, u32)->d[0] != insntoc(old_cmd, u32)->d[0])
			return (-1);
		break;
	}
	return (0);
}

/*
 * Lookup dynamic state.
 *  pkt - ipfw_flow_id filled in by ipfw_chk();
 *  ulp - upper level protocol header determined by ipfw_chk();
 *  info - information about the matched state, returned to the caller.
 * Returns a pointer to the state's parent rule and fills in info. If
 * there is no state, NULL is returned.
 * On match, ipfw_dyn_lookup_state() updates the state's counters.
 */
struct ip_fw *
ipfw_dyn_lookup_state(const struct ip_fw_args *args, const void *ulp,
    int pktlen, const ipfw_insn *cmd, struct ipfw_dyn_info *info)
{
	struct dyn_data *data;
	struct ip_fw *rule;

	IPFW_RLOCK_ASSERT(&V_layer3_chain);
	MPASS(F_LEN(cmd) >= F_INSN_SIZE(ipfw_insn_kidx));

	data = NULL;
	rule = NULL;
	info->kidx = insntoc(cmd, kidx)->kidx;
	info->direction = MATCH_NONE;
	info->hashval = hash_packet(&args->f_id);

	DYNSTATE_CRITICAL_ENTER();
	if (IS_IP4_FLOW_ID(&args->f_id)) {
		struct dyn_ipv4_state *s;

		s = dyn_lookup_ipv4_state(&args->f_id, ulp, info, pktlen);
		if (s != NULL) {
			/*
			 * Dynamic states are created using the same 5-tuple,
			 * so it is assumed that the parent rule for an
			 * O_LIMIT state has the same address family.
			 */
			data = s->data;
			if (s->type == O_LIMIT) {
				s = data->parent;
				rule = s->limit->parent;
			} else
				rule = data->parent;
		}
	}
#ifdef INET6
	else if (IS_IP6_FLOW_ID(&args->f_id)) {
		struct dyn_ipv6_state *s;

		s = dyn_lookup_ipv6_state(&args->f_id, dyn_getscopeid(args),
		    ulp, info, pktlen);
		if (s != NULL) {
			data = s->data;
			if (s->type == O_LIMIT) {
				s = data->parent;
				rule = s->limit->parent;
			} else
				rule = data->parent;
		}
	}
#endif
	if (data != NULL) {
		/*
		 * If the cached chain id is the same, we can avoid a rule
		 * index lookup. Otherwise do the lookup and update chain_id
		 * and f_pos. It is safe even if there is a concurrent thread
		 * that wants to update the same state, because chain->id can
		 * be changed only under IPFW_WLOCK().
		 */
		if (data->chain_id != V_layer3_chain.id) {
			data->f_pos = ipfw_find_rule(&V_layer3_chain,
			    data->rulenum, data->ruleid);
			/*
			 * Check that the found state has not been orphaned.
			 * While chain->id is being changed, the parent
			 * rule can be deleted. If the found rule doesn't
			 * match the parent pointer, consider this
			 * result as MATCH_NONE and return NULL.
			 *
			 * This will lead to the creation of a new similar
			 * state that will be added into the head of this
			 * bucket. And the state that we currently have
			 * matched should be deleted by dyn_expire_states().
			 *
			 * When dyn_keep_states is enabled, return a
			 * pointer to the deleted rule and an f_pos value
			 * corresponding to the penultimate rule.
			 * With V_dyn_keep_states enabled, states that
			 * become orphaned get the DYN_REFERENCED flag
			 * and the rule is kept around. So we can return
			 * it. But since it is not in the rules map, we
			 * need to return an f_pos value such that, if the
			 * search continues after state handling, the next
			 * rule will be the last one - the default rule.
			 */
			if (V_layer3_chain.map[data->f_pos] == rule) {
				data->chain_id = V_layer3_chain.id;
			} else if (V_dyn_keep_states != 0) {
				/*
				 * The original rule pointer is still usable.
				 * So, we return it, but f_pos needs to be
				 * changed to point to the penultimate rule.
				 */
				MPASS(V_layer3_chain.n_rules > 1);
				if (dyn_handle_orphaned(rule, data) == 0) {
					data->chain_id = V_layer3_chain.id;
					data->f_pos =
					    V_layer3_chain.n_rules - 2;
				} else {
					rule = NULL;
					info->direction = MATCH_NONE;
				}
			} else {
				rule = NULL;
				info->direction = MATCH_NONE;
				DYN_DEBUG("rule %p [%u, %u] is considered "
				    "invalid in data %p", rule, data->ruleid,
				    data->rulenum, data);
				/* info->f_pos doesn't matter here. */
			}
		}
		info->f_pos = data->f_pos;
	}
	DYNSTATE_CRITICAL_EXIT();
#if 0
	/*
	 * Return MATCH_NONE if the parent rule is in a disabled set.
	 * This will lead to the creation of a new similar state that
	 * will be added into the head of this bucket.
	 *
	 * XXXAE: we need to be able to update the state's set when the
	 * parent rule set is changed.
	 */
	if (rule != NULL && (V_set_disable & (1 << rule->set))) {
		rule = NULL;
		info->direction = MATCH_NONE;
	}
#endif
	return (rule);
}

static struct dyn_parent *
dyn_alloc_parent(void *parent, uint32_t ruleid, uint32_t rulenum,
    uint32_t hashval)
{
	struct dyn_parent *limit;

	limit = uma_zalloc(V_dyn_parent_zone, M_NOWAIT | M_ZERO);
	if (limit == NULL) {
		if (last_log != time_uptime) {
			last_log = time_uptime;
			log(LOG_DEBUG,
			    "ipfw: Cannot allocate parent dynamic state, "
			    "consider increasing "
			    "net.inet.ip.fw.dyn_parent_max\n");
		}
		return (NULL);
	}

	limit->parent = parent;
	limit->ruleid = ruleid;
	limit->rulenum = rulenum;
	limit->hashval = hashval;
	limit->expire = time_uptime + V_dyn_short_lifetime;
	return (limit);
}

static struct dyn_data *
dyn_alloc_dyndata(void *parent, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
    uint32_t hashval, uint16_t fibnum)
{
	struct dyn_data *data;

	data = uma_zalloc(V_dyn_data_zone, M_NOWAIT | M_ZERO);
	if (data == NULL) {
		if (last_log != time_uptime) {
			last_log = time_uptime;
			log(LOG_DEBUG,
			    "ipfw: Cannot allocate dynamic state, "
			    "consider increasing net.inet.ip.fw.dyn_max\n");
		}
		return (NULL);
	}

	data->parent = parent;
	data->ruleid = ruleid;
	data->rulenum = rulenum;
	data->fibnum = fibnum;
	data->hashval = hashval;
	data->expire = time_uptime + V_dyn_syn_lifetime;
	dyn_update_proto_state(data, pkt, ulp, pktlen, MATCH_FORWARD);
	return (data);
}

static struct dyn_ipv4_state *
dyn_alloc_ipv4_state(const struct ipfw_flow_id *pkt, uint32_t kidx,
    uint8_t type)
{
	struct dyn_ipv4_state *s;

	s = uma_zalloc(V_dyn_ipv4_zone, M_NOWAIT | M_ZERO);
	if (s == NULL)
		return (NULL);

	s->type = type;
	s->kidx = kidx;
	s->proto = pkt->proto;
	s->sport = pkt->src_port;
	s->dport = pkt->dst_port;
	s->src = pkt->src_ip;
	s->dst = pkt->dst_ip;
	return (s);
}

/*
 * Add IPv4 parent state.
 * Returns a pointer to the parent state. When it is not NULL, we are in
 * a critical section and the pointer is protected by a hazard pointer.
 * When an error occurs, it returns NULL and exiting the critical section
 * is not needed.
 */
static struct dyn_ipv4_state *
dyn_add_ipv4_parent(void *rule, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, uint32_t hashval, uint32_t version,
    uint32_t kidx)
{
	struct dyn_ipv4_state *s;
	struct dyn_parent *limit;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_add)) {
		/*
		 * The bucket version has changed since the last lookup;
		 * do the lookup again to be sure that the state does not
		 * exist.
		 */
		s = dyn_lookup_ipv4_parent_locked(pkt, rule, ruleid,
		    rulenum, bucket);
		if (s != NULL) {
			/*
			 * A concurrent thread has already created this
			 * state. Just return it.
			 */
			DYNSTATE_CRITICAL_ENTER();
			DYNSTATE_PROTECT(s);
			DYN_BUCKET_UNLOCK(bucket);
			return (s);
		}
	}

	limit = dyn_alloc_parent(rule, ruleid, rulenum, hashval);
	if (limit == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (NULL);
	}

	s = dyn_alloc_ipv4_state(pkt, kidx, O_LIMIT_PARENT);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_parent_zone, limit);
		return (NULL);
	}

	s->limit = limit;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4_parent[bucket], s, entry);
	DYN_COUNT_INC(dyn_parent_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_parent_add);
	DYNSTATE_CRITICAL_ENTER();
	DYNSTATE_PROTECT(s);
	DYN_BUCKET_UNLOCK(bucket);
	return (s);
}

static int
dyn_add_ipv4_state(void *parent, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
    uint32_t hashval, struct ipfw_dyn_info *info, uint16_t fibnum,
    uint32_t kidx, uint8_t type)
{
	struct dyn_ipv4_state *s;
	void *data;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (info->direction == MATCH_UNKNOWN ||
	    info->kidx != kidx ||
	    info->hashval != hashval ||
	    info->version != DYN_BUCKET_VERSION(bucket, ipv4_add)) {
		/*
		 * The bucket version has changed since the last lookup;
		 * do the lookup again to be sure that the state does not
		 * exist.
		 */
		if (dyn_lookup_ipv4_state_locked(pkt, ulp, pktlen,
		    bucket, kidx) != 0) {
			DYN_BUCKET_UNLOCK(bucket);
			return (EEXIST);
		}
	}

	data = dyn_alloc_dyndata(parent, ruleid, rulenum, pkt, ulp,
	    pktlen, hashval, fibnum);
	if (data == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (ENOMEM);
	}

	s = dyn_alloc_ipv4_state(pkt, kidx, type);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_data_zone, data);
		return (ENOMEM);
	}

	s->data = data;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4[bucket], s, entry);
	DYN_COUNT_INC(dyn_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_add);
	DYN_BUCKET_UNLOCK(bucket);
	return (0);
}

#ifdef INET6
static struct dyn_ipv6_state *
dyn_alloc_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    uint32_t kidx, uint8_t type)
{
	struct dyn_ipv6_state *s;

	s = uma_zalloc(V_dyn_ipv6_zone, M_NOWAIT | M_ZERO);
	if (s == NULL)
		return (NULL);

	s->type = type;
	s->kidx = kidx;
	s->zoneid = zoneid;
	s->proto = pkt->proto;
	s->sport = pkt->src_port;
	s->dport = pkt->dst_port;
	s->src = pkt->src_ip6;
	s->dst = pkt->dst_ip6;
	return (s);
}

/*
 * Add IPv6 parent state.
 * Returns a pointer to the parent state. When it is not NULL, we are in
 * a critical section and the pointer is protected by a hazard pointer.
 * When an error occurs, it returns NULL and exiting the critical section
 * is not needed.
 */
static struct dyn_ipv6_state *
dyn_add_ipv6_parent(void *rule, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, uint32_t zoneid, uint32_t hashval,
    uint32_t version, uint32_t kidx)
{
	struct dyn_ipv6_state *s;
	struct dyn_parent *limit;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_add)) {
		/*
		 * The bucket version has changed since the last lookup;
		 * do the lookup again to be sure that the state does not
		 * exist.
		 */
		s = dyn_lookup_ipv6_parent_locked(pkt, zoneid, rule, ruleid,
		    rulenum, bucket);
		if (s != NULL) {
			/*
			 * A concurrent thread has already created this
			 * state. Just return it.
			 */
			DYNSTATE_CRITICAL_ENTER();
			DYNSTATE_PROTECT(s);
			DYN_BUCKET_UNLOCK(bucket);
			return (s);
		}
	}

	limit = dyn_alloc_parent(rule, ruleid, rulenum, hashval);
	if (limit == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (NULL);
	}

	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, O_LIMIT_PARENT);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_parent_zone, limit);
		return (NULL);
	}

	s->limit = limit;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6_parent[bucket], s, entry);
	DYN_COUNT_INC(dyn_parent_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_parent_add);
	DYNSTATE_CRITICAL_ENTER();
	DYNSTATE_PROTECT(s);
	DYN_BUCKET_UNLOCK(bucket);
	return (s);
}

static int
dyn_add_ipv6_state(void *parent, uint32_t ruleid, uint32_t rulenum,
    const struct ipfw_flow_id *pkt, uint32_t zoneid, const void *ulp,
    int pktlen, uint32_t hashval, struct ipfw_dyn_info *info,
    uint16_t fibnum, uint32_t kidx, uint8_t type)
{
	struct dyn_ipv6_state *s;
	struct dyn_data *data;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (info->direction == MATCH_UNKNOWN ||
	    info->kidx != kidx ||
	    info->hashval != hashval ||
	    info->version != DYN_BUCKET_VERSION(bucket, ipv6_add)) {
		/*
		 * The bucket version has changed since the last lookup;
		 * do the lookup again to be sure that the state does not
		 * exist.
		 */
1776 */
1777 if (dyn_lookup_ipv6_state_locked(pkt, zoneid, ulp, pktlen,
1778 bucket, kidx) != 0) {
1779 DYN_BUCKET_UNLOCK(bucket);
1780 return (EEXIST);
1781 }
1782 }
1783 
1784 data = dyn_alloc_dyndata(parent, ruleid, rulenum, pkt, ulp,
1785 pktlen, hashval, fibnum);
1786 if (data == NULL) {
1787 DYN_BUCKET_UNLOCK(bucket);
1788 return (ENOMEM);
1789 }
1790 
1791 s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, type);
1792 if (s == NULL) {
1793 DYN_BUCKET_UNLOCK(bucket);
1794 uma_zfree(V_dyn_data_zone, data);
1795 return (ENOMEM);
1796 }
1797 
1798 s->data = data;
1799 CK_SLIST_INSERT_HEAD(&V_dyn_ipv6[bucket], s, entry);
1800 DYN_COUNT_INC(dyn_count);
1801 DYN_BUCKET_VERSION_BUMP(bucket, ipv6_add);
1802 DYN_BUCKET_UNLOCK(bucket);
1803 return (0);
1804 }
1805 #endif /* INET6 */
1806 
1807 static void *
1808 dyn_get_parent_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1809 struct ip_fw *rule, uint32_t hashval, uint32_t limit, uint32_t kidx)
1810 {
1811 char sbuf[24];
1812 struct dyn_parent *p;
1813 void *ret;
1814 uint32_t bucket, version;
1815 
1816 p = NULL;
1817 ret = NULL;
1818 bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
1819 DYNSTATE_CRITICAL_ENTER();
1820 if (IS_IP4_FLOW_ID(pkt)) {
1821 struct dyn_ipv4_state *s;
1822 
1823 version = DYN_BUCKET_VERSION(bucket, ipv4_parent_add);
1824 s = dyn_lookup_ipv4_parent(pkt, rule, rule->id,
1825 rule->rulenum, bucket);
1826 if (s == NULL) {
1827 /*
1828 * Exit from the critical section because
1829 * dyn_add_ipv4_parent() will acquire the bucket lock.
1830 */
1831 DYNSTATE_CRITICAL_EXIT();
1832 
1833 s = dyn_add_ipv4_parent(rule, rule->id,
1834 rule->rulenum, pkt, hashval, version, kidx);
1835 if (s == NULL)
1836 return (NULL);
1837 /* Now we are in the critical section again. */
1838 }
1839 ret = s;
1840 p = s->limit;
1841 }
1842 #ifdef INET6
1843 else if (IS_IP6_FLOW_ID(pkt)) {
1844 struct dyn_ipv6_state *s;
1845 
1846 version = DYN_BUCKET_VERSION(bucket, ipv6_parent_add);
1847 s = dyn_lookup_ipv6_parent(pkt, zoneid, rule, rule->id,
1848 rule->rulenum, bucket);
1849 if (s == NULL) {
1850 /*
1851 * Exit from the critical section because
1852 * dyn_add_ipv6_parent() can acquire the bucket lock.
1853 */
1854 DYNSTATE_CRITICAL_EXIT();
1855 
1856 s = dyn_add_ipv6_parent(rule, rule->id,
1857 rule->rulenum, pkt, zoneid, hashval, version,
1858 kidx);
1859 if (s == NULL)
1860 return (NULL);
1861 /* Now we are in the critical section again. */
1862 }
1863 ret = s;
1864 p = s->limit;
1865 }
1866 #endif
1867 else {
1868 DYNSTATE_CRITICAL_EXIT();
1869 return (NULL);
1870 }
1871 
1872 /* Check the limit. */
1873 if (DPARENT_COUNT(p) >= limit) {
1874 DYNSTATE_CRITICAL_EXIT();
1875 if (V_fw_verbose && last_log != time_uptime) {
1876 last_log = time_uptime;
1877 snprintf(sbuf, sizeof(sbuf), "%u drop session",
1878 rule->rulenum);
1879 print_dyn_rule_flags(pkt, O_LIMIT,
1880 LOG_SECURITY | LOG_DEBUG, sbuf,
1881 "too many entries");
1882 }
1883 return (NULL);
1884 }
1885 
1886 /* Take the new session into account. */
1887 DPARENT_COUNT_INC(p);
1888 /*
1889 * We must exit from the critical section because the following
1890 * code can acquire the bucket lock.
1891 * We rely on the 'count' field: the state will not expire while
1892 * it has child states, i.e. while its 'count' field is non-zero.
1893 * Return the state pointer; it will be used by child states as their parent.
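 *
 * A lifecycle sketch of the limit accounting (illustrative, derived
 * from this function, dyn_install_state() and dyn_expire_states()):
 *
 *	dyn_get_parent_state()     -> DPARENT_COUNT_INC(p)
 *	child install fails        -> DPARENT_COUNT_DEC(p)  (rollback)
 *	child state is unlinked    -> DPARENT_COUNT_DEC(p)
 *	parent expires only once DPARENT_COUNT(p) == 0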
1894 */
1895 DYNSTATE_CRITICAL_EXIT();
1896 return (ret);
1897 }
1898 
1899 static int
1900 dyn_install_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1901 uint16_t fibnum, const void *ulp, int pktlen, struct ip_fw *rule,
1902 struct ipfw_dyn_info *info, uint32_t limit, uint16_t limit_mask,
1903 uint32_t kidx, uint8_t type)
1904 {
1905 struct ipfw_flow_id id;
1906 uint32_t hashval, parent_hashval, ruleid, rulenum;
1907 int ret;
1908 
1909 MPASS(type == O_LIMIT || type == O_KEEP_STATE);
1910 
1911 ruleid = rule->id;
1912 rulenum = rule->rulenum;
1913 if (type == O_LIMIT) {
1914 /* Create masked flow id and calculate bucket. */
1915 id.addr_type = pkt->addr_type;
1916 id.proto = pkt->proto;
1917 id.fib = fibnum; /* unused */
1918 id.src_port = (limit_mask & DYN_SRC_PORT) ?
1919 pkt->src_port: 0;
1920 id.dst_port = (limit_mask & DYN_DST_PORT) ?
1921 pkt->dst_port: 0;
1922 if (IS_IP4_FLOW_ID(pkt)) {
1923 id.src_ip = (limit_mask & DYN_SRC_ADDR) ?
1924 pkt->src_ip: 0;
1925 id.dst_ip = (limit_mask & DYN_DST_ADDR) ?
1926 pkt->dst_ip: 0;
1927 }
1928 #ifdef INET6
1929 else if (IS_IP6_FLOW_ID(pkt)) {
1930 if (limit_mask & DYN_SRC_ADDR)
1931 id.src_ip6 = pkt->src_ip6;
1932 else
1933 memset(&id.src_ip6, 0, sizeof(id.src_ip6));
1934 if (limit_mask & DYN_DST_ADDR)
1935 id.dst_ip6 = pkt->dst_ip6;
1936 else
1937 memset(&id.dst_ip6, 0, sizeof(id.dst_ip6));
1938 }
1939 #endif
1940 else
1941 return (EAFNOSUPPORT);
1942 
1943 parent_hashval = hash_parent(&id, rule);
1944 rule = dyn_get_parent_state(&id, zoneid, rule, parent_hashval,
1945 limit, kidx);
1946 if (rule == NULL) {
1947 #if 0
1948 if (V_fw_verbose && last_log != time_uptime) {
1949 last_log = time_uptime;
1950 snprintf(sbuf, sizeof(sbuf),
1951 "%u drop session", rule->rulenum);
1952 print_dyn_rule_flags(pkt, O_LIMIT,
1953 LOG_SECURITY | LOG_DEBUG, sbuf,
1954 "too many entries");
1955 }
1956 #endif
1957 return (EACCES);
1958 }
1959 /*
1960 * The limit has not been reached, create a new state.
1961 * Note that 'rule' now points to the parent state.
1962 */
1963 }
1964 
1965 hashval = hash_packet(pkt);
1966 if (IS_IP4_FLOW_ID(pkt))
1967 ret = dyn_add_ipv4_state(rule, ruleid, rulenum, pkt,
1968 ulp, pktlen, hashval, info, fibnum, kidx, type);
1969 #ifdef INET6
1970 else if (IS_IP6_FLOW_ID(pkt))
1971 ret = dyn_add_ipv6_state(rule, ruleid, rulenum, pkt,
1972 zoneid, ulp, pktlen, hashval, info, fibnum, kidx, type);
1973 #endif /* INET6 */
1974 else
1975 ret = EAFNOSUPPORT;
1976 
1977 if (type == O_LIMIT) {
1978 if (ret != 0) {
1979 /*
1980 * We failed to create a child state for the O_LIMIT
1981 * opcode. Since we have already counted it in the
1982 * parent, we must roll the counter back. 'rule' points
1983 * to the parent state, use it to get the dyn_parent.
1984 *
1985 * XXXAE: it should be safe to use the 'rule' pointer
1986 * without an extra lookup, the parent state is
1987 * referenced and should not be freed.
1988 */
1989 if (IS_IP4_FLOW_ID(&id))
1990 DPARENT_COUNT_DEC(
1991 ((struct dyn_ipv4_state *)rule)->limit);
1992 #ifdef INET6
1993 else if (IS_IP6_FLOW_ID(&id))
1994 DPARENT_COUNT_DEC(
1995 ((struct dyn_ipv6_state *)rule)->limit);
1996 #endif
1997 }
1998 }
1999 /*
2000 * EEXIST means that a concurrent thread has already created
2001 * this state. Consider this a success.
2002 *
2003 * XXXAE: should we invalidate 'info' content here?
2004 */
2005 if (ret == EEXIST)
2006 return (0);
2007 return (ret);
2008 }
2009 
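/*
 * For context: a typical ruleset that exercises this path, in standard
 * ipfw(8) syntax (rule numbers are arbitrary, shown for illustration):
 *
 *	ipfw add 100 check-state
 *	ipfw add 200 allow tcp from any to me 22 setup limit src-addr 4
 *	ipfw add 300 allow tcp from any to any setup keep-state
 *
 * Rule 200 installs O_LIMIT child states accounted against an
 * O_LIMIT_PARENT state per masked flow id (here: per source address),
 * while rule 300 installs plain O_KEEP_STATE states.
 */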
2010 /*
2011 * Install dynamic state.
2012 * chain - ipfw's instance;
2013 * rule - the parent rule that installs the state;
2014 * cmd - opcode that installs the state;
2015 * args - ipfw arguments;
2016 * ulp - upper level protocol header;
2017 * pktlen - packet length;
2018 * info - dynamic state lookup info;
2019 * tablearg - tablearg id.
2020 *
2021 * Returns a non-zero value (failure) if the state is not installed
2022 * because of an error or because session limits are enforced.
2023 */
2024 int
2025 ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
2026 const ipfw_insn_limit *cmd, const struct ip_fw_args *args,
2027 const void *ulp, int pktlen, struct ipfw_dyn_info *info,
2028 uint32_t tablearg)
2029 {
2030 uint32_t limit;
2031 uint16_t limit_mask;
2032 
2033 if (cmd->o.opcode == O_LIMIT) {
2034 limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit);
2035 limit_mask = cmd->limit_mask;
2036 } else {
2037 limit = 0;
2038 limit_mask = 0;
2039 }
2040 /*
2041 * NOTE: we assume that the kidx field of struct ipfw_insn_kidx
2042 * is located at the same offset as the kidx field of ipfw_insn_limit.
2043 */
2044 return (dyn_install_state(&args->f_id,
2045 #ifdef INET6
2046 IS_IP6_FLOW_ID(&args->f_id) ? dyn_getscopeid(args):
2047 #endif
2048 0, M_GETFIB(args->m), ulp, pktlen, rule, info, limit,
2049 limit_mask, cmd->kidx, cmd->o.opcode));
2050 }
2051 
2052 /*
2053 * Free those state entries from the expired lists that are safe to remove.
2054 */
2055 static void
2056 dyn_free_states(struct ip_fw_chain *chain)
2057 {
2058 struct dyn_ipv4_state *s4, *s4n;
2059 #ifdef INET6
2060 struct dyn_ipv6_state *s6, *s6n;
2061 #endif
2062 int cached_count, i;
2063 
2064 /*
2065 * We keep pointers to objects that are in use on each CPU
2066 * in the per-cpu dyn_hp pointer. When an object is going to be
2067 * removed, it is first unlinked from the corresponding list,
2068 * which bumps the corresponding dyn_bucket_xxx_del version.
2069 * Unlinked objects are placed into the corresponding
2070 * dyn_expired_xxx list. A reader that is going to dereference an
2071 * object pointer checks the dyn_bucket_xxx_del version before and
2072 * after storing the pointer into dyn_hp. If the version is the
2073 * same, the object is protected from freeing and is safe to
2074 * dereference. Otherwise the reader restarts the list iteration
2075 * from the beginning; the object is now unlinked and thus will
2076 * not be encountered again.
2077 *
2078 * Copy the dyn_hp pointer of each CPU into the dyn_hp_cache array.
2079 * It does not matter that some pointers may change while we are
2080 * copying: we only need to check that objects removed in the
2081 * previous pass are not in use, and if a dyn_hp pointer does not
2082 * contain such an object at the time of copying, it cannot appear
2083 * there later, because the object is already unlinked. Objects
2084 * unlinked in this pass will not be freed before the next pass.
2085 */
2086 cached_count = 0;
2087 CPU_FOREACH(i) {
2088 dyn_hp_cache[cached_count] = DYNSTATE_GET(i);
2089 if (dyn_hp_cache[cached_count] != NULL)
2090 cached_count++;
2091 }
2092 
2093 /*
2094 * Free expired states that are safe to free.
2095 * Check each entry from the previous pass in the dyn_expired_xxx
2096 * list; if a pointer to the object is in the dyn_hp_cache array,
2097 * keep it until the next pass. Otherwise it is safe to free the
2098 * object.
2099 *
2100 * XXXAE: optimize this to use SLIST_REMOVE_AFTER.
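 *
 * Reader-side sketch of the protection protocol described above
 * (illustrative only; the real lookup functions elsewhere in this
 * file are the authoritative implementation, and match() stands in
 * for the flow comparison):
 *
 *	DYNSTATE_CRITICAL_ENTER();
 *restart:
 *	v = DYN_BUCKET_VERSION(bucket, ipv4_del);
 *	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
 *		if (!match(s, pkt))
 *			continue;
 *		DYNSTATE_PROTECT(s);
 *		if (v != DYN_BUCKET_VERSION(bucket, ipv4_del))
 *			goto restart;
 *		break;		// s is now safe to dereference
 *	}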
2101 */
2102 #define DYN_FREE_STATES(s, next, name) do { \
2103 s = SLIST_FIRST(&V_dyn_expired_ ## name); \
2104 while (s != NULL) { \
2105 next = SLIST_NEXT(s, expired); \
2106 for (i = 0; i < cached_count; i++) \
2107 if (dyn_hp_cache[i] == s) \
2108 break; \
2109 if (i == cached_count) { \
2110 if (s->type == O_LIMIT_PARENT && \
2111 s->limit->count != 0) { \
2112 s = next; \
2113 continue; \
2114 } \
2115 SLIST_REMOVE(&V_dyn_expired_ ## name, \
2116 s, dyn_ ## name ## _state, expired); \
2117 if (s->type == O_LIMIT_PARENT) \
2118 uma_zfree(V_dyn_parent_zone, s->limit); \
2119 else \
2120 uma_zfree(V_dyn_data_zone, s->data); \
2121 uma_zfree(V_dyn_ ## name ## _zone, s); \
2122 } \
2123 s = next; \
2124 } \
2125 } while (0)
2126 
2127 /*
2128 * Protect access to the expired lists with DYN_EXPIRED_LOCK.
2129 * Userland can invoke ipfw_expire_dyn_states() to delete
2130 * specific states; this leads to modification of the expired
2131 * lists.
2132 *
2133 * XXXAE: do we need DYN_EXPIRED_LOCK? We can just use
2134 * IPFW_UH_WLOCK to protect access to these lists.
2135 */
2136 DYN_EXPIRED_LOCK();
2137 DYN_FREE_STATES(s4, s4n, ipv4);
2138 #ifdef INET6
2139 DYN_FREE_STATES(s6, s6n, ipv6);
2140 #endif
2141 DYN_EXPIRED_UNLOCK();
2142 #undef DYN_FREE_STATES
2143 }
2144 
2145 /*
2146 * Returns:
2147 * 0 when the state is not matched by the specified range;
2148 * 1 when the state is matched by the specified range;
2149 * 2 when the state is matched by the specified range and deletion of
2150 * dynamic states was requested.
2151 */
2152 static int
2153 dyn_match_range(uint32_t rulenum, uint8_t set, const ipfw_range_tlv *rt)
2154 {
2155 
2156 MPASS(rt != NULL);
2157 /* flush all states */
2158 if (rt->flags & IPFW_RCFLAG_ALL) {
2159 if (rt->flags & IPFW_RCFLAG_DYNAMIC)
2160 return (2); /* forced */
2161 return (1);
2162 }
2163 if ((rt->flags & IPFW_RCFLAG_SET) != 0 && set != rt->set)
2164 return (0);
2165 if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 &&
2166 (rulenum < rt->start_rule || rulenum > rt->end_rule))
2167 return (0);
2168 if (rt->flags & IPFW_RCFLAG_DYNAMIC)
2169 return (2);
2170 return (1);
2171 }
2172 
2173 static void
2174 dyn_acquire_rule(struct ip_fw_chain *ch, struct dyn_data *data,
2175 struct ip_fw *rule, uint32_t kidx)
2176 {
2177 struct dyn_state_obj *obj;
2178 
2179 /*
2180 * Do not acquire the reference twice.
2181 * This can happen when rule deletion is executed for
2182 * the same range, but with a different ruleset id.
2183 */
2184 if (data->flags & DYN_REFERENCED)
2185 return;
2186 
2187 IPFW_UH_WLOCK_ASSERT(ch);
2188 MPASS(kidx != 0);
2189 
2190 data->flags |= DYN_REFERENCED;
2191 /* Reference the named object. */
2192 obj = SRV_OBJECT(ch, kidx);
2193 obj->no.refcnt++;
2194 MPASS(obj->no.etlv == IPFW_TLV_STATE_NAME);
2195 
2196 /* Reference the parent rule. */
2197 rule->refcnt++;
2198 }
2199 
2200 static void
2201 dyn_release_rule(struct ip_fw_chain *ch, struct dyn_data *data,
2202 struct ip_fw *rule, uint32_t kidx)
2203 {
2204 struct dyn_state_obj *obj;
2205 
2206 IPFW_UH_WLOCK_ASSERT(ch);
2207 MPASS(kidx != 0);
2208 
2209 obj = SRV_OBJECT(ch, kidx);
2210 if (obj->no.refcnt == 1)
2211 dyn_destroy(ch, &obj->no);
2212 else
2213 obj->no.refcnt--;
2214 
2215 if (--rule->refcnt == 1)
2216 ipfw_free_rule(rule);
2217 }
2218 
2219 /*
2220 * We do not keep O_LIMIT_PARENT states when V_dyn_keep_states is enabled.
2221 * An O_LIMIT state is created when a new connection is going to be
2222 * established and there is no matching state.
So, since the old parent rule has been deleted,
2223 * we cannot create new states with the old parent, and thus we
2224 * cannot account for new connections together with the already
2225 * established ones, and cannot do proper limiting.
2226 */
2227 static int
2228 dyn_match_ipv4_state(struct ip_fw_chain *ch, struct dyn_ipv4_state *s,
2229 const ipfw_range_tlv *rt)
2230 {
2231 struct ip_fw *rule;
2232 int ret;
2233 
2234 if (s->type == O_LIMIT_PARENT) {
2235 rule = s->limit->parent;
2236 return (dyn_match_range(s->limit->rulenum, rule->set, rt));
2237 }
2238 
2239 rule = s->data->parent;
2240 if (s->type == O_LIMIT)
2241 rule = ((struct dyn_ipv4_state *)rule)->limit->parent;
2242 
2243 ret = dyn_match_range(s->data->rulenum, rule->set, rt);
2244 if (ret == 0 || V_dyn_keep_states == 0 || ret > 1)
2245 return (ret);
2246 
2247 dyn_acquire_rule(ch, s->data, rule, s->kidx);
2248 return (0);
2249 }
2250 
2251 #ifdef INET6
2252 static int
2253 dyn_match_ipv6_state(struct ip_fw_chain *ch, struct dyn_ipv6_state *s,
2254 const ipfw_range_tlv *rt)
2255 {
2256 struct ip_fw *rule;
2257 int ret;
2258 
2259 if (s->type == O_LIMIT_PARENT) {
2260 rule = s->limit->parent;
2261 return (dyn_match_range(s->limit->rulenum, rule->set, rt));
2262 }
2263 
2264 rule = s->data->parent;
2265 if (s->type == O_LIMIT)
2266 rule = ((struct dyn_ipv6_state *)rule)->limit->parent;
2267 
2268 ret = dyn_match_range(s->data->rulenum, rule->set, rt);
2269 if (ret == 0 || V_dyn_keep_states == 0 || ret > 1)
2270 return (ret);
2271 
2272 dyn_acquire_rule(ch, s->data, rule, s->kidx);
2273 return (0);
2274 }
2275 #endif
2276 
2277 /*
2278 * Unlink expired entries from the state lists.
2279 * @rt can be used to specify the range of states for deletion.
2280 */
2281 static void
2282 dyn_expire_states(struct ip_fw_chain *ch, ipfw_range_tlv *rt)
2283 {
2284 struct dyn_ipv4_slist expired_ipv4;
2285 #ifdef INET6
2286 struct dyn_ipv6_slist expired_ipv6;
2287 struct dyn_ipv6_state *s6, *s6n, *s6p;
2288 #endif
2289 struct dyn_ipv4_state *s4, *s4n, *s4p;
2290 void *rule;
2291 int bucket, removed, length, max_length;
2292 
2293 IPFW_UH_WLOCK_ASSERT(ch);
2294 
2295 /*
2296 * Unlink expired states from each bucket.
2297 * With the bucket lock held, iterate over the entries of each of the
2298 * lists: ipv4, ipv4_parent, ipv6, and ipv6_parent. Check the expire
2299 * time, unlink the entry from its list, link it into the temporary
2300 * expired_xxx list, then bump the "del" bucket version.
2301 *
2302 * When an entry is removed, the corresponding state counter is
2303 * decremented. If the entry has O_LIMIT type, the parent's reference
2304 * counter is decremented.
2305 *
2306 * NOTE: this function can be called from userspace context
2307 * when the user deletes rules. In this case all matched states
2308 * will be forcibly unlinked. O_LIMIT_PARENT states are kept
2309 * in the expired lists until their reference counter becomes zero.
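 *
 * A worked example of the range matching (assuming the usual ipfw(8)
 * mapping of "ipfw delete 200" to an ipfw_range_tlv with
 * IPFW_RCFLAG_RANGE and start_rule == end_rule == 200): a state whose
 * parent rule number is 200 makes dyn_match_range() return 1, so the
 * state is unlinked here, unless V_dyn_keep_states is set, in which
 * case dyn_match_ipv4_state() acquires a rule reference and keeps the
 * state as ORPHANED. With IPFW_RCFLAG_DYNAMIC also set the return
 * value is 2 and the state is unlinked unconditionally.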
2310 */ 2311 #define DYN_UNLINK_STATES(s, prev, next, exp, af, name, extra) do { \ 2312 length = 0; \ 2313 removed = 0; \ 2314 prev = NULL; \ 2315 s = CK_SLIST_FIRST(&V_dyn_ ## name [bucket]); \ 2316 while (s != NULL) { \ 2317 next = CK_SLIST_NEXT(s, entry); \ 2318 if ((TIME_LEQ((s)->exp, time_uptime) && extra) || \ 2319 (rt != NULL && \ 2320 dyn_match_ ## af ## _state(ch, s, rt))) { \ 2321 if (prev != NULL) \ 2322 CK_SLIST_REMOVE_AFTER(prev, entry); \ 2323 else \ 2324 CK_SLIST_REMOVE_HEAD( \ 2325 &V_dyn_ ## name [bucket], entry); \ 2326 removed++; \ 2327 SLIST_INSERT_HEAD(&expired_ ## af, s, expired); \ 2328 if (s->type == O_LIMIT_PARENT) \ 2329 DYN_COUNT_DEC(dyn_parent_count); \ 2330 else { \ 2331 DYN_COUNT_DEC(dyn_count); \ 2332 if (s->data->flags & DYN_REFERENCED) { \ 2333 rule = s->data->parent; \ 2334 if (s->type == O_LIMIT) \ 2335 rule = ((__typeof(s)) \ 2336 rule)->limit->parent;\ 2337 dyn_release_rule(ch, s->data, \ 2338 rule, s->kidx); \ 2339 } \ 2340 if (s->type == O_LIMIT) { \ 2341 s = s->data->parent; \ 2342 DPARENT_COUNT_DEC(s->limit); \ 2343 } \ 2344 } \ 2345 } else { \ 2346 prev = s; \ 2347 length++; \ 2348 } \ 2349 s = next; \ 2350 } \ 2351 if (removed != 0) \ 2352 DYN_BUCKET_VERSION_BUMP(bucket, name ## _del); \ 2353 if (length > max_length) \ 2354 max_length = length; \ 2355 } while (0) 2356 2357 SLIST_INIT(&expired_ipv4); 2358 #ifdef INET6 2359 SLIST_INIT(&expired_ipv6); 2360 #endif 2361 max_length = 0; 2362 for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { 2363 DYN_BUCKET_LOCK(bucket); 2364 DYN_UNLINK_STATES(s4, s4p, s4n, data->expire, ipv4, ipv4, 1); 2365 DYN_UNLINK_STATES(s4, s4p, s4n, limit->expire, ipv4, 2366 ipv4_parent, (s4->limit->count == 0)); 2367 #ifdef INET6 2368 DYN_UNLINK_STATES(s6, s6p, s6n, data->expire, ipv6, ipv6, 1); 2369 DYN_UNLINK_STATES(s6, s6p, s6n, limit->expire, ipv6, 2370 ipv6_parent, (s6->limit->count == 0)); 2371 #endif 2372 DYN_BUCKET_UNLOCK(bucket); 2373 } 2374 /* Update curr_max_length for statistics. */ 2375 V_curr_max_length = max_length; 2376 /* 2377 * Concatenate temporary lists with global expired lists. 
2378 */ 2379 DYN_EXPIRED_LOCK(); 2380 SLIST_CONCAT(&V_dyn_expired_ipv4, &expired_ipv4, 2381 dyn_ipv4_state, expired); 2382 #ifdef INET6 2383 SLIST_CONCAT(&V_dyn_expired_ipv6, &expired_ipv6, 2384 dyn_ipv6_state, expired); 2385 #endif 2386 DYN_EXPIRED_UNLOCK(); 2387 #undef DYN_UNLINK_STATES 2388 #undef DYN_UNREF_STATES 2389 } 2390 2391 static struct mbuf * 2392 dyn_mgethdr(int len, uint16_t fibnum) 2393 { 2394 struct mbuf *m; 2395 2396 m = m_gethdr(M_NOWAIT, MT_DATA); 2397 if (m == NULL) 2398 return (NULL); 2399 #ifdef MAC 2400 mac_netinet_firewall_send(m); 2401 #endif 2402 M_SETFIB(m, fibnum); 2403 m->m_data += max_linkhdr; 2404 m->m_flags |= M_SKIP_FIREWALL; 2405 m->m_len = m->m_pkthdr.len = len; 2406 bzero(m->m_data, len); 2407 return (m); 2408 } 2409 2410 static void 2411 dyn_make_keepalive_ipv4(struct mbuf *m, in_addr_t src, in_addr_t dst, 2412 uint32_t seq, uint32_t ack, uint16_t sport, uint16_t dport) 2413 { 2414 struct tcphdr *tcp; 2415 struct ip *ip; 2416 2417 ip = mtod(m, struct ip *); 2418 ip->ip_v = 4; 2419 ip->ip_hl = sizeof(*ip) >> 2; 2420 ip->ip_tos = IPTOS_LOWDELAY; 2421 ip->ip_len = htons(m->m_len); 2422 ip->ip_off |= htons(IP_DF); 2423 ip->ip_ttl = V_ip_defttl; 2424 ip->ip_p = IPPROTO_TCP; 2425 ip->ip_src.s_addr = htonl(src); 2426 ip->ip_dst.s_addr = htonl(dst); 2427 2428 tcp = mtodo(m, sizeof(struct ip)); 2429 tcp->th_sport = htons(sport); 2430 tcp->th_dport = htons(dport); 2431 tcp->th_off = sizeof(struct tcphdr) >> 2; 2432 tcp->th_seq = htonl(seq); 2433 tcp->th_ack = htonl(ack); 2434 tcp_set_flags(tcp, TH_ACK); 2435 tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 2436 htons(sizeof(struct tcphdr) + IPPROTO_TCP)); 2437 2438 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 2439 m->m_pkthdr.csum_flags = CSUM_TCP; 2440 } 2441 2442 static void 2443 dyn_enqueue_keepalive_ipv4(struct mbufq *q, const struct dyn_ipv4_state *s) 2444 { 2445 struct mbuf *m; 2446 2447 if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) { 2448 m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr), 2449 s->data->fibnum); 2450 if (m != NULL) { 2451 dyn_make_keepalive_ipv4(m, s->dst, s->src, 2452 s->data->ack_fwd - 1, s->data->ack_rev, 2453 s->dport, s->sport); 2454 if (mbufq_enqueue(q, m)) { 2455 m_freem(m); 2456 log(LOG_DEBUG, "ipfw: limit for IPv4 " 2457 "keepalive queue is reached.\n"); 2458 return; 2459 } 2460 } 2461 } 2462 2463 if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) { 2464 m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr), 2465 s->data->fibnum); 2466 if (m != NULL) { 2467 dyn_make_keepalive_ipv4(m, s->src, s->dst, 2468 s->data->ack_rev - 1, s->data->ack_fwd, 2469 s->sport, s->dport); 2470 if (mbufq_enqueue(q, m)) { 2471 m_freem(m); 2472 log(LOG_DEBUG, "ipfw: limit for IPv4 " 2473 "keepalive queue is reached.\n"); 2474 return; 2475 } 2476 } 2477 } 2478 } 2479 2480 /* 2481 * Prepare and send keep-alive packets. 2482 */ 2483 static void 2484 dyn_send_keepalive_ipv4(struct ip_fw_chain *chain) 2485 { 2486 struct mbufq q; 2487 struct mbuf *m; 2488 struct dyn_ipv4_state *s; 2489 uint32_t bucket; 2490 2491 mbufq_init(&q, INT_MAX); 2492 IPFW_UH_RLOCK(chain); 2493 /* 2494 * It is safe to not use hazard pointer and just do lockless 2495 * access to the lists, because states entries can not be deleted 2496 * while we hold IPFW_UH_RLOCK. 
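 *
 * A note on the probe contents built by dyn_make_keepalive_ipv4()
 * above: each probe is a bare ACK segment with seq = ack - 1, i.e.
 * one byte behind the already acknowledged sequence number, so a
 * live peer answers it with a corrective ACK that then refreshes
 * the state via the normal packet path. A probe for a direction is
 * generated only while the corresponding ACK_FWD/ACK_REV flag is
 * unset and a non-zero ACK value has been recorded for it.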
2497 */ 2498 for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { 2499 CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) { 2500 /* 2501 * Only established TCP connections that will 2502 * become expired within dyn_keepalive_interval. 2503 */ 2504 if (s->proto != IPPROTO_TCP || 2505 (s->data->state & BOTH_SYN) != BOTH_SYN || 2506 TIME_LEQ(time_uptime + V_dyn_keepalive_interval, 2507 s->data->expire)) 2508 continue; 2509 dyn_enqueue_keepalive_ipv4(&q, s); 2510 } 2511 } 2512 IPFW_UH_RUNLOCK(chain); 2513 while ((m = mbufq_dequeue(&q)) != NULL) 2514 ip_output(m, NULL, NULL, 0, NULL, NULL); 2515 } 2516 2517 #ifdef INET6 2518 static void 2519 dyn_make_keepalive_ipv6(struct mbuf *m, const struct in6_addr *src, 2520 const struct in6_addr *dst, uint32_t zoneid, uint32_t seq, uint32_t ack, 2521 uint16_t sport, uint16_t dport) 2522 { 2523 struct tcphdr *tcp; 2524 struct ip6_hdr *ip6; 2525 2526 ip6 = mtod(m, struct ip6_hdr *); 2527 ip6->ip6_vfc |= IPV6_VERSION; 2528 ip6->ip6_plen = htons(sizeof(struct tcphdr)); 2529 ip6->ip6_nxt = IPPROTO_TCP; 2530 ip6->ip6_hlim = IPV6_DEFHLIM; 2531 ip6->ip6_src = *src; 2532 if (IN6_IS_ADDR_LINKLOCAL(src)) 2533 ip6->ip6_src.s6_addr16[1] = htons(zoneid & 0xffff); 2534 ip6->ip6_dst = *dst; 2535 if (IN6_IS_ADDR_LINKLOCAL(dst)) 2536 ip6->ip6_dst.s6_addr16[1] = htons(zoneid & 0xffff); 2537 2538 tcp = mtodo(m, sizeof(struct ip6_hdr)); 2539 tcp->th_sport = htons(sport); 2540 tcp->th_dport = htons(dport); 2541 tcp->th_off = sizeof(struct tcphdr) >> 2; 2542 tcp->th_seq = htonl(seq); 2543 tcp->th_ack = htonl(ack); 2544 tcp_set_flags(tcp, TH_ACK); 2545 tcp->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr), 2546 IPPROTO_TCP, 0); 2547 2548 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 2549 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 2550 } 2551 2552 static void 2553 dyn_enqueue_keepalive_ipv6(struct mbufq *q, const struct dyn_ipv6_state *s) 2554 { 2555 struct mbuf *m; 2556 2557 if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) { 2558 m = dyn_mgethdr(sizeof(struct ip6_hdr) + 2559 sizeof(struct tcphdr), s->data->fibnum); 2560 if (m != NULL) { 2561 dyn_make_keepalive_ipv6(m, &s->dst, &s->src, 2562 s->zoneid, s->data->ack_fwd - 1, s->data->ack_rev, 2563 s->dport, s->sport); 2564 if (mbufq_enqueue(q, m)) { 2565 m_freem(m); 2566 log(LOG_DEBUG, "ipfw: limit for IPv6 " 2567 "keepalive queue is reached.\n"); 2568 return; 2569 } 2570 } 2571 } 2572 2573 if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) { 2574 m = dyn_mgethdr(sizeof(struct ip6_hdr) + 2575 sizeof(struct tcphdr), s->data->fibnum); 2576 if (m != NULL) { 2577 dyn_make_keepalive_ipv6(m, &s->src, &s->dst, 2578 s->zoneid, s->data->ack_rev - 1, s->data->ack_fwd, 2579 s->sport, s->dport); 2580 if (mbufq_enqueue(q, m)) { 2581 m_freem(m); 2582 log(LOG_DEBUG, "ipfw: limit for IPv6 " 2583 "keepalive queue is reached.\n"); 2584 return; 2585 } 2586 } 2587 } 2588 } 2589 2590 static void 2591 dyn_send_keepalive_ipv6(struct ip_fw_chain *chain) 2592 { 2593 struct mbufq q; 2594 struct mbuf *m; 2595 struct dyn_ipv6_state *s; 2596 uint32_t bucket; 2597 2598 mbufq_init(&q, INT_MAX); 2599 IPFW_UH_RLOCK(chain); 2600 /* 2601 * It is safe to not use hazard pointer and just do lockless 2602 * access to the lists, because states entries can not be deleted 2603 * while we hold IPFW_UH_RLOCK. 
2604 */ 2605 for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { 2606 CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) { 2607 /* 2608 * Only established TCP connections that will 2609 * become expired within dyn_keepalive_interval. 2610 */ 2611 if (s->proto != IPPROTO_TCP || 2612 (s->data->state & BOTH_SYN) != BOTH_SYN || 2613 TIME_LEQ(time_uptime + V_dyn_keepalive_interval, 2614 s->data->expire)) 2615 continue; 2616 dyn_enqueue_keepalive_ipv6(&q, s); 2617 } 2618 } 2619 IPFW_UH_RUNLOCK(chain); 2620 while ((m = mbufq_dequeue(&q)) != NULL) 2621 ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); 2622 } 2623 #endif /* INET6 */ 2624 2625 static void 2626 dyn_grow_hashtable(struct ip_fw_chain *chain, uint32_t new, int flags) 2627 { 2628 #ifdef INET6 2629 struct dyn_ipv6ck_slist *ipv6, *ipv6_parent; 2630 uint32_t *ipv6_add, *ipv6_del, *ipv6_parent_add, *ipv6_parent_del; 2631 struct dyn_ipv6_state *s6; 2632 #endif 2633 struct dyn_ipv4ck_slist *ipv4, *ipv4_parent; 2634 uint32_t *ipv4_add, *ipv4_del, *ipv4_parent_add, *ipv4_parent_del; 2635 struct dyn_ipv4_state *s4; 2636 struct mtx *bucket_lock; 2637 void *tmp; 2638 uint32_t bucket; 2639 2640 MPASS(powerof2(new)); 2641 DYN_DEBUG("grow hash size %u -> %u", V_curr_dyn_buckets, new); 2642 /* 2643 * Allocate and initialize new lists. 2644 */ 2645 bucket_lock = malloc(new * sizeof(struct mtx), M_IPFW, 2646 flags | M_ZERO); 2647 if (bucket_lock == NULL) 2648 return; 2649 2650 ipv4 = ipv4_parent = NULL; 2651 ipv4_add = ipv4_del = ipv4_parent_add = ipv4_parent_del = NULL; 2652 #ifdef INET6 2653 ipv6 = ipv6_parent = NULL; 2654 ipv6_add = ipv6_del = ipv6_parent_add = ipv6_parent_del = NULL; 2655 #endif 2656 2657 ipv4 = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW, 2658 flags | M_ZERO); 2659 if (ipv4 == NULL) 2660 goto bad; 2661 ipv4_parent = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW, 2662 flags | M_ZERO); 2663 if (ipv4_parent == NULL) 2664 goto bad; 2665 ipv4_add = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); 2666 if (ipv4_add == NULL) 2667 goto bad; 2668 ipv4_del = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); 2669 if (ipv4_del == NULL) 2670 goto bad; 2671 ipv4_parent_add = malloc(new * sizeof(uint32_t), M_IPFW, 2672 flags | M_ZERO); 2673 if (ipv4_parent_add == NULL) 2674 goto bad; 2675 ipv4_parent_del = malloc(new * sizeof(uint32_t), M_IPFW, 2676 flags | M_ZERO); 2677 if (ipv4_parent_del == NULL) 2678 goto bad; 2679 #ifdef INET6 2680 ipv6 = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW, 2681 flags | M_ZERO); 2682 if (ipv6 == NULL) 2683 goto bad; 2684 ipv6_parent = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW, 2685 flags | M_ZERO); 2686 if (ipv6_parent == NULL) 2687 goto bad; 2688 ipv6_add = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); 2689 if (ipv6_add == NULL) 2690 goto bad; 2691 ipv6_del = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); 2692 if (ipv6_del == NULL) 2693 goto bad; 2694 ipv6_parent_add = malloc(new * sizeof(uint32_t), M_IPFW, 2695 flags | M_ZERO); 2696 if (ipv6_parent_add == NULL) 2697 goto bad; 2698 ipv6_parent_del = malloc(new * sizeof(uint32_t), M_IPFW, 2699 flags | M_ZERO); 2700 if (ipv6_parent_del == NULL) 2701 goto bad; 2702 #endif 2703 for (bucket = 0; bucket < new; bucket++) { 2704 DYN_BUCKET_LOCK_INIT(bucket_lock, bucket); 2705 CK_SLIST_INIT(&ipv4[bucket]); 2706 CK_SLIST_INIT(&ipv4_parent[bucket]); 2707 #ifdef INET6 2708 CK_SLIST_INIT(&ipv6[bucket]); 2709 CK_SLIST_INIT(&ipv6_parent[bucket]); 2710 #endif 2711 } 2712 2713 #define DYN_RELINK_STATES(s, hval, i, 
head, ohead) do { \
2714 while ((s = CK_SLIST_FIRST(&V_dyn_ ## ohead[i])) != NULL) { \
2715 CK_SLIST_REMOVE_HEAD(&V_dyn_ ## ohead[i], entry); \
2716 CK_SLIST_INSERT_HEAD(&head[DYN_BUCKET(s->hval, new)], \
2717 s, entry); \
2718 } \
2719 } while (0)
2720 /*
2721 * Prevent rule changes from userland.
2722 */
2723 IPFW_UH_WLOCK(chain);
2724 /*
2725 * Hold traffic processing until we finish the resize to
2726 * prevent access to the state lists.
2727 */
2728 IPFW_WLOCK(chain);
2729 /* Re-link all dynamic states. */
2730 for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
2731 DYN_RELINK_STATES(s4, data->hashval, bucket, ipv4, ipv4);
2732 DYN_RELINK_STATES(s4, limit->hashval, bucket, ipv4_parent,
2733 ipv4_parent);
2734 #ifdef INET6
2735 DYN_RELINK_STATES(s6, data->hashval, bucket, ipv6, ipv6);
2736 DYN_RELINK_STATES(s6, limit->hashval, bucket, ipv6_parent,
2737 ipv6_parent);
2738 #endif
2739 }
2740 
2741 #define DYN_SWAP_PTR(old, new, tmp) do { \
2742 tmp = old; \
2743 old = new; \
2744 new = tmp; \
2745 } while (0)
2746 /* Swap pointers. */
2747 DYN_SWAP_PTR(V_dyn_bucket_lock, bucket_lock, tmp);
2748 DYN_SWAP_PTR(V_dyn_ipv4, ipv4, tmp);
2749 DYN_SWAP_PTR(V_dyn_ipv4_parent, ipv4_parent, tmp);
2750 DYN_SWAP_PTR(V_dyn_ipv4_add, ipv4_add, tmp);
2751 DYN_SWAP_PTR(V_dyn_ipv4_parent_add, ipv4_parent_add, tmp);
2752 DYN_SWAP_PTR(V_dyn_ipv4_del, ipv4_del, tmp);
2753 DYN_SWAP_PTR(V_dyn_ipv4_parent_del, ipv4_parent_del, tmp);
2754 
2755 #ifdef INET6
2756 DYN_SWAP_PTR(V_dyn_ipv6, ipv6, tmp);
2757 DYN_SWAP_PTR(V_dyn_ipv6_parent, ipv6_parent, tmp);
2758 DYN_SWAP_PTR(V_dyn_ipv6_add, ipv6_add, tmp);
2759 DYN_SWAP_PTR(V_dyn_ipv6_parent_add, ipv6_parent_add, tmp);
2760 DYN_SWAP_PTR(V_dyn_ipv6_del, ipv6_del, tmp);
2761 DYN_SWAP_PTR(V_dyn_ipv6_parent_del, ipv6_parent_del, tmp);
2762 #endif
2763 bucket = V_curr_dyn_buckets;
2764 V_curr_dyn_buckets = new;
2765 
2766 IPFW_WUNLOCK(chain);
2767 IPFW_UH_WUNLOCK(chain);
2768 
2769 /* Release old resources. */
2770 while (bucket-- != 0)
2771 DYN_BUCKET_LOCK_DESTROY(bucket_lock, bucket);
2772 bad:
2773 free(bucket_lock, M_IPFW);
2774 free(ipv4, M_IPFW);
2775 free(ipv4_parent, M_IPFW);
2776 free(ipv4_add, M_IPFW);
2777 free(ipv4_parent_add, M_IPFW);
2778 free(ipv4_del, M_IPFW);
2779 free(ipv4_parent_del, M_IPFW);
2780 #ifdef INET6
2781 free(ipv6, M_IPFW);
2782 free(ipv6_parent, M_IPFW);
2783 free(ipv6_add, M_IPFW);
2784 free(ipv6_parent_add, M_IPFW);
2785 free(ipv6_del, M_IPFW);
2786 free(ipv6_parent_del, M_IPFW);
2787 #endif
2788 }
2789 
2790 /*
2791 * This function is used to perform various maintenance
2792 * on dynamic hash lists. Currently it is called every second.
2793 */
2794 static void
2795 dyn_tick(void *vnetx)
2796 {
2797 struct epoch_tracker et;
2798 uint32_t buckets;
2799 
2800 CURVNET_SET((struct vnet *)vnetx);
2801 /*
2802 * First free the states unlinked in previous passes.
2803 */
2804 dyn_free_states(&V_layer3_chain);
2805 /*
2806 * Now unlink other expired states.
2807 * We use IPFW_UH_WLOCK to avoid concurrent calls of
2808 * dyn_expire_states(). It is the only function that deletes
2809 * state entries from the state lists.
2810 */
2811 IPFW_UH_WLOCK(&V_layer3_chain);
2812 dyn_expire_states(&V_layer3_chain, NULL);
2813 IPFW_UH_WUNLOCK(&V_layer3_chain);
2814 /*
2815 * Send keepalives if they are enabled and the time has come.
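 *
 * With the defaults set in ipfw_dyn_init() below (period 5 s,
 * interval 20 s), the keepalive pass runs at most once every
 * 5 seconds and probes only established TCP states that would
 * expire within the next 20 seconds.
 *
 * After the keepalive pass the hash size is re-evaluated. A worked
 * example of the growth policy coded below (illustrative numbers):
 * with V_dyn_count == 5000 and V_curr_dyn_buckets == 2048 we have
 * 2048 < 5000 / 2, so the table grows to 1 << fls(5000) == 8192
 * buckets, subject to the V_dyn_buckets_max cap.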
2816 */ 2817 if (V_dyn_keepalive != 0 && 2818 V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) { 2819 V_dyn_keepalive_last = time_uptime; 2820 NET_EPOCH_ENTER(et); 2821 dyn_send_keepalive_ipv4(&V_layer3_chain); 2822 #ifdef INET6 2823 dyn_send_keepalive_ipv6(&V_layer3_chain); 2824 #endif 2825 NET_EPOCH_EXIT(et); 2826 } 2827 /* 2828 * Check if we need to resize the hash: 2829 * if current number of states exceeds number of buckets in hash, 2830 * and dyn_buckets_max permits to grow the number of buckets, then 2831 * do it. Grow hash size to the minimum power of 2 which is bigger 2832 * than current states count. 2833 */ 2834 if (V_curr_dyn_buckets < V_dyn_buckets_max && 2835 (V_curr_dyn_buckets < V_dyn_count / 2 || ( 2836 V_curr_dyn_buckets < V_dyn_count && V_curr_max_length > 8))) { 2837 buckets = 1 << fls(V_dyn_count); 2838 if (buckets > V_dyn_buckets_max) 2839 buckets = V_dyn_buckets_max; 2840 dyn_grow_hashtable(&V_layer3_chain, buckets, M_NOWAIT); 2841 } 2842 2843 callout_reset_on(&V_dyn_timeout, hz, dyn_tick, vnetx, 0); 2844 CURVNET_RESTORE(); 2845 } 2846 2847 void 2848 ipfw_expire_dyn_states(struct ip_fw_chain *chain, ipfw_range_tlv *rt) 2849 { 2850 /* 2851 * Do not perform any checks if we currently have no dynamic states 2852 */ 2853 if (V_dyn_count == 0) 2854 return; 2855 2856 IPFW_UH_WLOCK_ASSERT(chain); 2857 dyn_expire_states(chain, rt); 2858 } 2859 2860 /* 2861 * Pass through all states and reset eaction for orphaned rules. 2862 */ 2863 void 2864 ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint32_t eaction_id, 2865 uint32_t default_id, uint32_t instance_id) 2866 { 2867 #ifdef INET6 2868 struct dyn_ipv6_state *s6; 2869 #endif 2870 struct dyn_ipv4_state *s4; 2871 struct ip_fw *rule; 2872 uint32_t bucket; 2873 2874 #define DYN_RESET_EACTION(s, h, b) \ 2875 CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) { \ 2876 if ((s->data->flags & DYN_REFERENCED) == 0) \ 2877 continue; \ 2878 rule = s->data->parent; \ 2879 if (s->type == O_LIMIT) \ 2880 rule = ((__typeof(s))rule)->limit->parent; \ 2881 ipfw_reset_eaction(ch, rule, eaction_id, \ 2882 default_id, instance_id); \ 2883 } 2884 2885 IPFW_UH_WLOCK_ASSERT(ch); 2886 if (V_dyn_count == 0) 2887 return; 2888 for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { 2889 DYN_RESET_EACTION(s4, ipv4, bucket); 2890 #ifdef INET6 2891 DYN_RESET_EACTION(s6, ipv6, bucket); 2892 #endif 2893 } 2894 } 2895 2896 /* 2897 * Returns size of dynamic states in legacy format 2898 */ 2899 int 2900 ipfw_dyn_len(void) 2901 { 2902 2903 return ((V_dyn_count + V_dyn_parent_count) * sizeof(ipfw_dyn_rule)); 2904 } 2905 2906 /* 2907 * Returns number of dynamic states. 2908 * Marks every named object index used by dynamic states with bit in @bmask. 2909 * Returns number of named objects accounted in bmask via @nocnt. 2910 * Used by dump format v1 (current). 2911 */ 2912 uint32_t 2913 ipfw_dyn_get_count(uint32_t *bmask, int *nocnt) 2914 { 2915 #ifdef INET6 2916 struct dyn_ipv6_state *s6; 2917 #endif 2918 struct dyn_ipv4_state *s4; 2919 uint32_t bucket; 2920 2921 #define DYN_COUNT_OBJECTS(s, h, b) \ 2922 CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) { \ 2923 MPASS(s->kidx != 0); \ 2924 if (ipfw_mark_object_kidx(bmask, IPFW_TLV_STATE_NAME, \ 2925 s->kidx) != 0) \ 2926 (*nocnt)++; \ 2927 } 2928 2929 IPFW_UH_RLOCK_ASSERT(&V_layer3_chain); 2930 2931 /* No need to pass through all the buckets. 
*/ 2932 *nocnt = 0; 2933 if (V_dyn_count + V_dyn_parent_count == 0) 2934 return (0); 2935 2936 for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { 2937 DYN_COUNT_OBJECTS(s4, ipv4, bucket); 2938 #ifdef INET6 2939 DYN_COUNT_OBJECTS(s6, ipv6, bucket); 2940 #endif 2941 } 2942 2943 return (V_dyn_count + V_dyn_parent_count); 2944 } 2945 2946 /* 2947 * Check if rule contains at least one dynamic opcode. 2948 * 2949 * Returns 1 if such opcode is found, 0 otherwise. 2950 */ 2951 int 2952 ipfw_is_dyn_rule(struct ip_fw *rule) 2953 { 2954 int cmdlen, l; 2955 ipfw_insn *cmd; 2956 2957 l = rule->cmd_len; 2958 cmd = rule->cmd; 2959 cmdlen = 0; 2960 for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { 2961 cmdlen = F_LEN(cmd); 2962 2963 switch (cmd->opcode) { 2964 case O_LIMIT: 2965 case O_KEEP_STATE: 2966 case O_PROBE_STATE: 2967 case O_CHECK_STATE: 2968 return (1); 2969 } 2970 } 2971 2972 return (0); 2973 } 2974 2975 static void 2976 dyn_export_parent(const struct dyn_parent *p, uint32_t kidx, uint8_t set, 2977 ipfw_dyn_rule *dst) 2978 { 2979 2980 dst->type = O_LIMIT_PARENT; 2981 dst->set = set; 2982 dst->kidx = kidx; 2983 dst->rulenum = p->rulenum; 2984 dst->count = DPARENT_COUNT(p); 2985 dst->expire = TIME_LEQ(p->expire, time_uptime) ? 0: 2986 p->expire - time_uptime; 2987 dst->hashval = p->hashval; 2988 2989 /* unused fields */ 2990 dst->pad = 0; 2991 dst->pcnt = 0; 2992 dst->bcnt = 0; 2993 dst->ack_fwd = 0; 2994 dst->ack_rev = 0; 2995 } 2996 2997 static void 2998 dyn_export_data(const struct dyn_data *data, uint32_t kidx, uint8_t type, 2999 uint8_t set, ipfw_dyn_rule *dst) 3000 { 3001 3002 dst->type = type; 3003 dst->set = set; 3004 dst->kidx = kidx; 3005 dst->rulenum = data->rulenum; 3006 dst->pcnt = data->pcnt_fwd + data->pcnt_rev; 3007 dst->bcnt = data->bcnt_fwd + data->bcnt_rev; 3008 dst->expire = TIME_LEQ(data->expire, time_uptime) ? 
0:
3009 data->expire - time_uptime;
3010 dst->state = data->state;
3011 if (data->flags & DYN_REFERENCED)
3012 dst->state |= IPFW_DYN_ORPHANED;
3013 
3014 dst->ack_fwd = data->ack_fwd;
3015 dst->ack_rev = data->ack_rev;
3016 dst->hashval = data->hashval;
3017 }
3018 
3019 static void
3020 dyn_export_ipv4_state(const struct dyn_ipv4_state *s, ipfw_dyn_rule *dst)
3021 {
3022 struct ip_fw *rule;
3023 
3024 switch (s->type) {
3025 case O_LIMIT_PARENT:
3026 rule = s->limit->parent;
3027 dyn_export_parent(s->limit, s->kidx, rule->set, dst);
3028 break;
3029 default:
3030 rule = s->data->parent;
3031 if (s->type == O_LIMIT)
3032 rule = ((struct dyn_ipv4_state *)rule)->limit->parent;
3033 dyn_export_data(s->data, s->kidx, s->type, rule->set, dst);
3034 }
3035 
3036 dst->id.dst_ip = s->dst;
3037 dst->id.src_ip = s->src;
3038 dst->id.dst_port = s->dport;
3039 dst->id.src_port = s->sport;
3040 dst->id.fib = s->data->fibnum;
3041 dst->id.proto = s->proto;
3042 dst->id._flags = 0;
3043 dst->id.addr_type = 4;
3044 
3045 memset(&dst->id.dst_ip6, 0, sizeof(dst->id.dst_ip6));
3046 memset(&dst->id.src_ip6, 0, sizeof(dst->id.src_ip6));
3047 dst->id.flow_id6 = dst->id.extra = 0;
3048 }
3049 
3050 #ifdef INET6
3051 static void
3052 dyn_export_ipv6_state(const struct dyn_ipv6_state *s, ipfw_dyn_rule *dst)
3053 {
3054 struct ip_fw *rule;
3055 
3056 switch (s->type) {
3057 case O_LIMIT_PARENT:
3058 rule = s->limit->parent;
3059 dyn_export_parent(s->limit, s->kidx, rule->set, dst);
3060 break;
3061 default:
3062 rule = s->data->parent;
3063 if (s->type == O_LIMIT)
3064 rule = ((struct dyn_ipv6_state *)rule)->limit->parent;
3065 dyn_export_data(s->data, s->kidx, s->type, rule->set, dst);
3066 }
3067 
3068 dst->id.src_ip6 = s->src;
3069 dst->id.dst_ip6 = s->dst;
3070 dst->id.dst_port = s->dport;
3071 dst->id.src_port = s->sport;
3072 dst->id.fib = s->data->fibnum;
3073 dst->id.proto = s->proto;
3074 dst->id._flags = 0;
3075 dst->id.addr_type = 6;
3076 
3077 dst->id.dst_ip = dst->id.src_ip = 0;
3078 dst->id.flow_id6 = dst->id.extra = 0;
3079 }
3080 #endif /* INET6 */
3081 
3082 /*
3083 * Fills the buffer given by @sd with dynamic states.
3084 * Used by dump format v1 (current).
3085 *
3086 * Returns 0 on success.
3087 */
3088 int
3089 ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd)
3090 {
3091 #ifdef INET6
3092 struct dyn_ipv6_state *s6;
3093 #endif
3094 struct dyn_ipv4_state *s4;
3095 ipfw_obj_dyntlv *dst, *last;
3096 ipfw_obj_ctlv *ctlv;
3097 uint32_t bucket;
3098 
3099 if (V_dyn_count == 0)
3100 return (0);
3101 
3102 /*
3103 * IPFW_UH_RLOCK guarantees that no other userland request and
3104 * no callout thread will delete entries from the state
3105 * lists.
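 *
 * The resulting sopt buffer layout (as produced below) is a single
 * ipfw_obj_ctlv header of type IPFW_TLV_DYNSTATE_LIST followed by one
 * ipfw_obj_dyntlv entry per state; the last entry is marked with
 * IPFW_DF_LAST.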
3106 */
3107 IPFW_UH_RLOCK_ASSERT(chain);
3108 
3109 ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
3110 if (ctlv == NULL)
3111 return (ENOMEM);
3112 ctlv->head.type = IPFW_TLV_DYNSTATE_LIST;
3113 ctlv->objsize = sizeof(ipfw_obj_dyntlv);
3114 last = NULL;
3115 
3116 #define DYN_EXPORT_STATES(s, af, h, b) \
3117 CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) { \
3118 dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd, \
3119 sizeof(ipfw_obj_dyntlv)); \
3120 if (dst == NULL) \
3121 return (ENOMEM); \
3122 dyn_export_ ## af ## _state(s, &dst->state); \
3123 dst->head.length = sizeof(ipfw_obj_dyntlv); \
3124 dst->head.type = IPFW_TLV_DYN_ENT; \
3125 last = dst; \
3126 }
3127 
3128 for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
3129 DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket);
3130 DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket);
3131 #ifdef INET6
3132 DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket);
3133 DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket);
3134 #endif /* INET6 */
3135 }
3136 
3137 /* Mark the last dynamic rule. */
3138 if (last != NULL)
3139 last->head.flags = IPFW_DF_LAST; /* XXX: unused */
3140 return (0);
3141 #undef DYN_EXPORT_STATES
3142 }
3143 
3144 /*
3145 * When V_dyn_keep_states is enabled, states that become ORPHANED
3146 * keep a pointer to the original rule. This rule pointer is then used
3147 * to apply the rule action after ipfw_dyn_lookup_state().
3148 * Some rule actions use IPFW_INC_RULE_COUNTER() directly on this rule
3149 * pointer, but other actions use chain->map[f_pos] instead. The latter
3150 * case leads to incrementing counters on the wrong rule, because
3151 * ORPHANED states have no parent rule in chain->map[].
3152 * To solve this we add a protected rule:
3153 * count ip from any to any not // comment
3154 * It will be matched only by packets that are handled by ORPHANED states.
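 *
 * The rule built by dyn_add_protected_rule() below thus consists of two
 * microinstructions (a sketch of the layout, not additional code):
 *
 *	O_NOP | F_NOT, len = 1 + comment words	- never matches
 *	O_COUNT, len = 1			- action; counters live here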
3155 */
3156 static void
3157 dyn_add_protected_rule(struct ip_fw_chain *chain)
3158 {
3159 static const char *comment =
3160 "orphaned dynamic states counter";
3161 struct ip_fw *rule;
3162 ipfw_insn *cmd;
3163 size_t l;
3164 
3165 l = roundup(strlen(comment) + 1, sizeof(uint32_t));
3166 rule = ipfw_alloc_rule(chain, sizeof(*rule) + sizeof(ipfw_insn) + l);
3167 cmd = rule->cmd;
3168 cmd->opcode = O_NOP;
3169 cmd->len = 1 + l/sizeof(uint32_t);
3170 cmd->len |= F_NOT; /* ensure the rule never matches */
3171 strcpy((char *)(cmd + 1), comment);
3172 cmd += F_LEN(cmd);
3173 
3174 cmd->len = 1;
3175 cmd->opcode = O_COUNT;
3176 rule->act_ofs = cmd - rule->cmd;
3177 rule->cmd_len = rule->act_ofs + 1;
3178 ipfw_add_protected_rule(chain, rule, 0);
3179 }
3180 
3181 void
3182 ipfw_dyn_init(struct ip_fw_chain *chain)
3183 {
3184 
3185 #ifdef IPFIREWALL_JENKINSHASH
3186 V_dyn_hashseed = arc4random();
3187 #endif
3188 V_dyn_max = 16384; /* max # of states */
3189 V_dyn_parent_max = 4096; /* max # of parent states */
3190 V_dyn_buckets_max = 8192; /* must be power of 2 */
3191 
3192 V_dyn_ack_lifetime = 300;
3193 V_dyn_syn_lifetime = 20;
3194 V_dyn_fin_lifetime = 1;
3195 V_dyn_rst_lifetime = 1;
3196 V_dyn_udp_lifetime = 10;
3197 V_dyn_short_lifetime = 5;
3198 
3199 V_dyn_keepalive_interval = 20;
3200 V_dyn_keepalive_period = 5;
3201 V_dyn_keepalive = 1; /* send keepalives */
3202 V_dyn_keepalive_last = time_uptime;
3203 
3204 V_dyn_data_zone = uma_zcreate("IPFW dynamic states data",
3205 sizeof(struct dyn_data), NULL, NULL, NULL, NULL,
3206 UMA_ALIGN_PTR, 0);
3207 uma_zone_set_max(V_dyn_data_zone, V_dyn_max);
3208 
3209 V_dyn_parent_zone = uma_zcreate("IPFW parent dynamic states",
3210 sizeof(struct dyn_parent), NULL, NULL, NULL, NULL,
3211 UMA_ALIGN_PTR, 0);
3212 uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max);
3213 
3214 SLIST_INIT(&V_dyn_expired_ipv4);
3215 V_dyn_ipv4 = NULL;
3216 V_dyn_ipv4_parent = NULL;
3217 V_dyn_ipv4_zone = uma_zcreate("IPFW IPv4 dynamic states",
3218 sizeof(struct dyn_ipv4_state), NULL, NULL, NULL, NULL,
3219 UMA_ALIGN_PTR, 0);
3220 
3221 #ifdef INET6
3222 SLIST_INIT(&V_dyn_expired_ipv6);
3223 V_dyn_ipv6 = NULL;
3224 V_dyn_ipv6_parent = NULL;
3225 V_dyn_ipv6_zone = uma_zcreate("IPFW IPv6 dynamic states",
3226 sizeof(struct dyn_ipv6_state), NULL, NULL, NULL, NULL,
3227 UMA_ALIGN_PTR, 0);
3228 #endif
3229 
3230 /* Initialize buckets.
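 * The hash starts empty (V_curr_dyn_buckets == 0); the initial table
 * of 256 buckets is created by the dyn_grow_hashtable() call below.
 * With M_WAITOK its allocations cannot fail, so the lists are in
 * place before the first packet can be handled.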
*/ 3231 V_curr_dyn_buckets = 0; 3232 V_dyn_bucket_lock = NULL; 3233 dyn_grow_hashtable(chain, 256, M_WAITOK); 3234 3235 if (IS_DEFAULT_VNET(curvnet)) 3236 dyn_hp_cache = malloc(mp_ncpus * sizeof(void *), M_IPFW, 3237 M_WAITOK | M_ZERO); 3238 3239 DYN_EXPIRED_LOCK_INIT(); 3240 callout_init(&V_dyn_timeout, 1); 3241 callout_reset(&V_dyn_timeout, hz, dyn_tick, curvnet); 3242 IPFW_ADD_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes); 3243 3244 dyn_add_protected_rule(chain); 3245 } 3246 3247 void 3248 ipfw_dyn_uninit(int pass) 3249 { 3250 #ifdef INET6 3251 struct dyn_ipv6_state *s6; 3252 #endif 3253 struct dyn_ipv4_state *s4; 3254 int bucket; 3255 3256 if (pass == 0) { 3257 callout_drain(&V_dyn_timeout); 3258 return; 3259 } 3260 IPFW_DEL_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes); 3261 DYN_EXPIRED_LOCK_DESTROY(); 3262 3263 #define DYN_FREE_STATES_FORCED(CK, s, af, name, en) do { \ 3264 while ((s = CK ## SLIST_FIRST(&V_dyn_ ## name)) != NULL) { \ 3265 CK ## SLIST_REMOVE_HEAD(&V_dyn_ ## name, en); \ 3266 if (s->type == O_LIMIT_PARENT) \ 3267 uma_zfree(V_dyn_parent_zone, s->limit); \ 3268 else \ 3269 uma_zfree(V_dyn_data_zone, s->data); \ 3270 uma_zfree(V_dyn_ ## af ## _zone, s); \ 3271 } \ 3272 } while (0) 3273 for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { 3274 DYN_BUCKET_LOCK_DESTROY(V_dyn_bucket_lock, bucket); 3275 3276 DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4[bucket], entry); 3277 DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4_parent[bucket], 3278 entry); 3279 #ifdef INET6 3280 DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6[bucket], entry); 3281 DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6_parent[bucket], 3282 entry); 3283 #endif /* INET6 */ 3284 } 3285 DYN_FREE_STATES_FORCED(, s4, ipv4, expired_ipv4, expired); 3286 #ifdef INET6 3287 DYN_FREE_STATES_FORCED(, s6, ipv6, expired_ipv6, expired); 3288 #endif 3289 #undef DYN_FREE_STATES_FORCED 3290 3291 uma_zdestroy(V_dyn_ipv4_zone); 3292 uma_zdestroy(V_dyn_data_zone); 3293 uma_zdestroy(V_dyn_parent_zone); 3294 #ifdef INET6 3295 uma_zdestroy(V_dyn_ipv6_zone); 3296 free(V_dyn_ipv6, M_IPFW); 3297 free(V_dyn_ipv6_parent, M_IPFW); 3298 free(V_dyn_ipv6_add, M_IPFW); 3299 free(V_dyn_ipv6_parent_add, M_IPFW); 3300 free(V_dyn_ipv6_del, M_IPFW); 3301 free(V_dyn_ipv6_parent_del, M_IPFW); 3302 #endif 3303 free(V_dyn_bucket_lock, M_IPFW); 3304 free(V_dyn_ipv4, M_IPFW); 3305 free(V_dyn_ipv4_parent, M_IPFW); 3306 free(V_dyn_ipv4_add, M_IPFW); 3307 free(V_dyn_ipv4_parent_add, M_IPFW); 3308 free(V_dyn_ipv4_del, M_IPFW); 3309 free(V_dyn_ipv4_parent_del, M_IPFW); 3310 if (IS_DEFAULT_VNET(curvnet)) 3311 free(dyn_hp_cache, M_IPFW); 3312 } 3313
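/*
 * For reference, the dynamic states managed above can be exercised and
 * inspected from userland with standard ipfw(8)/sysctl(8) invocations
 * (illustrative; see ipfw(8) for the authoritative syntax):
 *
 *	ipfw add 100 check-state
 *	ipfw add 200 allow tcp from any to any setup keep-state
 *	ipfw -d list			- list rules and dynamic states
 *	sysctl net.inet.ip.fw.dyn_max	- inspect/tune the states limit
 *
 * Each established TCP flow then shows up as a dynamic state with its
 * counters until it expires or is removed by dyn_tick().
 */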