1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017-2018 Yandex LLC 5 * Copyright (c) 2017-2018 Andrey V. Elsukov <ae@FreeBSD.org> 6 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_ipfw.h" 36 #ifndef INET 37 #error IPFIREWALL requires INET. 
38 #endif /* INET */ 39 40 #include <sys/param.h> 41 #include <sys/systm.h> 42 #include <sys/hash.h> 43 #include <sys/mbuf.h> 44 #include <sys/kernel.h> 45 #include <sys/lock.h> 46 #include <sys/pcpu.h> 47 #include <sys/queue.h> 48 #include <sys/rmlock.h> 49 #include <sys/smp.h> 50 #include <sys/socket.h> 51 #include <sys/sysctl.h> 52 #include <sys/syslog.h> 53 #include <net/ethernet.h> 54 #include <net/if.h> 55 #include <net/if_var.h> 56 #include <net/vnet.h> 57 58 #include <netinet/in.h> 59 #include <netinet/ip.h> 60 #include <netinet/ip_var.h> 61 #include <netinet/ip_fw.h> 62 #include <netinet/tcp_var.h> 63 #include <netinet/udp.h> 64 65 #include <netinet/ip6.h> /* IN6_ARE_ADDR_EQUAL */ 66 #ifdef INET6 67 #include <netinet6/in6_var.h> 68 #include <netinet6/ip6_var.h> 69 #include <netinet6/scope6_var.h> 70 #endif 71 72 #include <netpfil/ipfw/ip_fw_private.h> 73 74 #include <machine/in_cksum.h> /* XXX for in_cksum */ 75 76 #ifdef MAC 77 #include <security/mac/mac_framework.h> 78 #endif 79 80 /* 81 * Description of dynamic states. 82 * 83 * Dynamic states are stored in lists accessed through a hash tables 84 * whose size is curr_dyn_buckets. This value can be modified through 85 * the sysctl variable dyn_buckets. 86 * 87 * Currently there are four tables: dyn_ipv4, dyn_ipv6, dyn_ipv4_parent, 88 * and dyn_ipv6_parent. 89 * 90 * When a packet is received, its address fields hashed, then matched 91 * against the entries in the corresponding list by addr_type. 92 * Dynamic states can be used for different purposes: 93 * + stateful rules; 94 * + enforcing limits on the number of sessions; 95 * + in-kernel NAT (not implemented yet) 96 * 97 * The lifetime of dynamic states is regulated by dyn_*_lifetime, 98 * measured in seconds and depending on the flags. 99 * 100 * The total number of dynamic states is equal to UMA zone items count. 101 * The max number of dynamic states is dyn_max. When we reach 102 * the maximum number of rules we do not create anymore. 
This is 103 * done to avoid consuming too much memory, but also too much 104 * time when searching on each packet (ideally, we should try instead 105 * to put a limit on the length of the list on each bucket...). 106 * 107 * Each state holds a pointer to the parent ipfw rule so we know what 108 * action to perform. Dynamic rules are removed when the parent rule is 109 * deleted. 110 * 111 * There are some limitations with dynamic rules -- we do not 112 * obey the 'randomized match', and we do not do multiple 113 * passes through the firewall. XXX check the latter!!! 114 */ 115 116 /* By default use jenkins hash function */ 117 #define IPFIREWALL_JENKINSHASH 118 119 #define DYN_COUNTER_INC(d, dir, pktlen) do { \ 120 (d)->pcnt_ ## dir++; \ 121 (d)->bcnt_ ## dir += pktlen; \ 122 } while (0) 123 124 #define DYN_REFERENCED 0x01 125 /* 126 * DYN_REFERENCED flag is used to show that state keeps reference to named 127 * object, and this reference should be released when state becomes expired. 128 */ 129 130 struct dyn_data { 131 void *parent; /* pointer to parent rule */ 132 uint32_t chain_id; /* cached ruleset id */ 133 uint32_t f_pos; /* cached rule index */ 134 135 uint32_t hashval; /* hash value used for hash resize */ 136 uint16_t fibnum; /* fib used to send keepalives */ 137 uint8_t _pad[2]; 138 uint8_t flags; /* internal flags */ 139 uint8_t set; /* parent rule set number */ 140 uint16_t rulenum; /* parent rule number */ 141 uint32_t ruleid; /* parent rule id */ 142 143 uint32_t state; /* TCP session state and flags */ 144 uint32_t ack_fwd; /* most recent ACKs in forward */ 145 uint32_t ack_rev; /* and reverse direction (used */ 146 /* to generate keepalives) */ 147 uint32_t sync; /* synchronization time */ 148 uint32_t expire; /* expire time */ 149 150 uint64_t pcnt_fwd; /* bytes counter in forward */ 151 uint64_t bcnt_fwd; /* packets counter in forward */ 152 uint64_t pcnt_rev; /* bytes counter in reverse */ 153 uint64_t bcnt_rev; /* packets counter in reverse */ 
154 }; 155 156 #define DPARENT_COUNT_DEC(p) do { \ 157 MPASS(p->count > 0); \ 158 ck_pr_dec_32(&(p)->count); \ 159 } while (0) 160 #define DPARENT_COUNT_INC(p) ck_pr_inc_32(&(p)->count) 161 #define DPARENT_COUNT(p) ck_pr_load_32(&(p)->count) 162 struct dyn_parent { 163 void *parent; /* pointer to parent rule */ 164 uint32_t count; /* number of linked states */ 165 uint8_t _pad; 166 uint8_t set; /* parent rule set number */ 167 uint16_t rulenum; /* parent rule number */ 168 uint32_t ruleid; /* parent rule id */ 169 uint32_t hashval; /* hash value used for hash resize */ 170 uint32_t expire; /* expire time */ 171 }; 172 173 struct dyn_ipv4_state { 174 uint8_t type; /* State type */ 175 uint8_t proto; /* UL Protocol */ 176 uint16_t kidx; /* named object index */ 177 uint16_t sport, dport; /* ULP source and destination ports */ 178 in_addr_t src, dst; /* IPv4 source and destination */ 179 180 union { 181 struct dyn_data *data; 182 struct dyn_parent *limit; 183 }; 184 CK_SLIST_ENTRY(dyn_ipv4_state) entry; 185 SLIST_ENTRY(dyn_ipv4_state) expired; 186 }; 187 CK_SLIST_HEAD(dyn_ipv4ck_slist, dyn_ipv4_state); 188 VNET_DEFINE_STATIC(struct dyn_ipv4ck_slist *, dyn_ipv4); 189 VNET_DEFINE_STATIC(struct dyn_ipv4ck_slist *, dyn_ipv4_parent); 190 191 SLIST_HEAD(dyn_ipv4_slist, dyn_ipv4_state); 192 VNET_DEFINE_STATIC(struct dyn_ipv4_slist, dyn_expired_ipv4); 193 #define V_dyn_ipv4 VNET(dyn_ipv4) 194 #define V_dyn_ipv4_parent VNET(dyn_ipv4_parent) 195 #define V_dyn_expired_ipv4 VNET(dyn_expired_ipv4) 196 197 #ifdef INET6 198 struct dyn_ipv6_state { 199 uint8_t type; /* State type */ 200 uint8_t proto; /* UL Protocol */ 201 uint16_t kidx; /* named object index */ 202 uint16_t sport, dport; /* ULP source and destination ports */ 203 struct in6_addr src, dst; /* IPv6 source and destination */ 204 uint32_t zoneid; /* IPv6 scope zone id */ 205 union { 206 struct dyn_data *data; 207 struct dyn_parent *limit; 208 }; 209 CK_SLIST_ENTRY(dyn_ipv6_state) entry; 210 SLIST_ENTRY(dyn_ipv6_state) 
expired; 211 }; 212 CK_SLIST_HEAD(dyn_ipv6ck_slist, dyn_ipv6_state); 213 VNET_DEFINE_STATIC(struct dyn_ipv6ck_slist *, dyn_ipv6); 214 VNET_DEFINE_STATIC(struct dyn_ipv6ck_slist *, dyn_ipv6_parent); 215 216 SLIST_HEAD(dyn_ipv6_slist, dyn_ipv6_state); 217 VNET_DEFINE_STATIC(struct dyn_ipv6_slist, dyn_expired_ipv6); 218 #define V_dyn_ipv6 VNET(dyn_ipv6) 219 #define V_dyn_ipv6_parent VNET(dyn_ipv6_parent) 220 #define V_dyn_expired_ipv6 VNET(dyn_expired_ipv6) 221 #endif /* INET6 */ 222 223 /* 224 * Per-CPU pointer indicates that specified state is currently in use 225 * and must not be reclaimed by expiration callout. 226 */ 227 static void **dyn_hp_cache; 228 DPCPU_DEFINE_STATIC(void *, dyn_hp); 229 #define DYNSTATE_GET(cpu) ck_pr_load_ptr(DPCPU_ID_PTR((cpu), dyn_hp)) 230 #define DYNSTATE_PROTECT(v) ck_pr_store_ptr(DPCPU_PTR(dyn_hp), (v)) 231 #define DYNSTATE_RELEASE() DYNSTATE_PROTECT(NULL) 232 #define DYNSTATE_CRITICAL_ENTER() critical_enter() 233 #define DYNSTATE_CRITICAL_EXIT() do { \ 234 DYNSTATE_RELEASE(); \ 235 critical_exit(); \ 236 } while (0); 237 238 /* 239 * We keep two version numbers, one is updated when new entry added to 240 * the list. Second is updated when an entry deleted from the list. 241 * Versions are updated under bucket lock. 242 * 243 * Bucket "add" version number is used to know, that in the time between 244 * state lookup (i.e. ipfw_dyn_lookup_state()) and the followed state 245 * creation (i.e. ipfw_dyn_install_state()) another concurrent thread did 246 * not install some state in this bucket. Using this info we can avoid 247 * additional state lookup, because we are sure that we will not install 248 * the state twice. 249 * 250 * Also doing the tracking of bucket "del" version during lookup we can 251 * be sure, that state entry was not unlinked and freed in time between 252 * we read the state pointer and protect it with hazard pointer. 253 * 254 * An entry unlinked from CK list keeps unchanged until it is freed. 
255 * Unlinked entries are linked into expired lists using "expired" field. 256 */ 257 258 /* 259 * dyn_expire_lock is used to protect access to dyn_expired_xxx lists. 260 * dyn_bucket_lock is used to get write access to lists in specific bucket. 261 * Currently one dyn_bucket_lock is used for all ipv4, ipv4_parent, ipv6, 262 * and ipv6_parent lists. 263 */ 264 VNET_DEFINE_STATIC(struct mtx, dyn_expire_lock); 265 VNET_DEFINE_STATIC(struct mtx *, dyn_bucket_lock); 266 #define V_dyn_expire_lock VNET(dyn_expire_lock) 267 #define V_dyn_bucket_lock VNET(dyn_bucket_lock) 268 269 /* 270 * Bucket's add/delete generation versions. 271 */ 272 VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_add); 273 VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_del); 274 VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_parent_add); 275 VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_parent_del); 276 #define V_dyn_ipv4_add VNET(dyn_ipv4_add) 277 #define V_dyn_ipv4_del VNET(dyn_ipv4_del) 278 #define V_dyn_ipv4_parent_add VNET(dyn_ipv4_parent_add) 279 #define V_dyn_ipv4_parent_del VNET(dyn_ipv4_parent_del) 280 281 #ifdef INET6 282 VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_add); 283 VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_del); 284 VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_parent_add); 285 VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_parent_del); 286 #define V_dyn_ipv6_add VNET(dyn_ipv6_add) 287 #define V_dyn_ipv6_del VNET(dyn_ipv6_del) 288 #define V_dyn_ipv6_parent_add VNET(dyn_ipv6_parent_add) 289 #define V_dyn_ipv6_parent_del VNET(dyn_ipv6_parent_del) 290 #endif /* INET6 */ 291 292 #define DYN_BUCKET(h, b) ((h) & (b - 1)) 293 #define DYN_BUCKET_VERSION(b, v) ck_pr_load_32(&V_dyn_ ## v[(b)]) 294 #define DYN_BUCKET_VERSION_BUMP(b, v) ck_pr_inc_32(&V_dyn_ ## v[(b)]) 295 296 #define DYN_BUCKET_LOCK_INIT(lock, b) \ 297 mtx_init(&lock[(b)], "IPFW dynamic bucket", NULL, MTX_DEF) 298 #define DYN_BUCKET_LOCK_DESTROY(lock, b) mtx_destroy(&lock[(b)]) 299 #define DYN_BUCKET_LOCK(b) mtx_lock(&V_dyn_bucket_lock[(b)]) 300 #define 
DYN_BUCKET_UNLOCK(b) mtx_unlock(&V_dyn_bucket_lock[(b)]) 301 #define DYN_BUCKET_ASSERT(b) mtx_assert(&V_dyn_bucket_lock[(b)], MA_OWNED) 302 303 #define DYN_EXPIRED_LOCK_INIT() \ 304 mtx_init(&V_dyn_expire_lock, "IPFW expired states list", NULL, MTX_DEF) 305 #define DYN_EXPIRED_LOCK_DESTROY() mtx_destroy(&V_dyn_expire_lock) 306 #define DYN_EXPIRED_LOCK() mtx_lock(&V_dyn_expire_lock) 307 #define DYN_EXPIRED_UNLOCK() mtx_unlock(&V_dyn_expire_lock) 308 309 VNET_DEFINE_STATIC(uint32_t, dyn_buckets_max); 310 VNET_DEFINE_STATIC(uint32_t, curr_dyn_buckets); 311 VNET_DEFINE_STATIC(struct callout, dyn_timeout); 312 #define V_dyn_buckets_max VNET(dyn_buckets_max) 313 #define V_curr_dyn_buckets VNET(curr_dyn_buckets) 314 #define V_dyn_timeout VNET(dyn_timeout) 315 316 /* Maximum length of states chain in a bucket */ 317 VNET_DEFINE_STATIC(uint32_t, curr_max_length); 318 #define V_curr_max_length VNET(curr_max_length) 319 320 VNET_DEFINE_STATIC(uint32_t, dyn_keep_states); 321 #define V_dyn_keep_states VNET(dyn_keep_states) 322 323 VNET_DEFINE_STATIC(uma_zone_t, dyn_data_zone); 324 VNET_DEFINE_STATIC(uma_zone_t, dyn_parent_zone); 325 VNET_DEFINE_STATIC(uma_zone_t, dyn_ipv4_zone); 326 #ifdef INET6 327 VNET_DEFINE_STATIC(uma_zone_t, dyn_ipv6_zone); 328 #define V_dyn_ipv6_zone VNET(dyn_ipv6_zone) 329 #endif /* INET6 */ 330 #define V_dyn_data_zone VNET(dyn_data_zone) 331 #define V_dyn_parent_zone VNET(dyn_parent_zone) 332 #define V_dyn_ipv4_zone VNET(dyn_ipv4_zone) 333 334 /* 335 * Timeouts for various events in handing dynamic rules. 
336 */ 337 VNET_DEFINE_STATIC(uint32_t, dyn_ack_lifetime); 338 VNET_DEFINE_STATIC(uint32_t, dyn_syn_lifetime); 339 VNET_DEFINE_STATIC(uint32_t, dyn_fin_lifetime); 340 VNET_DEFINE_STATIC(uint32_t, dyn_rst_lifetime); 341 VNET_DEFINE_STATIC(uint32_t, dyn_udp_lifetime); 342 VNET_DEFINE_STATIC(uint32_t, dyn_short_lifetime); 343 344 #define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) 345 #define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) 346 #define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) 347 #define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) 348 #define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) 349 #define V_dyn_short_lifetime VNET(dyn_short_lifetime) 350 351 /* 352 * Keepalives are sent if dyn_keepalive is set. They are sent every 353 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval 354 * seconds of lifetime of a rule. 355 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower 356 * than dyn_keepalive_period. 357 */ 358 VNET_DEFINE_STATIC(uint32_t, dyn_keepalive_interval); 359 VNET_DEFINE_STATIC(uint32_t, dyn_keepalive_period); 360 VNET_DEFINE_STATIC(uint32_t, dyn_keepalive); 361 VNET_DEFINE_STATIC(time_t, dyn_keepalive_last); 362 363 #define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) 364 #define V_dyn_keepalive_period VNET(dyn_keepalive_period) 365 #define V_dyn_keepalive VNET(dyn_keepalive) 366 #define V_dyn_keepalive_last VNET(dyn_keepalive_last) 367 368 VNET_DEFINE_STATIC(uint32_t, dyn_max); /* max # of dynamic states */ 369 VNET_DEFINE_STATIC(uint32_t, dyn_count); /* number of states */ 370 VNET_DEFINE_STATIC(uint32_t, dyn_parent_max); /* max # of parent states */ 371 VNET_DEFINE_STATIC(uint32_t, dyn_parent_count); /* number of parent states */ 372 373 #define V_dyn_max VNET(dyn_max) 374 #define V_dyn_count VNET(dyn_count) 375 #define V_dyn_parent_max VNET(dyn_parent_max) 376 #define V_dyn_parent_count VNET(dyn_parent_count) 377 378 #define DYN_COUNT_DEC(name) do { \ 379 MPASS((V_ ## name) > 0); \ 380 ck_pr_dec_32(&(V_ ## 
name)); \ 381 } while (0) 382 #define DYN_COUNT_INC(name) ck_pr_inc_32(&(V_ ## name)) 383 #define DYN_COUNT(name) ck_pr_load_32(&(V_ ## name)) 384 385 static time_t last_log; /* Log ratelimiting */ 386 387 /* 388 * Get/set maximum number of dynamic states in given VNET instance. 389 */ 390 static int 391 sysctl_dyn_max(SYSCTL_HANDLER_ARGS) 392 { 393 uint32_t nstates; 394 int error; 395 396 nstates = V_dyn_max; 397 error = sysctl_handle_32(oidp, &nstates, 0, req); 398 /* Read operation or some error */ 399 if ((error != 0) || (req->newptr == NULL)) 400 return (error); 401 402 V_dyn_max = nstates; 403 uma_zone_set_max(V_dyn_data_zone, V_dyn_max); 404 return (0); 405 } 406 407 static int 408 sysctl_dyn_parent_max(SYSCTL_HANDLER_ARGS) 409 { 410 uint32_t nstates; 411 int error; 412 413 nstates = V_dyn_parent_max; 414 error = sysctl_handle_32(oidp, &nstates, 0, req); 415 /* Read operation or some error */ 416 if ((error != 0) || (req->newptr == NULL)) 417 return (error); 418 419 V_dyn_parent_max = nstates; 420 uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max); 421 return (0); 422 } 423 424 static int 425 sysctl_dyn_buckets(SYSCTL_HANDLER_ARGS) 426 { 427 uint32_t nbuckets; 428 int error; 429 430 nbuckets = V_dyn_buckets_max; 431 error = sysctl_handle_32(oidp, &nbuckets, 0, req); 432 /* Read operation or some error */ 433 if ((error != 0) || (req->newptr == NULL)) 434 return (error); 435 436 if (nbuckets > 256) 437 V_dyn_buckets_max = 1 << fls(nbuckets - 1); 438 else 439 return (EINVAL); 440 return (0); 441 } 442 443 SYSCTL_DECL(_net_inet_ip_fw); 444 445 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_count, 446 CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_count), 0, 447 "Current number of dynamic states."); 448 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_parent_count, 449 CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_parent_count), 0, 450 "Current number of parent states. 
"); 451 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, 452 CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, 453 "Current number of buckets for states hash table."); 454 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_max_length, 455 CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_max_length), 0, 456 "Current maximum length of states chains in hash buckets."); 457 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_buckets, 458 CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_buckets, 459 "IU", "Max number of buckets for dynamic states hash table."); 460 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max, 461 CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_max, 462 "IU", "Max number of dynamic states."); 463 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_parent_max, 464 CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_parent_max, 465 "IU", "Max number of parent dynamic states."); 466 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, 467 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, 468 "Lifetime of dynamic states for TCP ACK."); 469 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, 470 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, 471 "Lifetime of dynamic states for TCP SYN."); 472 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, 473 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, 474 "Lifetime of dynamic states for TCP FIN."); 475 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, 476 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, 477 "Lifetime of dynamic states for TCP RST."); 478 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, 479 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, 480 "Lifetime of dynamic states for UDP."); 481 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, 482 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, 483 "Lifetime of dynamic states for other situations."); 484 SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, 
dyn_keepalive,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
    "Enable keepalives for dynamic states.");
SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keep_states,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0,
    "Do not flush dynamic states on rule deletion");

/* Debug tracing; compiled to nothing unless IPFIREWALL_DYNDEBUG is set. */
#ifdef IPFIREWALL_DYNDEBUG
#define	DYN_DEBUG(fmt, ...)	do {			\
	printf("%s: " fmt "\n", __func__, __VA_ARGS__);	\
} while (0)
#else
#define	DYN_DEBUG(fmt, ...)
#endif /* !IPFIREWALL_DYNDEBUG */

#ifdef INET6
/* Functions to work with IPv6 states */
static struct dyn_ipv6_state *dyn_lookup_ipv6_state(
    const struct ipfw_flow_id *, uint32_t, const void *,
    struct ipfw_dyn_info *, int);
static int dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *,
    uint32_t, const void *, int, uint32_t, uint16_t);
static struct dyn_ipv6_state *dyn_alloc_ipv6_state(
    const struct ipfw_flow_id *, uint32_t, uint16_t, uint8_t);
static int dyn_add_ipv6_state(void *, uint32_t, uint16_t, uint8_t,
    const struct ipfw_flow_id *, uint32_t, const void *, int, uint32_t,
    struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t);
static void dyn_export_ipv6_state(const struct dyn_ipv6_state *,
    ipfw_dyn_rule *);

/* IPv6 keepalive generation helpers. */
static uint32_t dyn_getscopeid(const struct ip_fw_args *);
static void dyn_make_keepalive_ipv6(struct mbuf *, const struct in6_addr *,
    const struct in6_addr *, uint32_t, uint32_t, uint32_t, uint16_t,
    uint16_t);
static void dyn_enqueue_keepalive_ipv6(struct mbufq *,
    const struct dyn_ipv6_state *);
static void dyn_send_keepalive_ipv6(struct ip_fw_chain *);

static struct dyn_ipv6_state *dyn_lookup_ipv6_parent(
    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t,
    uint32_t);
static struct dyn_ipv6_state *dyn_lookup_ipv6_parent_locked(
    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t,
    uint32_t);
static struct dyn_ipv6_state *dyn_add_ipv6_parent(void *, uint32_t, uint16_t,
    uint8_t, const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t,
    uint16_t);
#endif /* INET6 */

/* Functions to work with limit states */
static void *dyn_get_parent_state(const struct ipfw_flow_id *, uint32_t,
    struct ip_fw *, uint32_t, uint32_t, uint16_t);
static struct dyn_ipv4_state *dyn_lookup_ipv4_parent(
    const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t);
static struct dyn_ipv4_state *dyn_lookup_ipv4_parent_locked(
    const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t);
static struct dyn_parent *dyn_alloc_parent(void *, uint32_t, uint16_t,
    uint8_t, uint32_t);
static struct dyn_ipv4_state *dyn_add_ipv4_parent(void *, uint32_t, uint16_t,
    uint8_t, const struct ipfw_flow_id *, uint32_t, uint32_t, uint16_t);

/* Expiration callout and state export/update helpers. */
static void dyn_tick(void *);
static void dyn_expire_states(struct ip_fw_chain *, ipfw_range_tlv *);
static void dyn_free_states(struct ip_fw_chain *);
static void dyn_export_parent(const struct dyn_parent *, uint16_t,
    ipfw_dyn_rule *);
static void dyn_export_data(const struct dyn_data *, uint16_t, uint8_t,
    ipfw_dyn_rule *);
static uint32_t dyn_update_tcp_state(struct dyn_data *,
    const struct ipfw_flow_id *, const struct tcphdr *, int);
static void dyn_update_proto_state(struct dyn_data *,
    const struct ipfw_flow_id *, const void *, int, int);

/* Functions to work with IPv4 states */
struct dyn_ipv4_state *dyn_lookup_ipv4_state(const struct ipfw_flow_id *,
    const void *, struct ipfw_dyn_info *, int);
static int dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *,
    const void *, int, uint32_t, uint16_t);
static struct dyn_ipv4_state *dyn_alloc_ipv4_state(
    const struct ipfw_flow_id *, uint16_t, uint8_t);
static int dyn_add_ipv4_state(void *, uint32_t, uint16_t, uint8_t,
    const struct ipfw_flow_id *, const void
*, int, uint32_t,
    struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t);
static void dyn_export_ipv4_state(const struct dyn_ipv4_state *,
    ipfw_dyn_rule *);

/*
 * Named states support.
 */
static char *default_state_name = "default";
struct dyn_state_obj {
	struct named_object	no;
	char			name[64];
};

#define	DYN_STATE_OBJ(ch, cmd)	\
    ((struct dyn_state_obj *)SRV_OBJECT(ch, (cmd)->arg1))
/*
 * Classifier callback.
 * Return 0 if opcode contains object that should be referenced
 * or rewritten.
 */
static int
dyn_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
{

	DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
	/* Don't rewrite "check-state any" */
	if (cmd->arg1 == 0 &&
	    cmd->opcode == O_CHECK_STATE)
		return (1);

	*puidx = cmd->arg1;
	*ptype = 0;
	return (0);
}

/* Rewrite callback: store the resolved kernel index into the opcode. */
static void
dyn_update(ipfw_insn *cmd, uint16_t idx)
{

	cmd->arg1 = idx;
	DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
}

/*
 * Resolve a state name (from user-supplied TLVs, or "default" when
 * uidx is zero) to its named object.
 */
static int
dyn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
    struct named_object **pno)
{
	ipfw_obj_ntlv *ntlv;
	const char *name;

	DYN_DEBUG("uidx %d", ti->uidx);
	if (ti->uidx != 0) {
		if (ti->tlvs == NULL)
			return (EINVAL);
		/* Search ntlv in the buffer provided by user */
		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
		    IPFW_TLV_STATE_NAME);
		if (ntlv == NULL)
			return (EINVAL);
		name = ntlv->name;
	} else
		name = default_state_name;
	/*
	 * Search named object with corresponding name.
	 * Since states objects are global - ignore the set value
	 * and use zero instead.
	 */
	*pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0,
	    IPFW_TLV_STATE_NAME, name);
	/*
	 * We always return success here.
	 * The caller will check *pno and mark object as unresolved,
	 * then it will automatically create "default" object.
	 */
	return (0);
}

/* Look up a state named object by its kernel index. */
static struct named_object *
dyn_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
{

	DYN_DEBUG("kidx %d", idx);
	return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx));
}

/*
 * Create (or reference an existing) named state object and return its
 * kernel index via *pkidx.  The object is allocated before taking the
 * UH write lock, and freed again if another thread won the race.
 */
static int
dyn_create(struct ip_fw_chain *ch, struct tid_info *ti,
    uint16_t *pkidx)
{
	struct namedobj_instance *ni;
	struct dyn_state_obj *obj;
	struct named_object *no;
	ipfw_obj_ntlv *ntlv;
	char *name;

	DYN_DEBUG("uidx %d", ti->uidx);
	if (ti->uidx != 0) {
		if (ti->tlvs == NULL)
			return (EINVAL);
		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
		    IPFW_TLV_STATE_NAME);
		if (ntlv == NULL)
			return (EINVAL);
		name = ntlv->name;
	} else
		name = default_state_name;

	ni = CHAIN_TO_SRV(ch);
	/* Allocate before locking; M_WAITOK may sleep. */
	obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO);
	obj->no.name = obj->name;
	obj->no.etlv = IPFW_TLV_STATE_NAME;
	strlcpy(obj->name, name, sizeof(obj->name));

	IPFW_UH_WLOCK(ch);
	no = ipfw_objhash_lookup_name_type(ni, 0,
	    IPFW_TLV_STATE_NAME, name);
	if (no != NULL) {
		/*
		 * Object is already created.
		 * Just return its kidx and bump refcount.
		 */
		*pkidx = no->kidx;
		no->refcnt++;
		IPFW_UH_WUNLOCK(ch);
		free(obj, M_IPFW);
		DYN_DEBUG("\tfound kidx %d", *pkidx);
		return (0);
	}
	if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) {
		DYN_DEBUG("\talloc_idx failed for %s", name);
		IPFW_UH_WUNLOCK(ch);
		free(obj, M_IPFW);
		return (ENOSPC);
	}
	ipfw_objhash_add(ni, &obj->no);
	SRV_OBJECT(ch, obj->no.kidx) = obj;
	obj->no.refcnt++;
	*pkidx = obj->no.kidx;
	IPFW_UH_WUNLOCK(ch);
	DYN_DEBUG("\tcreated kidx %d", *pkidx);
	return (0);
}

/*
 * Destroy a named state object.  Caller holds the UH write lock and
 * guarantees the last reference is being dropped (asserted below).
 */
static void
dyn_destroy(struct ip_fw_chain *ch, struct named_object *no)
{
	struct dyn_state_obj *obj;

	IPFW_UH_WLOCK_ASSERT(ch);

	KASSERT(no->etlv == IPFW_TLV_STATE_NAME,
	    ("%s: wrong object type %u", __func__, no->etlv));
	KASSERT(no->refcnt == 1,
	    ("Destroying object '%s' (type %u, idx %u) with refcnt %u",
	    no->name, no->etlv, no->kidx, no->refcnt));
	DYN_DEBUG("kidx %d", no->kidx);
	obj = SRV_OBJECT(ch, no->kidx);
	SRV_OBJECT(ch, no->kidx) = NULL;
	ipfw_objhash_del(CHAIN_TO_SRV(ch), no);
	ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), no->kidx);

	free(obj, M_IPFW);
}

/*
 * All four state-related opcodes share the same named-object handlers.
 */
static struct opcode_obj_rewrite dyn_opcodes[] = {
	{
		O_KEEP_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_CHECK_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_PROBE_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_LIMIT, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
};

/*
 * IMPORTANT: the hash function for dynamic rules must be commutative
 * in source and destination (ip,port), because rules are
bidirectional
 * and we want to find both in the same bucket.
 */
#ifndef IPFIREWALL_JENKINSHASH
/*
 * Legacy hash: XOR of addresses and ports.  XOR is commutative, which
 * satisfies the direction-independence requirement above.
 */
static __inline uint32_t
hash_packet(const struct ipfw_flow_id *id)
{
	uint32_t i;

#ifdef INET6
	if (IS_IP6_FLOW_ID(id))
		i = ntohl((id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
		    (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
		    (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
		    (id->src_ip6.__u6_addr.__u6_addr32[3]));
	else
#endif /* INET6 */
		i = (id->dst_ip) ^ (id->src_ip);
	i ^= (id->dst_port) ^ (id->src_port);
	return (i);
}

/* Mix the parent rule pointer into the flow hash (legacy variant). */
static __inline uint32_t
hash_parent(const struct ipfw_flow_id *id, const void *rule)
{

	return (hash_packet(id) ^ ((uintptr_t)rule));
}

#else /* IPFIREWALL_JENKINSHASH */

VNET_DEFINE_STATIC(uint32_t, dyn_hashseed);
#define	V_dyn_hashseed	VNET(dyn_hashseed)

/*
 * Order the IPv4 (addr, port) pair canonically: returns 0 when
 * (src, sport) sorts before (dst, dport), 1 otherwise.  Used so both
 * directions of a flow produce the same hash input.
 */
static __inline int
addrcmp4(const struct ipfw_flow_id *id)
{

	if (id->src_ip < id->dst_ip)
		return (0);
	if (id->src_ip > id->dst_ip)
		return (1);
	if (id->src_port <= id->dst_port)
		return (0);
	return (1);
}

#ifdef INET6
/* Same canonical ordering for the IPv6 (addr, port) pair. */
static __inline int
addrcmp6(const struct ipfw_flow_id *id)
{
	int ret;

	ret = memcmp(&id->src_ip6, &id->dst_ip6, sizeof(struct in6_addr));
	if (ret < 0)
		return (0);
	if (ret > 0)
		return (1);
	if (id->src_port <= id->dst_port)
		return (0);
	return (1);
}

/*
 * Jenkins hash of the canonically-ordered IPv6 tuple; commutative in
 * (src, dst) by construction via addrcmp6().
 */
static __inline uint32_t
hash_packet6(const struct ipfw_flow_id *id)
{
	struct tuple6 {
		struct in6_addr	addr[2];
		uint16_t	port[2];
	} t6;

	if (addrcmp6(id) == 0) {
		t6.addr[0] = id->src_ip6;
		t6.addr[1] = id->dst_ip6;
		t6.port[0] = id->src_port;
		t6.port[1] = id->dst_port;
	} else {
		t6.addr[0] = id->dst_ip6;
		t6.addr[1] = id->src_ip6;
		t6.port[0] = id->dst_port;
		t6.port[1] = id->src_port;
	}
	return (jenkins_hash32((const uint32_t *)&t6,
	    sizeof(t6) / sizeof(uint32_t), V_dyn_hashseed));
}
#endif

/*
 * Jenkins hash of the flow id; IPv4 tuple is ordered via addrcmp4(),
 * IPv6 is delegated to hash_packet6().  Unknown families hash to 0.
 */
static __inline uint32_t
hash_packet(const struct ipfw_flow_id *id)
{
	struct tuple4 {
		in_addr_t	addr[2];
		uint16_t	port[2];
	} t4;

	if (IS_IP4_FLOW_ID(id)) {
		/* All fields are in host byte order */
		if (addrcmp4(id) == 0) {
			t4.addr[0] = id->src_ip;
			t4.addr[1] = id->dst_ip;
			t4.port[0] = id->src_port;
			t4.port[1] = id->dst_port;
		} else {
			t4.addr[0] = id->dst_ip;
			t4.addr[1] = id->src_ip;
			t4.port[0] = id->dst_port;
			t4.port[1] = id->src_port;
		}
		return (jenkins_hash32((const uint32_t *)&t4,
		    sizeof(t4) / sizeof(uint32_t), V_dyn_hashseed));
	} else
#ifdef INET6
	if (IS_IP6_FLOW_ID(id))
		return (hash_packet6(id));
#endif
	return (0);
}

/*
 * Hash the parent rule pointer value, seeded with the flow hash, for
 * the parent (limit) state tables.
 */
static __inline uint32_t
hash_parent(const struct ipfw_flow_id *id, const void *rule)
{

	return (jenkins_hash32((const uint32_t *)&rule,
	    sizeof(rule) / sizeof(uint32_t), hash_packet(id)));
}
#endif /* IPFIREWALL_JENKINSHASH */

/*
 * Print customizable flow id description via log(9) facility.
 */
static void
print_dyn_rule_flags(const struct ipfw_flow_id *id, int dyn_type,
    int log_flags, char *prefix, char *postfix)
{
	struct in_addr da;
#ifdef INET6
	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
#else
	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
#endif

#ifdef INET6
	if (IS_IP6_FLOW_ID(id)) {
		ip6_sprintf(src, &id->src_ip6);
		ip6_sprintf(dst, &id->dst_ip6);
	} else
#endif
	{
		/* Flow id addresses are in host order; convert for ntop. */
		da.s_addr = htonl(id->src_ip);
		inet_ntop(AF_INET, &da, src, sizeof(src));
		da.s_addr = htonl(id->dst_ip);
		inet_ntop(AF_INET, &da, dst, sizeof(dst));
	}
	log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n",
	    prefix, dyn_type, src, id->src_port, dst,
	    id->dst_port, V_dyn_count, postfix);
}

#define	print_dyn_rule(id, dtype, prefix, postfix)	\
	print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix)

/* Wraparound-safe time and TCP sequence-number comparisons. */
#define	TIME_LEQ(a,b)	((int)((a)-(b)) <= 0)
#define	TIME_LE(a,b)	((int)((a)-(b)) < 0)
#define	_SEQ_GE(a,b)	((int)((a)-(b)) >= 0)
/*
 * The state word keeps forward-direction TCP flags in the low byte and
 * reverse-direction flags shifted left by 8, plus ACK-seen bits above.
 */
#define	BOTH_SYN	(TH_SYN | (TH_SYN << 8))
#define	BOTH_FIN	(TH_FIN | (TH_FIN << 8))
#define	TCP_FLAGS	(TH_FLAGS | (TH_FLAGS << 8))
#define	ACK_FWD		0x00010000	/* fwd ack seen */
#define	ACK_REV		0x00020000	/* rev ack seen */
#define	ACK_BOTH	(ACK_FWD | ACK_REV)

/*
 * Advance the TCP session state machine of a dynamic state for one
 * packet and return the new expire time.  Runs lockless on the packet
 * path: shared fields (state, ack_fwd, ack_rev) are only modified with
 * ck_pr atomics, and stores are skipped when the value is unchanged.
 */
static uint32_t
dyn_update_tcp_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
    const struct tcphdr *tcp, int dir)
{
	uint32_t ack, expire;
	uint32_t state, old;
	uint8_t th_flags;

	expire = data->expire;
	old = state = data->state;
	th_flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST);
	/* Reverse-direction flags live in the second byte. */
	state |= (dir == MATCH_FORWARD) ? th_flags: (th_flags << 8);
	switch (state & TCP_FLAGS) {
	case TH_SYN:			/* opening */
		expire = time_uptime + V_dyn_syn_lifetime;
		break;

	case BOTH_SYN:			/* move to established */
	case BOTH_SYN | TH_FIN:		/* one side tries to close */
	case BOTH_SYN | (TH_FIN << 8):
		if (tcp == NULL)
			break;
		ack = ntohl(tcp->th_ack);
		if (dir == MATCH_FORWARD) {
			if (data->ack_fwd == 0 ||
			    _SEQ_GE(ack, data->ack_fwd)) {
				state |= ACK_FWD;
				/* Store only on change to limit cache traffic. */
				if (data->ack_fwd != ack)
					ck_pr_store_32(&data->ack_fwd, ack);
			}
		} else {
			if (data->ack_rev == 0 ||
			    _SEQ_GE(ack, data->ack_rev)) {
				state |= ACK_REV;
				if (data->ack_rev != ack)
					ck_pr_store_32(&data->ack_rev, ack);
			}
		}
		if ((state & ACK_BOTH) == ACK_BOTH) {
			/*
			 * Set expire time to V_dyn_ack_lifetime only if
			 * we got ACKs for both directions.
			 * We use XOR here to avoid possible state
			 * overwriting in concurrent thread.
			 */
			expire = time_uptime + V_dyn_ack_lifetime;
			ck_pr_xor_32(&data->state, ACK_BOTH);
		} else if ((data->state & ACK_BOTH) != (state & ACK_BOTH))
			ck_pr_or_32(&data->state, state & ACK_BOTH);
		break;

	case BOTH_SYN | BOTH_FIN:	/* both sides closed */
		/* Clamp so a keepalive is never sent for a closing flow. */
		if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
			V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
		expire = time_uptime + V_dyn_fin_lifetime;
		break;

	default:
		if (V_dyn_keepalive != 0 &&
		    V_dyn_rst_lifetime >= V_dyn_keepalive_period)
			V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
		expire = time_uptime + V_dyn_rst_lifetime;
	}
	/* Save TCP state if it was changed */
	if ((state & TCP_FLAGS) != (old & TCP_FLAGS))
		ck_pr_or_32(&data->state, state & TCP_FLAGS);
	return (expire);
}

/*
 * Update ULP specific state.
 * For TCP we keep sequence numbers and flags. For other protocols
 * currently we update only expire time. Packets and bytes counters
 * are also updated here.
 */
static void
dyn_update_proto_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
    const void *ulp, int pktlen, int dir)
{
	uint32_t expire;

	/* NOTE: we are in critical section here. */
	switch (pkt->proto) {
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
		expire = time_uptime + V_dyn_udp_lifetime;
		break;
	case IPPROTO_TCP:
		expire = dyn_update_tcp_state(data, pkt, ulp, dir);
		break;
	default:
		expire = time_uptime + V_dyn_short_lifetime;
	}
	/*
	 * Expiration timer has the per-second granularity, no need to update
	 * it every time when state is matched.
	 */
	if (data->expire != expire)
		ck_pr_store_32(&data->expire, expire);

	if (dir == MATCH_FORWARD)
		DYN_COUNTER_INC(data, fwd, pktlen);
	else
		DYN_COUNTER_INC(data, rev, pktlen);
}

/*
 * Lookup IPv4 state.
 * Must be called in critical section.
 *
 * The list is walked locklessly: if the bucket's delete-version changes
 * while iterating (a concurrent delete), iteration restarts from the
 * head.  On match info->direction is set and the state's counters and
 * expiration are updated.
 */
struct dyn_ipv4_state *
dyn_lookup_ipv4_state(const struct ipfw_flow_id *pkt, const void *ulp,
    struct ipfw_dyn_info *info, int pktlen)
{
	struct dyn_ipv4_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
	info->version = DYN_BUCKET_VERSION(bucket, ipv4_add);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv4_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv4_del))
			goto restart;
		if (s->proto != pkt->proto)
			continue;
		if (info->kidx != 0 && s->kidx != info->kidx)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			info->direction = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
			info->direction = MATCH_REVERSE;
			break;
		}
	}

	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
		    info->direction);
	return (s);
}

/*
 * Lookup IPv4 state.
 * Simplified version is used to check that matching state doesn't exist.
 * The bucket lock must be held (DYN_BUCKET_ASSERT), so no version
 * re-check is needed.  Returns non-zero if a matching state exists.
 */
static int
dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *pkt,
    const void *ulp, int pktlen, uint32_t bucket, uint16_t kidx)
{
	struct dyn_ipv4_state *s;
	int dir;

	dir = MATCH_NONE;
	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
		if (s->proto != pkt->proto ||
		    s->kidx != kidx)
			continue;
		if (s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			dir = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
			dir = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
	return (s != NULL);
}

/*
 * Lookup IPv4 parent (O_LIMIT_PARENT) state and refresh its expiration
 * time on match.  Lockless walk with restart on concurrent delete, same
 * protocol as dyn_lookup_ipv4_state().
 */
struct dyn_ipv4_state *
dyn_lookup_ipv4_parent(const struct ipfw_flow_id *pkt, const void *rule,
    uint32_t ruleid, uint16_t rulenum, uint32_t hashval)
{
	struct dyn_ipv4_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv4_parent_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_del))
			goto restart;
		/*
		 * NOTE: we do not need to check kidx, because parent rule
		 * can not create states with different kidx.
		 * And parent rule always created for forward direction.
		 */
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			/* Refresh expire time, avoid redundant stores. */
			if (s->limit->expire != time_uptime +
			    V_dyn_short_lifetime)
				ck_pr_store_32(&s->limit->expire,
				    time_uptime + V_dyn_short_lifetime);
			break;
		}
	}
	return (s);
}

/*
 * Locked variant of the parent lookup; bucket lock must be held.
 * Does not touch the expire time.
 */
static struct dyn_ipv4_state *
dyn_lookup_ipv4_parent_locked(const struct ipfw_flow_id *pkt,
    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t bucket)
{
	struct dyn_ipv4_state *s;

	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) {
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip)
			break;
	}
	return (s);
}


#ifdef INET6
static uint32_t
dyn_getscopeid(const struct ip_fw_args *args)
{

	/*
	 * If source or destination address is an scopeid address, we need
	 * determine the scope zone id to resolve address scope ambiguity.
	 */
	if (IN6_IS_ADDR_LINKLOCAL(&args->f_id.src_ip6) ||
	    IN6_IS_ADDR_LINKLOCAL(&args->f_id.dst_ip6)) {
		MPASS(args->oif != NULL ||
		    args->m->m_pkthdr.rcvif != NULL);
		return (in6_getscopezone(args->oif != NULL ? args->oif:
		    args->m->m_pkthdr.rcvif, IPV6_ADDR_SCOPE_LINKLOCAL));
	}
	return (0);
}

/*
 * Lookup IPv6 state.
 * Must be called in critical section.
 */
static struct dyn_ipv6_state *
dyn_lookup_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, struct ipfw_dyn_info *info, int pktlen)
{
	struct dyn_ipv6_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
	info->version = DYN_BUCKET_VERSION(bucket, ipv6_add);
restart:
	/* Restart iteration if a concurrent delete bumps the version. */
	version = DYN_BUCKET_VERSION(bucket, ipv6_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv6_del))
			goto restart;
		if (s->proto != pkt->proto || s->zoneid != zoneid)
			continue;
		if (info->kidx != 0 && s->kidx != info->kidx)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			info->direction = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
			info->direction = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
		    info->direction);
	return (s);
}

/*
 * Lookup IPv6 state.
 * Simplified version is used to check that matching state doesn't exist.
 * The bucket lock must be held; returns non-zero on match.
 */
static int
dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, int pktlen, uint32_t bucket, uint16_t kidx)
{
	struct dyn_ipv6_state *s;
	int dir;

	dir = MATCH_NONE;
	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
		if (s->proto != pkt->proto || s->kidx != kidx ||
		    s->zoneid != zoneid)
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			dir = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
			dir = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
	return (s != NULL);
}

/*
 * Lookup IPv6 parent (O_LIMIT_PARENT) state and refresh its expiration
 * time on match.  Lockless walk with restart on concurrent delete.
 */
static struct dyn_ipv6_state *
dyn_lookup_ipv6_parent(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t hashval)
{
	struct dyn_ipv6_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv6_parent_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_del))
			goto restart;
		/*
		 * NOTE: we do not need to check kidx, because parent rule
		 * can not create states with different kidx.
		 * Also parent rule always created for forward direction.
		 */
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			/* Refresh expire time, avoid redundant stores. */
			if (s->limit->expire != time_uptime +
			    V_dyn_short_lifetime)
				ck_pr_store_32(&s->limit->expire,
				    time_uptime + V_dyn_short_lifetime);
			break;
		}
	}
	return (s);
}

/*
 * Locked variant of the IPv6 parent lookup; bucket lock must be held.
 */
static struct dyn_ipv6_state *
dyn_lookup_ipv6_parent_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t bucket)
{
	struct dyn_ipv6_state *s;

	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6))
			break;
	}
	return (s);
}

#endif /* INET6 */

/*
 * Lookup dynamic state.
 *  pkt - filled by ipfw_chk() ipfw_flow_id;
 *  ulp - determined by ipfw_chk() upper level protocol header;
 *  dyn_info - info about matched state to return back;
 * Returns pointer to state's parent rule and dyn_info. If there is
 * no state, NULL is returned.
 * On match ipfw_dyn_lookup() updates state's counters.
 */
struct ip_fw *
ipfw_dyn_lookup_state(const struct ip_fw_args *args, const void *ulp,
    int pktlen, const ipfw_insn *cmd, struct ipfw_dyn_info *info)
{
	struct dyn_data *data;
	struct ip_fw *rule;

	IPFW_RLOCK_ASSERT(&V_layer3_chain);

	data = NULL;
	rule = NULL;
	info->kidx = cmd->arg1;
	info->direction = MATCH_NONE;
	info->hashval = hash_packet(&args->f_id);

	DYNSTATE_CRITICAL_ENTER();
	if (IS_IP4_FLOW_ID(&args->f_id)) {
		struct dyn_ipv4_state *s;

		s = dyn_lookup_ipv4_state(&args->f_id, ulp, info, pktlen);
		if (s != NULL) {
			/*
			 * Dynamic states are created using the same 5-tuple,
			 * so it is assumed, that parent rule for O_LIMIT
			 * state has the same address family.
			 */
			data = s->data;
			if (s->type == O_LIMIT) {
				s = data->parent;
				rule = s->limit->parent;
			} else
				rule = data->parent;
		}
	}
#ifdef INET6
	else if (IS_IP6_FLOW_ID(&args->f_id)) {
		struct dyn_ipv6_state *s;

		s = dyn_lookup_ipv6_state(&args->f_id, dyn_getscopeid(args),
		    ulp, info, pktlen);
		if (s != NULL) {
			data = s->data;
			if (s->type == O_LIMIT) {
				s = data->parent;
				rule = s->limit->parent;
			} else
				rule = data->parent;
		}
	}
#endif
	if (data != NULL) {
		/*
		 * If cached chain id is the same, we can avoid rule index
		 * lookup. Otherwise do lookup and update chain_id and f_pos.
		 * It is safe even if there is concurrent thread that want
		 * update the same state, because chain->id can be changed
		 * only under IPFW_WLOCK().
		 */
		if (data->chain_id != V_layer3_chain.id) {
			data->f_pos = ipfw_find_rule(&V_layer3_chain,
			    data->rulenum, data->ruleid);
			/*
			 * Check that found state has not orphaned.
			 * When chain->id being changed the parent
			 * rule can be deleted. If found rule doesn't
			 * match the parent pointer, consider this
			 * result as MATCH_NONE and return NULL.
			 *
			 * This will lead to creation of new similar state
			 * that will be added into head of this bucket.
			 * And the state that we currently have matched
			 * should be deleted by dyn_expire_states().
			 *
			 * In case when dyn_keep_states is enabled, return
			 * pointer to deleted rule and f_pos value
			 * corresponding to penultimate rule.
			 * When we have enabled V_dyn_keep_states, states
			 * that become orphaned will get the DYN_REFERENCED
			 * flag and rule will keep around. So we can return
			 * it. But since it is not in the rules map, we need
			 * return such f_pos value, so after the state
			 * handling if the search will continue, the next rule
			 * will be the last one - the default rule.
			 */
			if (V_layer3_chain.map[data->f_pos] == rule) {
				data->chain_id = V_layer3_chain.id;
				info->f_pos = data->f_pos;
			} else if (V_dyn_keep_states != 0) {
				/*
				 * The original rule pointer is still usable.
				 * So, we return it, but f_pos need to be
				 * changed to point to the penultimate rule.
				 */
				MPASS(V_layer3_chain.n_rules > 1);
				data->chain_id = V_layer3_chain.id;
				data->f_pos = V_layer3_chain.n_rules - 2;
				info->f_pos = data->f_pos;
			} else {
				rule = NULL;
				info->direction = MATCH_NONE;
				DYN_DEBUG("rule %p [%u, %u] is considered "
				    "invalid in data %p", rule, data->ruleid,
				    data->rulenum, data);
				/* info->f_pos doesn't matter here. */
			}
		} else
			info->f_pos = data->f_pos;
	}
	DYNSTATE_CRITICAL_EXIT();
#if 0
	/*
	 * Return MATCH_NONE if parent rule is in disabled set.
	 * This will lead to creation of new similar state that
	 * will be added into head of this bucket.
	 *
	 * XXXAE: we need to be able update state's set when parent
	 * rule set is changed.
	 */
	if (rule != NULL && (V_set_disable & (1 << rule->set))) {
		rule = NULL;
		info->direction = MATCH_NONE;
	}
#endif
	return (rule);
}

/*
 * Allocate and initialize a parent-state descriptor from its UMA zone.
 * Returns NULL on allocation failure; the log message is rate-limited
 * to once per second via the shared last_log timestamp.
 */
static struct dyn_parent *
dyn_alloc_parent(void *parent, uint32_t ruleid, uint16_t rulenum,
    uint8_t set, uint32_t hashval)
{
	struct dyn_parent *limit;

	limit = uma_zalloc(V_dyn_parent_zone, M_NOWAIT | M_ZERO);
	if (limit == NULL) {
		if (last_log != time_uptime) {
			last_log = time_uptime;
			log(LOG_DEBUG,
			    "ipfw: Cannot allocate parent dynamic state, "
			    "consider increasing "
			    "net.inet.ip.fw.dyn_parent_max\n");
		}
		return (NULL);
	}

	limit->parent = parent;
	limit->ruleid = ruleid;
	limit->rulenum = rulenum;
	limit->set = set;
	limit->hashval = hashval;
	limit->expire = time_uptime + V_dyn_short_lifetime;
	return (limit);
}

/*
 * Allocate and initialize the protocol-independent part of a dynamic
 * state and seed its ULP state/counters for the forward direction.
 * Returns NULL on allocation failure (rate-limited log message).
 */
static struct dyn_data *
dyn_alloc_dyndata(void *parent, uint32_t ruleid, uint16_t rulenum,
    uint8_t set, const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
    uint32_t hashval, uint16_t fibnum)
{
	struct dyn_data *data;

	data = uma_zalloc(V_dyn_data_zone, M_NOWAIT | M_ZERO);
	if (data == NULL) {
		if (last_log != time_uptime) {
			last_log = time_uptime;
			log(LOG_DEBUG,
			    "ipfw: Cannot allocate dynamic state, "
			    "consider increasing net.inet.ip.fw.dyn_max\n");
		}
		return (NULL);
	}

	data->parent = parent;
	data->ruleid = ruleid;
	data->rulenum = rulenum;
	data->set = set;
	data->fibnum = fibnum;
	data->hashval = hashval;
	data->expire = time_uptime + V_dyn_syn_lifetime;
	dyn_update_proto_state(data, pkt, ulp, pktlen, MATCH_FORWARD);
	return (data);
}

/*
 * Allocate an IPv4 state entry and copy the flow 5-tuple into it.
 * The data/limit pointer is filled in by the caller.
 */
static struct dyn_ipv4_state *
dyn_alloc_ipv4_state(const struct ipfw_flow_id *pkt, uint16_t kidx,
    uint8_t type)
{
	struct dyn_ipv4_state *s;

	s = uma_zalloc(V_dyn_ipv4_zone, M_NOWAIT | M_ZERO);
	if (s == NULL)
		return (NULL);

	s->type = type;
	s->kidx = kidx;
	s->proto = pkt->proto;
	s->sport = pkt->src_port;
	s->dport = pkt->dst_port;
	s->src = pkt->src_ip;
	s->dst = pkt->dst_ip;
	return (s);
}

/*
 * Add IPv4 parent state.
 * Returns pointer to parent state. When it is not NULL we are in
 * critical section and pointer protected by hazard pointer.
 * When some error occurs, it returns NULL and exit from critical section
 * is not needed.
 */
static struct dyn_ipv4_state *
dyn_add_ipv4_parent(void *rule, uint32_t ruleid, uint16_t rulenum,
    uint8_t set, const struct ipfw_flow_id *pkt, uint32_t hashval,
    uint32_t version, uint16_t kidx)
{
	struct dyn_ipv4_state *s;
	struct dyn_parent *limit;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		s = dyn_lookup_ipv4_parent_locked(pkt, rule, ruleid,
		    rulenum, bucket);
		if (s != NULL) {
			/*
			 * Simultaneous thread has already created this
			 * state. Just return it.
			 */
			DYNSTATE_CRITICAL_ENTER();
			DYNSTATE_PROTECT(s);
			DYN_BUCKET_UNLOCK(bucket);
			return (s);
		}
	}

	limit = dyn_alloc_parent(rule, ruleid, rulenum, set, hashval);
	if (limit == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (NULL);
	}

	s = dyn_alloc_ipv4_state(pkt, kidx, O_LIMIT_PARENT);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_parent_zone, limit);
		return (NULL);
	}

	s->limit = limit;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4_parent[bucket], s, entry);
	DYN_COUNT_INC(dyn_parent_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_parent_add);
	/* Enter critical section before dropping the bucket lock. */
	DYNSTATE_CRITICAL_ENTER();
	DYNSTATE_PROTECT(s);
	DYN_BUCKET_UNLOCK(bucket);
	return (s);
}

/*
 * Add an IPv4 dynamic state to the hash table.
 * Returns 0 on success, EEXIST when a simultaneous thread has already
 * installed a matching state, ENOMEM on allocation failure.
 */
static int
dyn_add_ipv4_state(void *parent, uint32_t ruleid, uint16_t rulenum,
    uint8_t set, const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
    uint32_t hashval, struct ipfw_dyn_info *info, uint16_t fibnum,
    uint16_t kidx, uint8_t type)
{
	struct dyn_ipv4_state *s;
	void *data;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (info->direction == MATCH_UNKNOWN ||
	    info->kidx != kidx ||
	    info->hashval != hashval ||
	    info->version != DYN_BUCKET_VERSION(bucket, ipv4_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		if (dyn_lookup_ipv4_state_locked(pkt, ulp, pktlen,
		    bucket, kidx) != 0) {
			DYN_BUCKET_UNLOCK(bucket);
			return (EEXIST);
		}
	}

	data = dyn_alloc_dyndata(parent, ruleid, rulenum, set, pkt, ulp,
	    pktlen, hashval, fibnum);
	if (data == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (ENOMEM);
	}

	s = dyn_alloc_ipv4_state(pkt, kidx, type);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_data_zone, data);
		return (ENOMEM);
	}

	s->data = data;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4[bucket], s, entry);
	DYN_COUNT_INC(dyn_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_add);
	DYN_BUCKET_UNLOCK(bucket);
	return (0);
}

#ifdef INET6
/*
 * Allocate an IPv6 state entry and copy the flow 5-tuple plus scope
 * zone id into it.  The data/limit pointer is filled in by the caller.
 */
static struct dyn_ipv6_state *
dyn_alloc_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    uint16_t kidx, uint8_t type)
{
	struct dyn_ipv6_state *s;

	s = uma_zalloc(V_dyn_ipv6_zone, M_NOWAIT | M_ZERO);
	if (s == NULL)
		return (NULL);

	s->type = type;
	s->kidx = kidx;
	s->zoneid = zoneid;
	s->proto = pkt->proto;
	s->sport = pkt->src_port;
	s->dport = pkt->dst_port;
	s->src = pkt->src_ip6;
	s->dst = pkt->dst_ip6;
	return (s);
}

/*
 * Add IPv6 parent state.
 * Returns pointer to parent state. When it is not NULL we are in
 * critical section and pointer protected by hazard pointer.
 * When some error occurs, it returns NULL and exit from critical section
 * is not needed.
 */
static struct dyn_ipv6_state *
dyn_add_ipv6_parent(void *rule, uint32_t ruleid, uint16_t rulenum,
    uint8_t set, const struct ipfw_flow_id *pkt, uint32_t zoneid,
    uint32_t hashval, uint32_t version, uint16_t kidx)
{
	struct dyn_ipv6_state *s;
	struct dyn_parent *limit;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		s = dyn_lookup_ipv6_parent_locked(pkt, zoneid, rule, ruleid,
		    rulenum, bucket);
		if (s != NULL) {
			/*
			 * Simultaneous thread has already created this
			 * state. Just return it.
			 */
			DYNSTATE_CRITICAL_ENTER();
			DYNSTATE_PROTECT(s);
			DYN_BUCKET_UNLOCK(bucket);
			return (s);
		}
	}

	limit = dyn_alloc_parent(rule, ruleid, rulenum, set, hashval);
	if (limit == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (NULL);
	}

	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, O_LIMIT_PARENT);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_parent_zone, limit);
		return (NULL);
	}

	s->limit = limit;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6_parent[bucket], s, entry);
	DYN_COUNT_INC(dyn_parent_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_parent_add);
	DYNSTATE_CRITICAL_ENTER();
	DYNSTATE_PROTECT(s);
	DYN_BUCKET_UNLOCK(bucket);
	return (s);
}

/*
 * Add an IPv6 dynamic state to the hash table.
 * Returns 0 on success, EEXIST when a simultaneous thread has already
 * installed a matching state, ENOMEM on allocation failure.
 */
static int
dyn_add_ipv6_state(void *parent, uint32_t ruleid, uint16_t rulenum,
    uint8_t set, const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, int pktlen, uint32_t hashval, struct ipfw_dyn_info *info,
    uint16_t fibnum, uint16_t kidx, uint8_t type)
{
	struct dyn_ipv6_state *s;
	struct dyn_data *data;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (info->direction == MATCH_UNKNOWN ||
	    info->kidx != kidx ||
	    info->hashval != hashval ||
	    info->version != DYN_BUCKET_VERSION(bucket, ipv6_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		if (dyn_lookup_ipv6_state_locked(pkt, zoneid, ulp, pktlen,
		    bucket, kidx) != 0) {
			DYN_BUCKET_UNLOCK(bucket);
			return (EEXIST);
		}
	}

	data = dyn_alloc_dyndata(parent, ruleid, rulenum, set, pkt, ulp,
	    pktlen, hashval, fibnum);
	if (data == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (ENOMEM);
	}

	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, type);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_data_zone, data);
		return (ENOMEM);
	}

	s->data = data;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6[bucket], s, entry);
	DYN_COUNT_INC(dyn_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_add);
	DYN_BUCKET_UNLOCK(bucket);
	return (0);
}
#endif /* INET6 */

/*
 * Find (or create) the parent state for an O_LIMIT flow and account a
 * new child session against it.  Returns the parent state pointer with
 * its count incremented, or NULL when the session limit is reached or
 * allocation fails.
 */
static void *
dyn_get_parent_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    struct ip_fw *rule, uint32_t hashval, uint32_t limit, uint16_t kidx)
{
	char sbuf[24];
	struct dyn_parent *p;
	void *ret;
	uint32_t bucket, version;

	p = NULL;
	ret = NULL;
	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYNSTATE_CRITICAL_ENTER();
	if (IS_IP4_FLOW_ID(pkt)) {
		struct dyn_ipv4_state *s;

		version = DYN_BUCKET_VERSION(bucket, ipv4_parent_add);
		s = dyn_lookup_ipv4_parent(pkt, rule, rule->id,
		    rule->rulenum, bucket);
		if (s == NULL) {
			/*
			 * Exit from critical section because dyn_add_parent()
			 * will acquire bucket lock.
			 */
			DYNSTATE_CRITICAL_EXIT();

			s = dyn_add_ipv4_parent(rule, rule->id,
			    rule->rulenum, rule->set, pkt, hashval,
			    version, kidx);
			if (s == NULL)
				return (NULL);
			/* Now we are in critical section again. */
		}
		ret = s;
		p = s->limit;
	}
#ifdef INET6
	else if (IS_IP6_FLOW_ID(pkt)) {
		struct dyn_ipv6_state *s;

		version = DYN_BUCKET_VERSION(bucket, ipv6_parent_add);
		s = dyn_lookup_ipv6_parent(pkt, zoneid, rule, rule->id,
		    rule->rulenum, bucket);
		if (s == NULL) {
			/*
			 * Exit from critical section because dyn_add_parent()
			 * can acquire bucket mutex.
			 */
			DYNSTATE_CRITICAL_EXIT();

			s = dyn_add_ipv6_parent(rule, rule->id,
			    rule->rulenum, rule->set, pkt, zoneid, hashval,
			    version, kidx);
			if (s == NULL)
				return (NULL);
			/* Now we are in critical section again. */
		}
		ret = s;
		p = s->limit;
	}
#endif
	else {
		DYNSTATE_CRITICAL_EXIT();
		return (NULL);
	}

	/* Check the limit */
	if (DPARENT_COUNT(p) >= limit) {
		DYNSTATE_CRITICAL_EXIT();
		if (V_fw_verbose && last_log != time_uptime) {
			last_log = time_uptime;
			snprintf(sbuf, sizeof(sbuf), "%u drop session",
			    rule->rulenum);
			print_dyn_rule_flags(pkt, O_LIMIT,
			    LOG_SECURITY | LOG_DEBUG, sbuf,
			    "too many entries");
		}
		return (NULL);
	}

	/* Take new session into account. */
	DPARENT_COUNT_INC(p);
	/*
	 * We must exit from critical section because the following code
	 * can acquire bucket mutex.
	 * We rely on the 'count' field. The state will not expire
	 * until it has some child states, i.e. 'count' field is not zero.
	 * Return state pointer, it will be used by child states as parent.
1865 */ 1866 DYNSTATE_CRITICAL_EXIT(); 1867 return (ret); 1868 } 1869 1870 static int 1871 dyn_install_state(const struct ipfw_flow_id *pkt, uint32_t zoneid, 1872 uint16_t fibnum, const void *ulp, int pktlen, void *rule, 1873 uint32_t ruleid, uint16_t rulenum, uint8_t set, 1874 struct ipfw_dyn_info *info, uint32_t limit, uint16_t limit_mask, 1875 uint16_t kidx, uint8_t type) 1876 { 1877 struct ipfw_flow_id id; 1878 uint32_t hashval, parent_hashval; 1879 int ret; 1880 1881 MPASS(type == O_LIMIT || type == O_KEEP_STATE); 1882 1883 if (type == O_LIMIT) { 1884 /* Create masked flow id and calculate bucket */ 1885 id.addr_type = pkt->addr_type; 1886 id.proto = pkt->proto; 1887 id.fib = fibnum; /* unused */ 1888 id.src_port = (limit_mask & DYN_SRC_PORT) ? 1889 pkt->src_port: 0; 1890 id.dst_port = (limit_mask & DYN_DST_PORT) ? 1891 pkt->dst_port: 0; 1892 if (IS_IP4_FLOW_ID(pkt)) { 1893 id.src_ip = (limit_mask & DYN_SRC_ADDR) ? 1894 pkt->src_ip: 0; 1895 id.dst_ip = (limit_mask & DYN_DST_ADDR) ? 1896 pkt->dst_ip: 0; 1897 } 1898 #ifdef INET6 1899 else if (IS_IP6_FLOW_ID(pkt)) { 1900 if (limit_mask & DYN_SRC_ADDR) 1901 id.src_ip6 = pkt->src_ip6; 1902 else 1903 memset(&id.src_ip6, 0, sizeof(id.src_ip6)); 1904 if (limit_mask & DYN_DST_ADDR) 1905 id.dst_ip6 = pkt->dst_ip6; 1906 else 1907 memset(&id.dst_ip6, 0, sizeof(id.dst_ip6)); 1908 } 1909 #endif 1910 else 1911 return (EAFNOSUPPORT); 1912 1913 parent_hashval = hash_parent(&id, rule); 1914 rule = dyn_get_parent_state(&id, zoneid, rule, parent_hashval, 1915 limit, kidx); 1916 if (rule == NULL) { 1917 #if 0 1918 if (V_fw_verbose && last_log != time_uptime) { 1919 last_log = time_uptime; 1920 snprintf(sbuf, sizeof(sbuf), 1921 "%u drop session", rule->rulenum); 1922 print_dyn_rule_flags(pkt, O_LIMIT, 1923 LOG_SECURITY | LOG_DEBUG, sbuf, 1924 "too many entries"); 1925 } 1926 #endif 1927 return (EACCES); 1928 } 1929 /* 1930 * Limit is not reached, create new state. 1931 * Now rule points to parent state. 
1932 */ 1933 } 1934 1935 hashval = hash_packet(pkt); 1936 if (IS_IP4_FLOW_ID(pkt)) 1937 ret = dyn_add_ipv4_state(rule, ruleid, rulenum, set, pkt, 1938 ulp, pktlen, hashval, info, fibnum, kidx, type); 1939 #ifdef INET6 1940 else if (IS_IP6_FLOW_ID(pkt)) 1941 ret = dyn_add_ipv6_state(rule, ruleid, rulenum, set, pkt, 1942 zoneid, ulp, pktlen, hashval, info, fibnum, kidx, type); 1943 #endif /* INET6 */ 1944 else 1945 ret = EAFNOSUPPORT; 1946 1947 if (type == O_LIMIT) { 1948 if (ret != 0) { 1949 /* 1950 * We failed to create child state for O_LIMIT 1951 * opcode. Since we already counted it in the parent, 1952 * we must revert counter back. The 'rule' points to 1953 * parent state, use it to get dyn_parent. 1954 * 1955 * XXXAE: it should be safe to use 'rule' pointer 1956 * without extra lookup, parent state is referenced 1957 * and should not be freed. 1958 */ 1959 if (IS_IP4_FLOW_ID(&id)) 1960 DPARENT_COUNT_DEC( 1961 ((struct dyn_ipv4_state *)rule)->limit); 1962 #ifdef INET6 1963 else if (IS_IP6_FLOW_ID(&id)) 1964 DPARENT_COUNT_DEC( 1965 ((struct dyn_ipv6_state *)rule)->limit); 1966 #endif 1967 } 1968 } 1969 /* 1970 * EEXIST means that simultaneous thread has created this 1971 * state. Consider this as success. 1972 * 1973 * XXXAE: should we invalidate 'info' content here? 1974 */ 1975 if (ret == EEXIST) 1976 return (0); 1977 return (ret); 1978 } 1979 1980 /* 1981 * Install dynamic state. 1982 * chain - ipfw's instance; 1983 * rule - the parent rule that installs the state; 1984 * cmd - opcode that installs the state; 1985 * args - ipfw arguments; 1986 * ulp - upper level protocol header; 1987 * pktlen - packet length; 1988 * info - dynamic state lookup info; 1989 * tablearg - tablearg id. 1990 * 1991 * Returns non-zero value (failure) if state is not installed because 1992 * of errors or because session limitations are enforced. 
 */
int
ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
    const ipfw_insn_limit *cmd, const struct ip_fw_args *args,
    const void *ulp, int pktlen, struct ipfw_dyn_info *info,
    uint32_t tablearg)
{
	uint32_t limit;
	uint16_t limit_mask;

	if (cmd->o.opcode == O_LIMIT) {
		/* Session limit may come indirectly via tablearg. */
		limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit);
		limit_mask = cmd->limit_mask;
	} else {
		limit = 0;
		limit_mask = 0;
	}
	return (dyn_install_state(&args->f_id,
#ifdef INET6
	    IS_IP6_FLOW_ID(&args->f_id) ? dyn_getscopeid(args):
#endif
	    0, M_GETFIB(args->m), ulp, pktlen, rule, rule->id, rule->rulenum,
	    rule->set, info, limit, limit_mask, cmd->o.arg1, cmd->o.opcode));
}

/*
 * Free safe to remove state entries from expired lists.
 */
static void
dyn_free_states(struct ip_fw_chain *chain)
{
	struct dyn_ipv4_state *s4, *s4n;
#ifdef INET6
	struct dyn_ipv6_state *s6, *s6n;
#endif
	int cached_count, i;

	/*
	 * We keep pointers to objects that are in use on each CPU
	 * in the per-cpu dyn_hp pointer. When object is going to be
	 * removed, first of it is unlinked from the corresponding
	 * list. This leads to changing of dyn_bucket_xxx_delver version.
	 * Unlinked objects is placed into corresponding dyn_expired_xxx
	 * list. Reader that is going to dereference object pointer checks
	 * dyn_bucket_xxx_delver version before and after storing pointer
	 * into dyn_hp. If version is the same, the object is protected
	 * from freeing and it is safe to dereference. Otherwise reader
	 * tries to iterate list again from the beginning, but this object
	 * now unlinked and thus will not be accessible.
	 *
	 * Copy dyn_hp pointers for each CPU into dyn_hp_cache array.
	 * It does not matter that some pointer can be changed in
	 * time while we are copying. We need to check, that objects
	 * removed in the previous pass are not in use. And if dyn_hp
	 * pointer does not contain it in the time when we are copying,
	 * it will not appear there, because it is already unlinked.
	 * And for new pointers we will not free objects that will be
	 * unlinked in this pass.
	 */
	cached_count = 0;
	CPU_FOREACH(i) {
		dyn_hp_cache[cached_count] = DYNSTATE_GET(i);
		if (dyn_hp_cache[cached_count] != NULL)
			cached_count++;
	}

	/*
	 * Free expired states that are safe to free.
	 * Check each entry from previous pass in the dyn_expired_xxx
	 * list, if pointer to the object is in the dyn_hp_cache array,
	 * keep it until next pass. Otherwise it is safe to free the
	 * object.
	 *
	 * XXXAE: optimize this to use SLIST_REMOVE_AFTER.
	 */
#define	DYN_FREE_STATES(s, next, name)		do {		\
	s = SLIST_FIRST(&V_dyn_expired_ ## name);		\
	while (s != NULL) {					\
		next = SLIST_NEXT(s, expired);			\
		for (i = 0; i < cached_count; i++)		\
			if (dyn_hp_cache[i] == s)		\
				break;				\
		if (i == cached_count) {			\
			if (s->type == O_LIMIT_PARENT &&	\
			    s->limit->count != 0) {		\
				s = next;			\
				continue;			\
			}					\
			SLIST_REMOVE(&V_dyn_expired_ ## name,	\
			    s, dyn_ ## name ## _state, expired); \
			if (s->type == O_LIMIT_PARENT)		\
				uma_zfree(V_dyn_parent_zone, s->limit); \
			else					\
				uma_zfree(V_dyn_data_zone, s->data); \
			uma_zfree(V_dyn_ ## name ## _zone, s);	\
		}						\
		s = next;					\
	}							\
} while (0)

	/*
	 * Protect access to expired lists with DYN_EXPIRED_LOCK.
	 * Userland can invoke ipfw_expire_dyn_states() to delete
	 * specific states, this will lead to modification of expired
	 * lists.
	 *
	 * XXXAE: do we need DYN_EXPIRED_LOCK? We can just use
	 * IPFW_UH_WLOCK to protect access to these lists.
	 */
	DYN_EXPIRED_LOCK();
	DYN_FREE_STATES(s4, s4n, ipv4);
#ifdef INET6
	DYN_FREE_STATES(s6, s6n, ipv6);
#endif
	DYN_EXPIRED_UNLOCK();
#undef	DYN_FREE_STATES
}

/*
 * Returns:
 * 0 when state is not matched by specified range;
 * 1 when state is matched by specified range;
 * 2 when state is matched by specified range and requested deletion of
 *   dynamic states.
 */
static int
dyn_match_range(uint16_t rulenum, uint8_t set, const ipfw_range_tlv *rt)
{

	MPASS(rt != NULL);
	/* flush all states */
	if (rt->flags & IPFW_RCFLAG_ALL) {
		if (rt->flags & IPFW_RCFLAG_DYNAMIC)
			return (2); /* forced */
		return (1);
	}
	if ((rt->flags & IPFW_RCFLAG_SET) != 0 && set != rt->set)
		return (0);
	if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 &&
	    (rulenum < rt->start_rule || rulenum > rt->end_rule))
		return (0);
	if (rt->flags & IPFW_RCFLAG_DYNAMIC)
		return (2);
	return (1);
}

/*
 * Take references on the parent rule and its named state object, so
 * that an orphaned state keeps them alive after the rule is deleted.
 */
static void
dyn_acquire_rule(struct ip_fw_chain *ch, struct dyn_data *data,
    struct ip_fw *rule, uint16_t kidx)
{
	struct dyn_state_obj *obj;

	/*
	 * Do not acquire reference twice.
	 * This can happen when rule deletion executed for
	 * the same range, but different ruleset id.
	 */
	if (data->flags & DYN_REFERENCED)
		return;

	IPFW_UH_WLOCK_ASSERT(ch);
	MPASS(kidx != 0);

	data->flags |= DYN_REFERENCED;
	/* Reference the named object */
	obj = SRV_OBJECT(ch, kidx);
	obj->no.refcnt++;
	MPASS(obj->no.etlv == IPFW_TLV_STATE_NAME);

	/* Reference the parent rule */
	rule->refcnt++;
}

/*
 * Drop the references taken by dyn_acquire_rule(), destroying the
 * named object and/or freeing the rule when the last reference goes.
 */
static void
dyn_release_rule(struct ip_fw_chain *ch, struct dyn_data *data,
    struct ip_fw *rule, uint16_t kidx)
{
	struct dyn_state_obj *obj;

	IPFW_UH_WLOCK_ASSERT(ch);
	MPASS(kidx != 0);

	obj = SRV_OBJECT(ch, kidx);
	if (obj->no.refcnt == 1)
		dyn_destroy(ch, &obj->no);
	else
		obj->no.refcnt--;

	if (--rule->refcnt == 1)
		ipfw_free_rule(rule);
}

/*
 * We do not keep O_LIMIT_PARENT states when V_dyn_keep_states is enabled.
 * O_LIMIT state is created when new connection is going to be established
 * and there is no matching state. So, since the old parent rule was deleted
 * we can't create new states with old parent, and thus we can not account
 * new connections with already established connections, and can not do
 * proper limiting.
 */
static int
dyn_match_ipv4_state(struct ip_fw_chain *ch, struct dyn_ipv4_state *s,
    const ipfw_range_tlv *rt)
{
	struct ip_fw *rule;
	int ret;

	if (s->type == O_LIMIT_PARENT)
		return (dyn_match_range(s->limit->rulenum,
		    s->limit->set, rt));

	ret = dyn_match_range(s->data->rulenum, s->data->set, rt);
	if (ret == 0 || V_dyn_keep_states == 0 || ret > 1)
		return (ret);

	/* Keep the state as orphaned: reference its rule and object. */
	rule = s->data->parent;
	if (s->type == O_LIMIT)
		rule = ((struct dyn_ipv4_state *)rule)->limit->parent;
	dyn_acquire_rule(ch, s->data, rule, s->kidx);
	return (0);
}

#ifdef INET6
/* IPv6 counterpart of dyn_match_ipv4_state(); same return convention. */
static int
dyn_match_ipv6_state(struct ip_fw_chain *ch, struct dyn_ipv6_state *s,
    const ipfw_range_tlv *rt)
{
	struct ip_fw *rule;
	int ret;

	if (s->type == O_LIMIT_PARENT)
		return (dyn_match_range(s->limit->rulenum,
		    s->limit->set, rt));

	ret = dyn_match_range(s->data->rulenum, s->data->set, rt);
	if (ret == 0 || V_dyn_keep_states == 0 || ret > 1)
		return (ret);

	rule = s->data->parent;
	if (s->type == O_LIMIT)
		rule = ((struct dyn_ipv6_state *)rule)->limit->parent;
	dyn_acquire_rule(ch, s->data, rule, s->kidx);
	return (0);
}
#endif

/*
 * Unlink expired entries from states lists.
 * @rt can be used to specify the range of states for deletion.
 */
static void
dyn_expire_states(struct ip_fw_chain *ch, ipfw_range_tlv *rt)
{
	struct dyn_ipv4_slist expired_ipv4;
#ifdef INET6
	struct dyn_ipv6_slist expired_ipv6;
	struct dyn_ipv6_state *s6, *s6n, *s6p;
#endif
	struct dyn_ipv4_state *s4, *s4n, *s4p;
	void *rule;
	int bucket, removed, length, max_length;

	IPFW_UH_WLOCK_ASSERT(ch);

	/*
	 * Unlink expired states from each bucket.
	 * With acquired bucket lock iterate entries of each lists:
	 * ipv4, ipv4_parent, ipv6, and ipv6_parent. Check expired time
	 * and unlink entry from the list, link entry into temporary
	 * expired_xxx lists then bump "del" bucket version.
	 *
	 * When an entry is removed, corresponding states counter is
	 * decremented. If entry has O_LIMIT type, parent's reference
	 * counter is decremented.
	 *
	 * NOTE: this function can be called from userspace context
	 * when user deletes rules. In this case all matched states
	 * will be forcibly unlinked. O_LIMIT_PARENT states will be kept
	 * in the expired lists until reference counter become zero.
	 */
#define	DYN_UNLINK_STATES(s, prev, next, exp, af, name, extra)	do {	\
	length = 0;						\
	removed = 0;						\
	prev = NULL;						\
	s = CK_SLIST_FIRST(&V_dyn_ ## name [bucket]);		\
	while (s != NULL) {					\
		next = CK_SLIST_NEXT(s, entry);			\
		if ((TIME_LEQ((s)->exp, time_uptime) && extra) || \
		    (rt != NULL &&				\
		     dyn_match_ ## af ## _state(ch, s, rt))) {	\
			if (prev != NULL)			\
				CK_SLIST_REMOVE_AFTER(prev, entry); \
			else					\
				CK_SLIST_REMOVE_HEAD(		\
				    &V_dyn_ ## name [bucket], entry); \
			removed++;				\
			SLIST_INSERT_HEAD(&expired_ ## af, s, expired); \
			if (s->type == O_LIMIT_PARENT)		\
				DYN_COUNT_DEC(dyn_parent_count); \
			else {					\
				DYN_COUNT_DEC(dyn_count);	\
				if (s->data->flags & DYN_REFERENCED) { \
					rule = s->data->parent;	\
					if (s->type == O_LIMIT)	\
						rule = ((__typeof(s)) \
						    rule)->limit->parent;\
					dyn_release_rule(ch, s->data, \
					    rule, s->kidx);	\
				}				\
				if (s->type == O_LIMIT) {	\
					s = s->data->parent;	\
					DPARENT_COUNT_DEC(s->limit); \
				}				\
			}					\
		} else {					\
			prev = s;				\
			length++;				\
		}						\
		s = next;					\
	}							\
	if (removed != 0)					\
		DYN_BUCKET_VERSION_BUMP(bucket, name ## _del);	\
	if (length > max_length)				\
		max_length = length;				\
} while (0)

	SLIST_INIT(&expired_ipv4);
#ifdef INET6
	SLIST_INIT(&expired_ipv6);
#endif
	max_length = 0;
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_BUCKET_LOCK(bucket);
		DYN_UNLINK_STATES(s4, s4p, s4n, data->expire, ipv4, ipv4, 1);
		DYN_UNLINK_STATES(s4, s4p, s4n, limit->expire, ipv4,
		    ipv4_parent, (s4->limit->count == 0));
#ifdef INET6
		DYN_UNLINK_STATES(s6, s6p, s6n, data->expire, ipv6, ipv6, 1);
		DYN_UNLINK_STATES(s6, s6p, s6n, limit->expire, ipv6,
		    ipv6_parent, (s6->limit->count == 0));
#endif
		DYN_BUCKET_UNLOCK(bucket);
	}
	/* Update curr_max_length for statistics. */
	V_curr_max_length = max_length;
	/*
	 * Concatenate temporary lists with global expired lists.
	 */
	DYN_EXPIRED_LOCK();
	SLIST_CONCAT(&V_dyn_expired_ipv4, &expired_ipv4,
	    dyn_ipv4_state, expired);
#ifdef INET6
	SLIST_CONCAT(&V_dyn_expired_ipv6, &expired_ipv6,
	    dyn_ipv6_state, expired);
#endif
	DYN_EXPIRED_UNLOCK();
#undef	DYN_UNLINK_STATES
#undef	DYN_UNREF_STATES
}

/*
 * Allocate an mbuf for a keep-alive packet: @len zeroed bytes, bound
 * to @fibnum and marked to bypass further firewall processing.
 * Returns NULL on allocation failure.
 */
static struct mbuf *
dyn_mgethdr(int len, uint16_t fibnum)
{
	struct mbuf *m;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
#ifdef MAC
	mac_netinet_firewall_send(m);
#endif
	M_SETFIB(m, fibnum);
	m->m_data += max_linkhdr;
	m->m_flags |= M_SKIP_FIREWALL;
	m->m_len = m->m_pkthdr.len = len;
	bzero(m->m_data, len);
	return (m);
}

/*
 * Fill @m with an IPv4 TCP keep-alive (bare ACK) for the given
 * addresses, ports, and sequence numbers; TCP checksum is left for
 * hardware/stack offload via csum_flags.
 */
static void
dyn_make_keepalive_ipv4(struct mbuf *m, in_addr_t src, in_addr_t dst,
    uint32_t seq, uint32_t ack, uint16_t sport, uint16_t dport)
{
	struct tcphdr *tcp;
	struct ip *ip;

	ip = mtod(m, struct ip *);
	ip->ip_v = 4;
	ip->ip_hl = sizeof(*ip) >> 2;
	ip->ip_tos = IPTOS_LOWDELAY;
	ip->ip_len = htons(m->m_len);
	ip->ip_off |= htons(IP_DF);
	ip->ip_ttl = V_ip_defttl;
	ip->ip_p = IPPROTO_TCP;
	ip->ip_src.s_addr = htonl(src);
	ip->ip_dst.s_addr = htonl(dst);

	tcp = mtodo(m, sizeof(struct ip));
	tcp->th_sport = htons(sport);
	tcp->th_dport = htons(dport);
	tcp->th_off = sizeof(struct tcphdr) >> 2;
	tcp->th_seq = htonl(seq);
	tcp->th_ack = htonl(ack);
	tcp->th_flags = TH_ACK;
	tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htons(sizeof(struct tcphdr) + IPPROTO_TCP));

	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	m->m_pkthdr.csum_flags = CSUM_TCP;
}

/*
 * Queue keep-alives for both directions of TCP state @s that have not
 * yet been acknowledged; drops the mbuf and logs when @q is full.
 */
static void
dyn_enqueue_keepalive_ipv4(struct mbufq *q, const struct dyn_ipv4_state *s)
{
	struct mbuf *m;

	if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) {
		m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr),
		    s->data->fibnum);
		if (m != NULL) {
			dyn_make_keepalive_ipv4(m, s->dst, s->src,
			    s->data->ack_fwd - 1, s->data->ack_rev,
			    s->dport, s->sport);
			if (mbufq_enqueue(q, m)) {
				m_freem(m);
				log(LOG_DEBUG, "ipfw: limit for IPv4 "
				    "keepalive queue is reached.\n");
				return;
			}
		}
	}

	if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) {
		m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr),
		    s->data->fibnum);
		if (m != NULL) {
			dyn_make_keepalive_ipv4(m, s->src, s->dst,
			    s->data->ack_rev - 1, s->data->ack_fwd,
			    s->sport, s->dport);
			if (mbufq_enqueue(q, m)) {
				m_freem(m);
				log(LOG_DEBUG, "ipfw: limit for IPv4 "
				    "keepalive queue is reached.\n");
				return;
			}
		}
	}
}

/*
 * Prepare and send keep-alive packets.
 */
static void
dyn_send_keepalive_ipv4(struct ip_fw_chain *chain)
{
	struct mbufq q;
	struct mbuf *m;
	struct dyn_ipv4_state *s;
	uint32_t bucket;

	mbufq_init(&q, INT_MAX);
	IPFW_UH_RLOCK(chain);
	/*
	 * It is safe to not use hazard pointer and just do lockless
	 * access to the lists, because states entries can not be deleted
	 * while we hold IPFW_UH_RLOCK.
	 */
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
			/*
			 * Only established TCP connections that will
			 * become expired within dyn_keepalive_interval.
			 */
			if (s->proto != IPPROTO_TCP ||
			    (s->data->state & BOTH_SYN) != BOTH_SYN ||
			    TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
			    s->data->expire))
				continue;
			dyn_enqueue_keepalive_ipv4(&q, s);
		}
	}
	IPFW_UH_RUNLOCK(chain);
	/* Transmit outside of the lock. */
	while ((m = mbufq_dequeue(&q)) != NULL)
		ip_output(m, NULL, NULL, 0, NULL, NULL);
}

#ifdef INET6
/*
 * Fill @m with an IPv6 TCP keep-alive (bare ACK); embeds @zoneid into
 * link-local addresses and precomputes the pseudo-header checksum.
 */
static void
dyn_make_keepalive_ipv6(struct mbuf *m, const struct in6_addr *src,
    const struct in6_addr *dst, uint32_t zoneid, uint32_t seq, uint32_t ack,
    uint16_t sport, uint16_t dport)
{
	struct tcphdr *tcp;
	struct ip6_hdr *ip6;

	ip6 = mtod(m, struct ip6_hdr *);
	ip6->ip6_vfc |= IPV6_VERSION;
	ip6->ip6_plen = htons(sizeof(struct tcphdr));
	ip6->ip6_nxt = IPPROTO_TCP;
	ip6->ip6_hlim = IPV6_DEFHLIM;
	ip6->ip6_src = *src;
	if (IN6_IS_ADDR_LINKLOCAL(src))
		ip6->ip6_src.s6_addr16[1] = htons(zoneid & 0xffff);
	ip6->ip6_dst = *dst;
	if (IN6_IS_ADDR_LINKLOCAL(dst))
		ip6->ip6_dst.s6_addr16[1] = htons(zoneid & 0xffff);

	tcp = mtodo(m, sizeof(struct ip6_hdr));
	tcp->th_sport = htons(sport);
	tcp->th_dport = htons(dport);
	tcp->th_off = sizeof(struct tcphdr) >> 2;
	tcp->th_seq = htonl(seq);
	tcp->th_ack = htonl(ack);
	tcp->th_flags = TH_ACK;
	tcp->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr),
	    IPPROTO_TCP, 0);

	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
}

/* IPv6 counterpart of dyn_enqueue_keepalive_ipv4(). */
static void
dyn_enqueue_keepalive_ipv6(struct mbufq *q, const struct dyn_ipv6_state *s)
{
	struct mbuf *m;

	if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) {
		m = dyn_mgethdr(sizeof(struct ip6_hdr) +
		    sizeof(struct tcphdr), s->data->fibnum);
		if (m != NULL) {
			dyn_make_keepalive_ipv6(m, &s->dst, &s->src,
			    s->zoneid, s->data->ack_fwd - 1, s->data->ack_rev,
			    s->dport, s->sport);
			if (mbufq_enqueue(q, m)) {
				m_freem(m);
				log(LOG_DEBUG, "ipfw: limit for IPv6 "
				    "keepalive queue is reached.\n");
				return;
			}
		}
	}

	if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) {
		m = dyn_mgethdr(sizeof(struct ip6_hdr) +
		    sizeof(struct tcphdr), s->data->fibnum);
		if (m != NULL) {
			dyn_make_keepalive_ipv6(m, &s->src, &s->dst,
			    s->zoneid, s->data->ack_rev - 1, s->data->ack_fwd,
			    s->sport, s->dport);
			if (mbufq_enqueue(q, m)) {
				m_freem(m);
				log(LOG_DEBUG, "ipfw: limit for IPv6 "
				    "keepalive queue is reached.\n");
				return;
			}
		}
	}
}

/* IPv6 counterpart of dyn_send_keepalive_ipv4(). */
static void
dyn_send_keepalive_ipv6(struct ip_fw_chain *chain)
{
	struct mbufq q;
	struct mbuf *m;
	struct dyn_ipv6_state *s;
	uint32_t bucket;

	mbufq_init(&q, INT_MAX);
	IPFW_UH_RLOCK(chain);
	/*
	 * It is safe to not use hazard pointer and just do lockless
	 * access to the lists, because states entries can not be deleted
	 * while we hold IPFW_UH_RLOCK.
	 */
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
			/*
			 * Only established TCP connections that will
			 * become expired within dyn_keepalive_interval.
			 */
			if (s->proto != IPPROTO_TCP ||
			    (s->data->state & BOTH_SYN) != BOTH_SYN ||
			    TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
			    s->data->expire))
				continue;
			dyn_enqueue_keepalive_ipv6(&q, s);
		}
	}
	IPFW_UH_RUNLOCK(chain);
	while ((m = mbufq_dequeue(&q)) != NULL)
		ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
}
#endif /* INET6 */

/*
 * Resize the hash tables to @new buckets: allocate new tables/locks,
 * re-link every state under both write locks, swap the pointers into
 * the VNET globals, then free the old resources.
 */
static void
dyn_grow_hashtable(struct ip_fw_chain *chain, uint32_t new)
{
#ifdef INET6
	struct dyn_ipv6ck_slist *ipv6, *ipv6_parent;
	uint32_t *ipv6_add, *ipv6_del, *ipv6_parent_add, *ipv6_parent_del;
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4ck_slist *ipv4, *ipv4_parent;
	uint32_t *ipv4_add, *ipv4_del, *ipv4_parent_add, *ipv4_parent_del;
	struct dyn_ipv4_state *s4;
	struct mtx *bucket_lock;
	void *tmp;
	uint32_t bucket;

	MPASS(powerof2(new));
	DYN_DEBUG("grow hash size %u -> %u", V_curr_dyn_buckets, new);
	/*
	 * Allocate and initialize new lists.
	 * XXXAE: on memory pressure this can disable callout timer.
	 */
	bucket_lock = malloc(new * sizeof(struct mtx), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv4 = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv4_parent = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv4_add = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
	ipv4_del = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
	ipv4_parent_add = malloc(new * sizeof(uint32_t), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv4_parent_del = malloc(new * sizeof(uint32_t), M_IPFW,
	    M_WAITOK | M_ZERO);
#ifdef INET6
	ipv6 = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv6_parent = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv6_add = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
	ipv6_del = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
	ipv6_parent_add = malloc(new * sizeof(uint32_t), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv6_parent_del = malloc(new * sizeof(uint32_t), M_IPFW,
	    M_WAITOK | M_ZERO);
#endif
	for (bucket = 0; bucket < new; bucket++) {
		DYN_BUCKET_LOCK_INIT(bucket_lock, bucket);
		CK_SLIST_INIT(&ipv4[bucket]);
		CK_SLIST_INIT(&ipv4_parent[bucket]);
#ifdef INET6
		CK_SLIST_INIT(&ipv6[bucket]);
		CK_SLIST_INIT(&ipv6_parent[bucket]);
#endif
	}

#define	DYN_RELINK_STATES(s, hval, i, head, ohead)	do {		\
	while ((s = CK_SLIST_FIRST(&V_dyn_ ## ohead[i])) != NULL) {	\
		CK_SLIST_REMOVE_HEAD(&V_dyn_ ## ohead[i], entry);	\
		CK_SLIST_INSERT_HEAD(&head[DYN_BUCKET(s->hval, new)],	\
		    s, entry);						\
	}								\
} while (0)
	/*
	 * Prevent rules changing from userland.
	 */
	IPFW_UH_WLOCK(chain);
	/*
	 * Hold traffic processing until we finish resize to
	 * prevent access to states lists.
	 */
	IPFW_WLOCK(chain);
	/* Re-link all dynamic states */
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_RELINK_STATES(s4, data->hashval, bucket, ipv4, ipv4);
		DYN_RELINK_STATES(s4, limit->hashval, bucket, ipv4_parent,
		    ipv4_parent);
#ifdef INET6
		DYN_RELINK_STATES(s6, data->hashval, bucket, ipv6, ipv6);
		DYN_RELINK_STATES(s6, limit->hashval, bucket, ipv6_parent,
		    ipv6_parent);
#endif
	}

#define	DYN_SWAP_PTR(old, new, tmp)	do {		\
	tmp = old;					\
	old = new;					\
	new = tmp;					\
} while (0)
	/* Swap pointers */
	DYN_SWAP_PTR(V_dyn_bucket_lock, bucket_lock, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4, ipv4, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_parent, ipv4_parent, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_add, ipv4_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_parent_add, ipv4_parent_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_del, ipv4_del, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_parent_del, ipv4_parent_del, tmp);

#ifdef INET6
	DYN_SWAP_PTR(V_dyn_ipv6, ipv6, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_parent, ipv6_parent, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_add, ipv6_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_parent_add, ipv6_parent_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_del, ipv6_del, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_parent_del, ipv6_parent_del, tmp);
#endif
	bucket = V_curr_dyn_buckets;
	V_curr_dyn_buckets = new;

	IPFW_WUNLOCK(chain);
	IPFW_UH_WUNLOCK(chain);

	/* Release old resources */
	while (bucket-- != 0)
		DYN_BUCKET_LOCK_DESTROY(bucket_lock, bucket);
	free(bucket_lock, M_IPFW);
	free(ipv4, M_IPFW);
	free(ipv4_parent, M_IPFW);
	free(ipv4_add, M_IPFW);
	free(ipv4_parent_add, M_IPFW);
	free(ipv4_del, M_IPFW);
	free(ipv4_parent_del, M_IPFW);
#ifdef INET6
	free(ipv6, M_IPFW);
	free(ipv6_parent, M_IPFW);
	free(ipv6_add, M_IPFW);
	free(ipv6_parent_add, M_IPFW);
	free(ipv6_del, M_IPFW);
	free(ipv6_parent_del, M_IPFW);
#endif
}

/*
 * This function is used to perform various maintenance
 * on dynamic hash lists. Currently it is called every second.
 */
static void
dyn_tick(void *vnetx)
{
	uint32_t buckets;

	CURVNET_SET((struct vnet *)vnetx);
	/*
	 * First free states unlinked in previous passes.
	 */
	dyn_free_states(&V_layer3_chain);
	/*
	 * Now unlink others expired states.
	 * We use IPFW_UH_WLOCK to avoid concurrent call of
	 * dyn_expire_states(). It is the only function that does
	 * deletion of state entries from states lists.
	 */
	IPFW_UH_WLOCK(&V_layer3_chain);
	dyn_expire_states(&V_layer3_chain, NULL);
	IPFW_UH_WUNLOCK(&V_layer3_chain);
	/*
	 * Send keepalives if they are enabled and the time has come.
	 */
	if (V_dyn_keepalive != 0 &&
	    V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) {
		V_dyn_keepalive_last = time_uptime;
		dyn_send_keepalive_ipv4(&V_layer3_chain);
#ifdef INET6
		dyn_send_keepalive_ipv6(&V_layer3_chain);
#endif
	}
	/*
	 * Check if we need to resize the hash:
	 * if current number of states exceeds number of buckets in hash,
	 * and dyn_buckets_max permits to grow the number of buckets, then
	 * do it. Grow hash size to the minimum power of 2 which is bigger
	 * than current states count.
	 */
	if (V_curr_dyn_buckets < V_dyn_buckets_max &&
	    (V_curr_dyn_buckets < V_dyn_count / 2 || (
	    V_curr_dyn_buckets < V_dyn_count && V_curr_max_length > 8))) {
		buckets = 1 << fls(V_dyn_count);
		if (buckets > V_dyn_buckets_max)
			buckets = V_dyn_buckets_max;
		dyn_grow_hashtable(&V_layer3_chain, buckets);
	}

	/* Re-arm ourselves for the next second. */
	callout_reset_on(&V_dyn_timeout, hz, dyn_tick, vnetx, 0);
	CURVNET_RESTORE();
}

/*
 * Forcibly unlink dynamic states matching @rt; called from the rule
 * deletion path with the UH write lock held.
 */
void
ipfw_expire_dyn_states(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
{
	/*
	 * Do not perform any checks if we currently have no dynamic states
	 */
	if (V_dyn_count == 0)
		return;

	IPFW_UH_WLOCK_ASSERT(chain);
	dyn_expire_states(chain, rt);
}

/*
 * Pass through all states and reset eaction for orphaned rules.
 */
void
ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint16_t eaction_id,
    uint16_t default_id, uint16_t instance_id)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	struct ip_fw *rule;
	uint32_t bucket;

#define	DYN_RESET_EACTION(s, h, b)				\
	CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) {		\
		if ((s->data->flags & DYN_REFERENCED) == 0)	\
			continue;				\
		rule = s->data->parent;				\
		if (s->type == O_LIMIT)				\
			rule = ((__typeof(s))rule)->limit->parent; \
		ipfw_reset_eaction(ch, rule, eaction_id,	\
		    default_id, instance_id);			\
	}

	IPFW_UH_WLOCK_ASSERT(ch);
	if (V_dyn_count == 0)
		return;
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_RESET_EACTION(s4, ipv4, bucket);
#ifdef INET6
		DYN_RESET_EACTION(s6, ipv6, bucket);
#endif
	}
}

/*
 * Returns size of dynamic states in legacy format
 */
int
ipfw_dyn_len(void)
{

	return ((V_dyn_count + V_dyn_parent_count) * sizeof(ipfw_dyn_rule));
}

/*
 * Returns number of dynamic
 * states.
 * Marks every named object index used by dynamic states with bit in @bmask.
 * Returns number of named objects accounted in bmask via @nocnt.
 * Used by dump format v1 (current).
 */
uint32_t
ipfw_dyn_get_count(uint32_t *bmask, int *nocnt)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	uint32_t bucket;

#define	DYN_COUNT_OBJECTS(s, h, b)				\
	CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) {		\
		MPASS(s->kidx != 0);				\
		if (ipfw_mark_object_kidx(bmask, IPFW_TLV_STATE_NAME, \
		    s->kidx) != 0)				\
			(*nocnt)++;				\
	}

	IPFW_UH_RLOCK_ASSERT(&V_layer3_chain);

	/* No need to pass through all the buckets. */
	*nocnt = 0;
	if (V_dyn_count + V_dyn_parent_count == 0)
		return (0);

	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_COUNT_OBJECTS(s4, ipv4, bucket);
#ifdef INET6
		DYN_COUNT_OBJECTS(s6, ipv6, bucket);
#endif
	}

	return (V_dyn_count + V_dyn_parent_count);
}

/*
 * Check if rule contains at least one dynamic opcode.
 *
 * Returns 1 if such opcode is found, 0 otherwise.
 */
int
ipfw_is_dyn_rule(struct ip_fw *rule)
{
	int cmdlen, l;
	ipfw_insn *cmd;

	l = rule->cmd_len;
	cmd = rule->cmd;
	cmdlen = 0;
	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
		cmdlen = F_LEN(cmd);

		switch (cmd->opcode) {
		case O_LIMIT:
		case O_KEEP_STATE:
		case O_PROBE_STATE:
		case O_CHECK_STATE:
			return (1);
		}
	}

	return (0);
}

/*
 * Export an O_LIMIT_PARENT state into the legacy ipfw_dyn_rule layout.
 */
static void
dyn_export_parent(const struct dyn_parent *p, uint16_t kidx,
    ipfw_dyn_rule *dst)
{

	dst->dyn_type = O_LIMIT_PARENT;
	dst->kidx = kidx;
	dst->count = (uint16_t)DPARENT_COUNT(p);
	dst->expire = TIME_LEQ(p->expire, time_uptime) ?  0:
	    p->expire - time_uptime;

	/* 'rule' is used to pass up the rule number and set */
	memcpy(&dst->rule, &p->rulenum, sizeof(p->rulenum));
	/* store set number into high word of dst->rule pointer. */
	memcpy((char *)&dst->rule + sizeof(p->rulenum), &p->set,
	    sizeof(p->set));

	/* unused fields */
	dst->pcnt = 0;
	dst->bcnt = 0;
	dst->parent = NULL;
	dst->state = 0;
	dst->ack_fwd = 0;
	dst->ack_rev = 0;
	dst->bucket = p->hashval;
	/*
	 * The legacy userland code will interpret a NULL here as a marker
	 * for the last dynamic rule.
	 */
	dst->next = (ipfw_dyn_rule *)1;
}

/*
 * Export a data-carrying (O_KEEP_STATE/O_LIMIT) state into the legacy
 * ipfw_dyn_rule layout; orphaned states are flagged IPFW_DYN_ORPHANED.
 */
static void
dyn_export_data(const struct dyn_data *data, uint16_t kidx, uint8_t type,
    ipfw_dyn_rule *dst)
{

	dst->dyn_type = type;
	dst->kidx = kidx;
	dst->pcnt = data->pcnt_fwd + data->pcnt_rev;
	dst->bcnt = data->bcnt_fwd + data->bcnt_rev;
	dst->expire = TIME_LEQ(data->expire, time_uptime) ?  0:
	    data->expire - time_uptime;

	/* 'rule' is used to pass up the rule number and set */
	memcpy(&dst->rule, &data->rulenum, sizeof(data->rulenum));
	/* store set number into high word of dst->rule pointer. */
	memcpy((char *)&dst->rule + sizeof(data->rulenum), &data->set,
	    sizeof(data->set));

	dst->state = data->state;
	if (data->flags & DYN_REFERENCED)
		dst->state |= IPFW_DYN_ORPHANED;

	/* unused fields */
	dst->parent = NULL;
	dst->ack_fwd = data->ack_fwd;
	dst->ack_rev = data->ack_rev;
	dst->count = 0;
	dst->bucket = data->hashval;
	/*
	 * The legacy userland code will interpret a NULL here as a marker
	 * for the last dynamic rule.
	 */
	dst->next = (ipfw_dyn_rule *)1;
}

/* Export an IPv4 state (any type) into the legacy layout. */
static void
dyn_export_ipv4_state(const struct dyn_ipv4_state *s, ipfw_dyn_rule *dst)
{

	switch (s->type) {
	case O_LIMIT_PARENT:
		dyn_export_parent(s->limit, s->kidx, dst);
		break;
	default:
		dyn_export_data(s->data, s->kidx, s->type, dst);
	}

	dst->id.dst_ip = s->dst;
	dst->id.src_ip = s->src;
	dst->id.dst_port = s->dport;
	dst->id.src_port = s->sport;
	/*
	 * NOTE(review): s->data is read here for O_LIMIT_PARENT states
	 * too; presumably data/limit share storage so fibnum lines up —
	 * verify against the state structure definitions.
	 */
	dst->id.fib = s->data->fibnum;
	dst->id.proto = s->proto;
	dst->id._flags = 0;
	dst->id.addr_type = 4;

	memset(&dst->id.dst_ip6, 0, sizeof(dst->id.dst_ip6));
	memset(&dst->id.src_ip6, 0, sizeof(dst->id.src_ip6));
	dst->id.flow_id6 = dst->id.extra = 0;
}

#ifdef INET6
/* Export an IPv6 state (any type) into the legacy layout. */
static void
dyn_export_ipv6_state(const struct dyn_ipv6_state *s, ipfw_dyn_rule *dst)
{

	switch (s->type) {
	case O_LIMIT_PARENT:
		dyn_export_parent(s->limit, s->kidx, dst);
		break;
	default:
		dyn_export_data(s->data, s->kidx, s->type, dst);
	}

	dst->id.src_ip6 = s->src;
	dst->id.dst_ip6 = s->dst;
	dst->id.dst_port = s->dport;
	dst->id.src_port = s->sport;
	dst->id.fib = s->data->fibnum;
	dst->id.proto = s->proto;
	dst->id._flags = 0;
	dst->id.addr_type = 6;

	dst->id.dst_ip = dst->id.src_ip = 0;
	dst->id.flow_id6 = dst->id.extra = 0;
}
#endif /* INET6 */

/*
 * Fills the buffer given by @sd with dynamic states.
 * Used by dump format v1 (current).
 *
 * Returns 0 on success.
3025 */ 3026 int 3027 ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd) 3028 { 3029 #ifdef INET6 3030 struct dyn_ipv6_state *s6; 3031 #endif 3032 struct dyn_ipv4_state *s4; 3033 ipfw_obj_dyntlv *dst, *last; 3034 ipfw_obj_ctlv *ctlv; 3035 uint32_t bucket; 3036 3037 if (V_dyn_count == 0) 3038 return (0); 3039 3040 /* 3041 * IPFW_UH_RLOCK garantees that another userland request 3042 * and callout thread will not delete entries from states 3043 * lists. 3044 */ 3045 IPFW_UH_RLOCK_ASSERT(chain); 3046 3047 ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); 3048 if (ctlv == NULL) 3049 return (ENOMEM); 3050 ctlv->head.type = IPFW_TLV_DYNSTATE_LIST; 3051 ctlv->objsize = sizeof(ipfw_obj_dyntlv); 3052 last = NULL; 3053 3054 #define DYN_EXPORT_STATES(s, af, h, b) \ 3055 CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) { \ 3056 dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd, \ 3057 sizeof(ipfw_obj_dyntlv)); \ 3058 if (dst == NULL) \ 3059 return (ENOMEM); \ 3060 dyn_export_ ## af ## _state(s, &dst->state); \ 3061 dst->head.length = sizeof(ipfw_obj_dyntlv); \ 3062 dst->head.type = IPFW_TLV_DYN_ENT; \ 3063 last = dst; \ 3064 } 3065 3066 for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { 3067 DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket); 3068 DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket); 3069 #ifdef INET6 3070 DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket); 3071 DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket); 3072 #endif /* INET6 */ 3073 } 3074 3075 /* mark last dynamic rule */ 3076 if (last != NULL) 3077 last->head.flags = IPFW_DF_LAST; /* XXX: unused */ 3078 return (0); 3079 #undef DYN_EXPORT_STATES 3080 } 3081 3082 /* 3083 * Fill given buffer with dynamic states (legacy format). 3084 * IPFW_UH_RLOCK has to be held while calling. 
 */
void
ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	ipfw_dyn_rule *p, *last = NULL;
	char *bp;
	uint32_t bucket;

	if (V_dyn_count == 0)
		return;
	bp = *pbp;

	IPFW_UH_RLOCK_ASSERT(chain);

	/*
	 * Copy raw ipfw_dyn_rule records into [bp, ep); stop silently when
	 * the caller's buffer is full.  @af selects the exporter helper via
	 * token pasting, @head names the V_dyn_<head> hash table.
	 */
#define	DYN_EXPORT_STATES(s, af, head, b)				\
	CK_SLIST_FOREACH(s, &V_dyn_ ## head[b], entry) {		\
		if (bp + sizeof(*p) > ep)				\
			break;						\
		p = (ipfw_dyn_rule *)bp;				\
		dyn_export_ ## af ## _state(s, p);			\
		last = p;						\
		bp += sizeof(*p);					\
	}

	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket);
		DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket);
#ifdef INET6
		DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket);
		DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket);
#endif /* INET6 */
	}

	/* Legacy consumers detect the end of the list via next == NULL. */
	if (last != NULL) /* mark last dynamic rule */
		last->next = NULL;
	/* Report how far we advanced in the caller's buffer. */
	*pbp = bp;
#undef DYN_EXPORT_STATES
}

/*
 * Per-VNET initialization of the dynamic states subsystem: default
 * tunables, UMA zones, hash tables and the expiration callout.
 */
void
ipfw_dyn_init(struct ip_fw_chain *chain)
{

#ifdef IPFIREWALL_JENKINSHASH
	V_dyn_hashseed = arc4random();
#endif
	V_dyn_max = 16384;		/* max # of states */
	V_dyn_parent_max = 4096;	/* max # of parent states */
	V_dyn_buckets_max = 8192;	/* must be power of 2 */

	/* Default lifetimes (seconds) by protocol/TCP phase. */
	V_dyn_ack_lifetime = 300;
	V_dyn_syn_lifetime = 20;
	V_dyn_fin_lifetime = 1;
	V_dyn_rst_lifetime = 1;
	V_dyn_udp_lifetime = 10;
	V_dyn_short_lifetime = 5;

	V_dyn_keepalive_interval = 20;
	V_dyn_keepalive_period = 5;
	V_dyn_keepalive = 1;		/* send keepalives */
	V_dyn_keepalive_last = time_uptime;

	/* Zones for state payloads; caps mirror the tunables above. */
	V_dyn_data_zone = uma_zcreate("IPFW dynamic states data",
	    sizeof(struct dyn_data), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_dyn_data_zone, V_dyn_max);

	V_dyn_parent_zone = uma_zcreate("IPFW parent dynamic states",
	    sizeof(struct dyn_parent), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max);

	SLIST_INIT(&V_dyn_expired_ipv4);
	V_dyn_ipv4 = NULL;
	V_dyn_ipv4_parent = NULL;
	V_dyn_ipv4_zone = uma_zcreate("IPFW IPv4 dynamic states",
	    sizeof(struct dyn_ipv4_state), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

#ifdef INET6
	SLIST_INIT(&V_dyn_expired_ipv6);
	V_dyn_ipv6 = NULL;
	V_dyn_ipv6_parent = NULL;
	V_dyn_ipv6_zone = uma_zcreate("IPFW IPv6 dynamic states",
	    sizeof(struct dyn_ipv6_state), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
#endif

	/* Initialize buckets. */
	V_curr_dyn_buckets = 0;
	V_dyn_bucket_lock = NULL;
	dyn_grow_hashtable(chain, 256);

	/*
	 * One slot per CPU; only the default VNET allocates it, so it is
	 * presumably shared across VNETs — confirm against its users.
	 */
	if (IS_DEFAULT_VNET(curvnet))
		dyn_hp_cache = malloc(mp_ncpus * sizeof(void *), M_IPFW,
		    M_WAITOK | M_ZERO);

	DYN_EXPIRED_LOCK_INIT();
	/* Arm the periodic expiration tick (every hz ticks = 1 second). */
	callout_init(&V_dyn_timeout, 1);
	callout_reset(&V_dyn_timeout, hz, dyn_tick, curvnet);
	IPFW_ADD_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
}

/*
 * Tear down the dynamic states subsystem.  Called twice: pass 0 only
 * drains the expiration callout (so it cannot run concurrently with the
 * destruction below); pass 1 releases all states, zones, tables and locks.
 */
void
ipfw_dyn_uninit(int pass)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	int bucket;

	if (pass == 0) {
		callout_drain(&V_dyn_timeout);
		return;
	}
	IPFW_DEL_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
	DYN_EXPIRED_LOCK_DESTROY();

	/*
	 * Unconditionally free every state on list V_dyn_<name>, releasing
	 * the payload (parent or data union member) and the state itself.
	 * @CK is "CK_" for the hash tables (CK_SLIST) and empty for the
	 * plain SLIST expired lists; @en is the link field name.
	 */
#define	DYN_FREE_STATES_FORCED(CK, s, af, name, en)	do {		\
	while ((s = CK ## SLIST_FIRST(&V_dyn_ ## name)) != NULL) {	\
		CK ## SLIST_REMOVE_HEAD(&V_dyn_ ## name, en);		\
		if (s->type == O_LIMIT_PARENT)				\
			uma_zfree(V_dyn_parent_zone, s->limit);		\
		else							\
			uma_zfree(V_dyn_data_zone, s->data);		\
		uma_zfree(V_dyn_ ## af ## _zone, s);			\
	}								\
} while (0)
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_BUCKET_LOCK_DESTROY(V_dyn_bucket_lock, bucket);

		DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4[bucket], entry);
		DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4_parent[bucket],
		    entry);
#ifdef INET6
		DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6[bucket], entry);
		DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6_parent[bucket],
		    entry);
#endif /* INET6 */
	}
	DYN_FREE_STATES_FORCED(, s4, ipv4, expired_ipv4, expired);
#ifdef INET6
	DYN_FREE_STATES_FORCED(, s6, ipv6, expired_ipv6, expired);
#endif
#undef DYN_FREE_STATES_FORCED

	/* All states freed above; now destroy the zones and tables. */
	uma_zdestroy(V_dyn_ipv4_zone);
	uma_zdestroy(V_dyn_data_zone);
	uma_zdestroy(V_dyn_parent_zone);
#ifdef INET6
	uma_zdestroy(V_dyn_ipv6_zone);
	free(V_dyn_ipv6, M_IPFW);
	free(V_dyn_ipv6_parent, M_IPFW);
	free(V_dyn_ipv6_add, M_IPFW);
	free(V_dyn_ipv6_parent_add, M_IPFW);
	free(V_dyn_ipv6_del, M_IPFW);
	free(V_dyn_ipv6_parent_del, M_IPFW);
#endif
	free(V_dyn_bucket_lock, M_IPFW);
	free(V_dyn_ipv4, M_IPFW);
	free(V_dyn_ipv4_parent, M_IPFW);
	free(V_dyn_ipv4_add, M_IPFW);
	free(V_dyn_ipv4_parent_add, M_IPFW);
	free(V_dyn_ipv4_del, M_IPFW);
	free(V_dyn_ipv4_parent_del, M_IPFW);
	if (IS_DEFAULT_VNET(curvnet))
		free(dyn_hp_cache, M_IPFW);
}