// SPDX-License-Identifier: GPL-2.0-only
/*
 * NAT core: tuple selection and bookkeeping for NATed conntrack entries.
 *
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/siphash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <uapi/linux/netfilter/nf_nat.h>

#include "nf_internals.h"

/* Upper bound on per-round tuple-search iterations; we run in softirq
 * context, so the search must stay bounded (see
 * nf_nat_l4proto_unique_tuple()).
 */
#define NF_NAT_MAX_ATTEMPTS 128
/* Once this few attempts remain, the TCP port search is allowed to take
 * destructive action (see nf_nat_used_tuple_harder()).
 */
#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4)

/* Hashed locks protecting the nf_nat_bysource chains. */
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
static unsigned int nat_net_id __read_mostly;

/* Hash table of NATed conntracks keyed by original source (hash_by_src()). */
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static siphash_aligned_key_t nf_nat_hash_rnd;

/* Per-hook private data: RCU-protected list of NAT lookup hooks to run
 * before falling back to a null binding.
 */
struct nf_nat_lookup_hook_priv {
	struct nf_hook_entries __rcu *entries;

	struct rcu_head rcu_head;
};

/* Refcounted copy of the per-family NAT hook ops. */
struct nf_nat_hooks_net {
	struct nf_hook_ops *nat_hook_ops;
	unsigned int users;
};

/* Per-netns state, indexed by NFPROTO_*. */
struct nat_net {
	struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
};

#ifdef CONFIG_XFRM
/* Fill the IPv4 flow key @fl from @ct's tuple so xfrm policy lookup sees
 * the translated addresses/ports.  @statusbit selects which manip
 * (IPS_SRC_NAT/IPS_DST_NAT) maps to the destination side; it is then
 * flipped via IPS_NAT_MASK to handle the source side as well.
 */
static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi4 *fl4 = &fl->u.ip4;

	if (ct->status & statusbit) {
		fl4->daddr = t->dst.u3.ip;
		/* Ports only exist for these transport protocols. */
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_dport = t->dst.u.all;
	}

	/* Flip SRC_NAT <-> DST_NAT to cover the other manipulation. */
	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl4->saddr = t->src.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_sport = t->src.u.all;
	}
}

/* IPv6 counterpart of nf_nat_ipv4_decode_session(); compiled out to an
 * empty body when IPv6 support is disabled.
 */
static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
#if IS_ENABLED(CONFIG_IPV6)
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi6 *fl6 = &fl->u.ip6;

	if (ct->status & statusbit) {
		fl6->daddr = t->dst.u3.in6;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl6->fl6_dport = t->dst.u.all;
	}

	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl6->saddr = t->src.u3.in6;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl6->fl6_sport = t->src.u.all;
	}
#endif
}

/* Common entry point: pick the direction-appropriate status bit and
 * dispatch to the per-family decoder.  No-op for untracked skbs or
 * unknown families.
 */
static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned long statusbit;
	u8 family;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return;

	family = nf_ct_l3num(ct);
	dir = CTINFO2DIR(ctinfo);
	/* In the original direction the destination was rewritten by DNAT;
	 * in the reply direction it was rewritten by SNAT.
	 */
	if (dir == IP_CT_DIR_ORIGINAL)
		statusbit = IPS_DST_NAT;
	else
		statusbit = IPS_SRC_NAT;

	switch (family) {
	case NFPROTO_IPV4:
		nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl);
		return;
	case NFPROTO_IPV6:
		nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl);
		return;
	}
}
#endif /* CONFIG_XFRM */

/* We keep an extra hash for each conntrack, for fast searching.
*/
/* Hash the ORIGINAL-direction source (addr/port/proto), mixed with the
 * netns and — when valid for both directions — the zone id, into an
 * index for nf_nat_bysource.
 */
static unsigned int
hash_by_src(const struct net *net,
	    const struct nf_conntrack_zone *zone,
	    const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;
	struct {
		struct nf_conntrack_man src;
		u32 net_mix;
		u32 protonum;
		u32 zone;
	} __aligned(SIPHASH_ALIGNMENT) combined;

	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));

	/* Zero padding bytes so the siphash input is fully determined. */
	memset(&combined, 0, sizeof(combined));

	/* Original src, to ensure we map it consistently if poss. */
	combined.src = tuple->src;
	combined.net_mix = net_hash_mix(net);
	combined.protonum = tuple->dst.protonum;

	/* Zone ID can be used provided its valid for both directions */
	if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
		combined.zone = zone->id;

	hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);

	return reciprocal_scale(hash, nf_nat_htable_size);
}

/**
 * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry
 * @tuple: proposed NAT binding
 * @ignored_conntrack: our (unconfirmed) conntrack entry
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple in either direction.
 *
 * Example:
 * INITIATOR -> NAT/PAT -> RESPONDER
 *
 * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite).
 * Then, later, NAT/PAT itself also connects to RESPONDER.
 *
 * This will not work if the SNAT done earlier has same IP:PORT source pair.
 *
 * Conntrack table has:
 * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT
 * REPLY:    $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
 *
 * and new locally originating connection wants:
 * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT
 * REPLY:    $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
 *
 * ... which would mean incoming packets cannot be distinguished between
 * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple).
 *
 * @return: true if the proposed NAT mapping collides with an existing entry.
 */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
		  const struct nf_conn *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	 * incoming ones.  NAT means they don't have a fixed mapping,
	 * so we invert the tuple and look for the incoming reply.
	 *
	 * We could keep a separate hash if this proves too slow.
	 */
	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuple(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}

/* True if @ct's L4 protocol tracker permits late clash resolution. */
static bool nf_nat_allow_clash(const struct nf_conn *ct)
{
	return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash;
}

/**
 * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry
 * @tuple: proposed NAT binding
 * @ignored_ct: our (unconfirmed) conntrack entry
 *
 * Same as nf_nat_used_tuple, but also check for rare clash in reverse
 * direction. Should be called only when @tuple has not been altered, i.e.
 * @ignored_ct will not be subject to NAT.
 *
 * @return: true if the proposed NAT mapping collides with existing entry.
 */
static noinline bool
nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple,
		      const struct nf_conn *ignored_ct)
{
	static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST;
	const struct nf_conntrack_tuple_hash *thash;
	const struct nf_conntrack_zone *zone;
	struct nf_conn *ct;
	bool taken = true;
	struct net *net;

	if (!nf_nat_used_tuple(tuple, ignored_ct))
		return false;

	if (!nf_nat_allow_clash(ignored_ct))
		return true;

	/* Initial choice clashes with existing conntrack.
	 * Check for (rare) reverse collision.
	 *
	 * This can happen when new packets are received in both directions
	 * at the exact same time on different CPUs.
	 *
	 * Without SMP, first packet creates new conntrack entry and second
	 * packet is resolved as established reply packet.
	 *
	 * With parallel processing, both packets could be picked up as
	 * new and both get their own ct entry allocated.
	 *
	 * If ignored_conntrack and colliding ct are not subject to NAT then
	 * pretend the tuple is available and let later clash resolution
	 * handle this at insertion time.
	 *
	 * Without it, the 'reply' packet has its source port rewritten
	 * by nat engine.
	 */
	if (READ_ONCE(ignored_ct->status) & uses_nat)
		return true;

	net = nf_ct_net(ignored_ct);
	zone = nf_ct_zone(ignored_ct);

	thash = nf_conntrack_find_get(net, zone, tuple);
	if (unlikely(!thash)) {
		struct nf_conntrack_tuple reply;

		nf_ct_invert_tuple(&reply, tuple);
		thash = nf_conntrack_find_get(net, zone, &reply);
		if (!thash) /* clashing entry went away */
			return false;
	}

	ct = nf_ct_tuplehash_to_ctrack(thash);

	/* clashing connection subject to NAT? Retry with new tuple. */
	if (READ_ONCE(ct->status) & uses_nat)
		goto out;

	/* Exact reverse of our entry: let insertion-time clash
	 * resolution deal with it.
	 */
	if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
			      &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple))
		taken = false;
out:
	nf_ct_put(ct);
	return taken;
}

/* May the colliding entry @ct be evicted to free up its tuple?
 * Only SNAT-ed TCP entries at/after TIME_WAIT, and never ones with a
 * fixed timeout or already dying.
 */
static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
{
	static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
						  IPS_DYING;
	static const unsigned long flags_needed = IPS_SRC_NAT;
	enum tcp_conntrack old_state;

	old_state = READ_ONCE(ct->proto.tcp.state);
	if (old_state < TCP_CONNTRACK_TIME_WAIT)
		return false;

	if (flags & flags_refuse)
		return false;

	return (flags & flags_needed) == flags_needed;
}

/* reverse direction will send packets to new source, so
 * make sure such packets are invalid.
 */
static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
{
	return (__s32)(new->proto.tcp.seen[0].td_end -
		       old->proto.tcp.seen[0].td_end) > 0;
}

/* Like nf_nat_used_tuple(), but during the last NF_NAT_HARDER_THRESH
 * attempts for a TCP SYN it may evict a colliding TIME_WAIT entry to
 * free its port (provided the sequence number has advanced).
 */
static int
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack,
			 unsigned int attempts_left)
{
	static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
	struct nf_conntrack_tuple_hash *thash;
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple reply;
	unsigned long flags;
	struct nf_conn *ct;
	bool taken = true;
	struct net *net;

	nf_ct_invert_tuple(&reply, tuple);

	if (attempts_left > NF_NAT_HARDER_THRESH ||
	    tuple->dst.protonum != IPPROTO_TCP ||
	    ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
		return nf_conntrack_tuple_taken(&reply, ignored_conntrack);

	/* Last few attempts to find a free tcp port. Destructive
	 * action: evict colliding if its in timewait state and the
	 * tcp sequence number has advanced past the one used by the
	 * old entry.
	 */
	net = nf_ct_net(ignored_conntrack);
	zone = nf_ct_zone(ignored_conntrack);

	thash = nf_conntrack_find_get(net, zone, &reply);
	if (!thash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(thash);

	if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
		goto out;

	if (WARN_ON_ONCE(ct == ignored_conntrack))
		goto out;

	flags = READ_ONCE(ct->status);
	if (!nf_nat_may_kill(ct, flags))
		goto out;

	if (!nf_seq_has_advanced(ct, ignored_conntrack))
		goto out;

	/* Even if we can evict do not reuse if entry is offloaded. */
	if (nf_ct_kill(ct))
		taken = flags & flags_offload;
out:
	nf_ct_put(ct);
	return taken;
}

/* Is the tuple's source address within [min_addr, max_addr] of @range?
 * IPv4 compares in host byte order; IPv6 compares lexicographically.
 */
static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
				 const struct nf_nat_range2 *range)
{
	if (t->src.l3num == NFPROTO_IPV4)
		return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
		       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);

	return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
	       ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
}

/* Is the manipable part of the tuple between min and max incl?
*/
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
			     enum nf_nat_manip_type maniptype,
			     const union nf_conntrack_man_proto *min,
			     const union nf_conntrack_man_proto *max)
{
	__be16 port;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* ICMP has no ports; the echo id plays that role. */
		return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
		       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
	case IPPROTO_GRE: /* all fall though */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		if (maniptype == NF_NAT_MANIP_SRC)
			port = tuple->src.u.all;
		else
			port = tuple->dst.u.all;

		return ntohs(port) >= ntohs(min->all) &&
		       ntohs(port) <= ntohs(max->all);
	default:
		/* Unknown protocol: no per-proto part to constrain. */
		return true;
	}
}

/* If we source map this tuple so reply looks like reply_tuple, will
 * that meet the constraints of range.
 */
static int nf_in_range(const struct nf_conntrack_tuple *tuple,
		       const struct nf_nat_range2 *range)
{
	/* If we are supposed to map IPs, then we must be in the
	 * range specified, otherwise let this drag us onto a new src IP.
	 */
	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
	    !nf_nat_inet_in_range(tuple, range))
		return 0;

	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
		return 1;

	return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
				&range->min_proto, &range->max_proto);
}

/* Do @ct's original tuple and @tuple share protocol, source address and
 * source port/id?
 */
static inline int
same_src(const struct nf_conn *ct,
	 const struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_tuple *t;

	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	return (t->dst.protonum == tuple->dst.protonum &&
		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
		t->src.u.all == tuple->src.u.all);
}

/* Only called for SRC manip */
/* Search nf_nat_bysource for an existing mapping of this source; on
 * success store the reply-derived tuple in @result and return 1.
 * Runs under RCU (hlist_for_each_entry_rcu).
 */
static int
find_appropriate_src(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range2 *range)
{
	unsigned int h = hash_by_src(net, zone, tuple);
	const struct nf_conn *ct;

	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
		if (same_src(ct, tuple) &&
		    net_eq(net, nf_ct_net(ct)) &&
		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuple(result,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (nf_in_range(result, range))
				return 1;
		}
	}
	return 0;
}

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
		    struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping? Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	/* Index of the last 32-bit word of the address. */
	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 */
	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
		   range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

	full_range = false;
	for (i = 0; i <= max; i++) {
		/* If first bytes of the address are at the maximum, use the
		 * distance. Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist = maxip - minip + 1;
		} else {
			minip = 0;
			dist = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + reciprocal_scale(j, dist));
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}

/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
					const struct nf_nat_range2 *range,
					enum nf_nat_manip_type maniptype,
					const struct nf_conn *ct)
{
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* id is same for either direction... */
		keyptr = &tuple->src.u.icmp.id;
		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 0;
			range_size = 65536;
		} else {
			min = ntohs(range->min_proto.icmp.id);
			range_size = ntohs(range->max_proto.icmp.id) -
				     ntohs(range->min_proto.icmp.id) + 1;
		}
		goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
	case IPPROTO_GRE:
		/* If there is no master conntrack we are not PPTP,
		   do not change tuples */
		if (!ct->master)
			return;

		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.gre.key;
		else
			keyptr = &tuple->dst.u.gre.key;

		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 1;
			range_size = 65535;
		} else {
			min = ntohs(range->min_proto.gre.key);
			range_size = ntohs(range->max_proto.gre.key) - min + 1;
		}
		goto find_free_id;
#endif
	case IPPROTO_UDP:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.all;
		else
			keyptr = &tuple->dst.u.all;

		break;
	default:
		/* No known per-proto part to manipulate. */
		return;
	}

	/* If no range specified... */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
		/* If it's dst rewrite, can't change port */
		if (maniptype == NF_NAT_MANIP_DST)
			return;

		/* Stay within the source port's "class": <512, 600-1023,
		 * or >=1024 — mirrors traditional port-allocation policy.
		 */
		if (ntohs(*keyptr) < 1024) {
			/* Loose convention: >> 512 is credential passing */
			if (ntohs(*keyptr) < 512) {
				min = 1;
				range_size = 511 - min + 1;
			} else {
				min = 600;
				range_size = 1023 - min + 1;
			}
		} else {
			min = 1024;
			range_size = 65535 - 1024 + 1;
		}
	} else {
		min = ntohs(range->min_proto.all);
		max = ntohs(range->max_proto.all);
		if (unlikely(max < min))
			swap(max, min);
		range_size = max - min + 1;
	}

find_free_id:
	if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
		off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
	else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
		 maniptype != NF_NAT_MANIP_DST)
		off = get_random_u16();
	else
		off = 0;

	attempts = range_size;
	if (attempts > NF_NAT_MAX_ATTEMPTS)
		attempts = NF_NAT_MAX_ATTEMPTS;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
	 *
	 * If we can't find any free port from first offset, pick a new
	 * one and try again, with ever smaller search window.
	 */
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
			return;
	}

	if (attempts >= range_size || attempts < 16)
		return;
	attempts /= 2;
	off = get_random_u16();
	goto another_round;
}

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
* At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range2 *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);

	zone = nf_ct_zone(ct);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (ie. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips not an issue.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		/* try the original tuple first */
		if (nf_in_range(orig_tuple, range)) {
			if (!nf_nat_used_tuple_new(orig_tuple, ct)) {
				*tuple = *orig_tuple;
				return;
			}
		} else if (find_appropriate_src(net, zone,
						orig_tuple, tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n");
			if (!nf_nat_used_tuple(tuple, ct))
				return;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
			    l4proto_in_range(tuple, maniptype,
					     &range->min_proto,
					     &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ct)))
				return;
		} else if (!nf_nat_used_tuple(tuple, ct)) {
			return;
		}
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}

/* Return @ct's NAT extension, adding it (GFP_ATOMIC) if absent.
 * Extensions can only be added before the conntrack is confirmed, so
 * this may return NULL.
 */
struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nfct_nat(ct);

	if (nat)
		return nat;

	if (!nf_ct_is_confirmed(ct))
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);

	return nat;
}
EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);

/* Pick a unique tuple for @ct within @range, update its reply tuple and
 * status bits accordingly, and (for SNAT) insert it into the bysource
 * hash.  Returns an NF_* verdict (NF_DROP on failure).
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range2 *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;

	/* Can't setup nat info for confirmed ct. */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
		maniptype != NF_NAT_MANIP_DST);

	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
		return NF_DROP;

	/* What we've got will look like inverse of reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuple(&curr_tuple,
			   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so will recognize replies. */
		nf_ct_invert_tuple(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		/* Helpers may mangle payload; make sure sequence-number
		 * adjustment state exists for them.
		 */
		if (nfct_help(ct) && !nfct_seqadj(ct))
			if (!nfct_seqadj_ext_add(ct))
				return NF_DROP;
	}

	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;
		spinlock_t *lock;

		srchash = hash_by_src(net, nf_ct_zone(ct),
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
		spin_lock_bh(lock);
		hlist_add_head_rcu(&ct->nat_bysource,
				   &nf_nat_bysource[srchash]);
		spin_unlock_bh(lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);

static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
	/* Force range to this IP; let proto decide mapping for
	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
	 * Use reply in case it's already been mangled (eg local packet).
	 */
	union nf_inet_addr ip =
		(manip == NF_NAT_MANIP_SRC ?
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
	struct nf_nat_range2 range = {
		.flags		= NF_NAT_RANGE_MAP_IPS,
		.min_addr	= ip,
		.max_addr	= ip,
	};
	return nf_nat_setup_info(ct, &range, manip);
}

/* Set up an identity ("null") binding for the manip implied by @hooknum. */
unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);

/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	unsigned int verdict = NF_ACCEPT;
	unsigned long statusbit;

	if (mtype == NF_NAT_MANIP_SRC)
		statusbit = IPS_SRC_NAT;
	else
		statusbit = IPS_DST_NAT;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (ct->status & statusbit)
		verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);

	return verdict;
}
EXPORT_SYMBOL_GPL(nf_nat_packet);

/* Skip NAT in postrouting when the egress device is an L3 master (VRF);
 * the slave device's hook traversal already handled it.
 */
static bool in_vrf_postrouting(const struct nf_hook_state *state)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	if (state->hook == NF_INET_POST_ROUTING &&
	    netif_is_l3_master(state->out))
		return true;
#endif
	return false;
}

/* Common NAT hook body shared by the IPv4/IPv6 hook wrappers: runs the
 * registered NAT lookup hooks for new/related conntracks, falls back to
 * a null binding, then applies the manip via nf_nat_packet().
 */
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
	       const struct nf_hook_state *state)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

	ct = nf_ct_get(skb, &ctinfo);
	/* Can't track? It's not due to stress, or conntrack would
	 * have dropped it.
Hence it's the user's responsibility to
	 * packet filter it out, or implement conntrack/NAT for that
	 * protocol. 8) --RR
	 */
	if (!ct || in_vrf_postrouting(state))
		return NF_ACCEPT;

	nat = nfct_nat(ct);

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		/* Only ICMPs can be IP_CT_IS_REPLY.  Fallthrough */
	case IP_CT_NEW:
		/* Seen it before?  This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, maniptype)) {
			struct nf_nat_lookup_hook_priv *lpriv = priv;
			struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
			unsigned int ret;
			int i;

			if (!e)
				goto null_bind;

			/* Run the registered NAT lookup hooks until one
			 * initializes this manip (or rejects the packet).
			 */
			for (i = 0; i < e->num_hook_entries; i++) {
				ret = e->hooks[i].hook(e->hooks[i].priv, skb,
						       state);
				if (ret != NF_ACCEPT)
					return ret;
				if (nf_nat_initialized(ct, maniptype))
					goto do_nat;
			}
null_bind:
			/* No hook claimed it: identity binding so the tuple
			 * is still reserved in the NAT tables.
			 */
			ret = nf_nat_alloc_null_binding(ct, state->hook);
			if (ret != NF_ACCEPT)
				return ret;
		} else {
			pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
				 ct, ct->status);
			if (nf_nat_oif_changed(state->hook, ctinfo, nat,
					       state->out))
				goto oif_changed;
		}
		break;
	default:
		/* ESTABLISHED */
		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
			ctinfo != IP_CT_ESTABLISHED_REPLY);
		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
			goto oif_changed;
	}
do_nat:
	return nf_nat_packet(ct, ctinfo, state->hook, skb);

oif_changed:
	/* Output interface changed (e.g. masquerade after route change):
	 * the old mapping is stale, kill the conntrack.
	 */
	nf_ct_kill_acct(ct, ctinfo, skb);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_fn);

/* Filter passed to nf_ct_iterate_*: which conntracks to flush. */
struct nf_nat_proto_clean {
	u8 l3proto;	/* 0 == any L3 protocol */
	u8 l4proto;	/* 0 == any L4 protocol */
};

/* kill conntracks with affected NAT section */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
	const struct nf_nat_proto_clean *clean = data;

	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
		return 0;

	return i->status & IPS_NAT_MASK ? 1 : 0;
}

/* Unlink @ct from the bysource hash under the matching hashed lock. */
static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	unsigned int h;

	h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
	hlist_del_rcu(&ct->nat_bysource);
	spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
}

static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
	if (nf_nat_proto_remove(ct, data))
		return 1;

	/* This module is being removed and conntrack has nat null binding.
	 * Remove it from bysource hash, as the table will be freed soon.
	 *
	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
	 * will delete entry from already-freed table.
	 */
	if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
		nf_nat_cleanup_conntrack(ct);

	/* don't delete conntrack.  Although that would make things a lot
	 * simpler, we'd end up flushing all conntracks on nat rmmod.
	 */
	return 0;
}

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
};

/* Translate parsed CTA_PROTONAT_* attributes into @range's proto part.
 * PORT_MIN alone means a single-port range (max == min).
 */
static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
					  struct nf_nat_range2 *range)
{
	if (tb[CTA_PROTONAT_PORT_MIN]) {
		range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
		range->max_proto.all = range->min_proto.all;
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
	}
	if (tb[CTA_PROTONAT_PORT_MAX]) {
		range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
	}
	return 0;
}

/* Parse a nested CTA_NAT_PROTO attribute into @range. */
static int nfnetlink_parse_nat_proto(struct nlattr *attr,
				     const struct nf_conn *ct,
				     struct nf_nat_range2 *range)
{
	struct nlattr *tb[CTA_PROTONAT_MAX+1];
	int err;

	err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr,
					  protonat_nla_policy, NULL);
	if (err < 0)
		return err;

	return nf_nat_l4proto_nlattr_to_range(tb, range);
}

static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 },
	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 },
	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
};

/* Translate CTA_NAT_V4_* attributes into @range's IPv4 address bounds.
 * A missing MAXIP defaults to MINIP (single-address range).
 */
static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
				       struct nf_nat_range2 *range)
{
	if (tb[CTA_NAT_V4_MINIP]) {
		range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
		range->flags |= NF_NAT_RANGE_MAP_IPS;
1073 } 1074 1075 range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP], 1076 range->min_addr.ip); 1077 1078 return 0; 1079 } 1080 1081 static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], 1082 struct nf_nat_range2 *range) 1083 { 1084 if (tb[CTA_NAT_V6_MINIP]) { 1085 nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], 1086 sizeof(struct in6_addr)); 1087 range->flags |= NF_NAT_RANGE_MAP_IPS; 1088 } 1089 1090 if (tb[CTA_NAT_V6_MAXIP]) 1091 nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], 1092 sizeof(struct in6_addr)); 1093 else 1094 range->max_addr = range->min_addr; 1095 1096 return 0; 1097 } 1098 1099 static int 1100 nfnetlink_parse_nat(const struct nlattr *nat, 1101 const struct nf_conn *ct, struct nf_nat_range2 *range) 1102 { 1103 struct nlattr *tb[CTA_NAT_MAX+1]; 1104 int err; 1105 1106 memset(range, 0, sizeof(*range)); 1107 1108 err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, 1109 nat_nla_policy, NULL); 1110 if (err < 0) 1111 return err; 1112 1113 switch (nf_ct_l3num(ct)) { 1114 case NFPROTO_IPV4: 1115 err = nf_nat_ipv4_nlattr_to_range(tb, range); 1116 break; 1117 case NFPROTO_IPV6: 1118 err = nf_nat_ipv6_nlattr_to_range(tb, range); 1119 break; 1120 default: 1121 err = -EPROTONOSUPPORT; 1122 break; 1123 } 1124 1125 if (err) 1126 return err; 1127 1128 if (!tb[CTA_NAT_PROTO]) 1129 return 0; 1130 1131 return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); 1132 } 1133 1134 /* This function is called under rcu_read_lock() */ 1135 static int 1136 nfnetlink_parse_nat_setup(struct nf_conn *ct, 1137 enum nf_nat_manip_type manip, 1138 const struct nlattr *attr) 1139 { 1140 struct nf_nat_range2 range; 1141 int err; 1142 1143 /* Should not happen, restricted to creating new conntracks 1144 * via ctnetlink. 
1145 */ 1146 if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) 1147 return -EEXIST; 1148 1149 /* No NAT information has been passed, allocate the null-binding */ 1150 if (attr == NULL) 1151 return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; 1152 1153 err = nfnetlink_parse_nat(attr, ct, &range); 1154 if (err < 0) 1155 return err; 1156 1157 return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; 1158 } 1159 #else 1160 static int 1161 nfnetlink_parse_nat_setup(struct nf_conn *ct, 1162 enum nf_nat_manip_type manip, 1163 const struct nlattr *attr) 1164 { 1165 return -EOPNOTSUPP; 1166 } 1167 #endif 1168 1169 static struct nf_ct_helper_expectfn follow_master_nat = { 1170 .name = "nat-follow-master", 1171 .expectfn = nf_nat_follow_master, 1172 }; 1173 1174 int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, 1175 const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) 1176 { 1177 struct nat_net *nat_net = net_generic(net, nat_net_id); 1178 struct nf_nat_hooks_net *nat_proto_net; 1179 struct nf_nat_lookup_hook_priv *priv; 1180 unsigned int hooknum = ops->hooknum; 1181 struct nf_hook_ops *nat_ops; 1182 int i, ret; 1183 1184 if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) 1185 return -EINVAL; 1186 1187 nat_proto_net = &nat_net->nat_proto_net[pf]; 1188 1189 for (i = 0; i < ops_count; i++) { 1190 if (orig_nat_ops[i].hooknum == hooknum) { 1191 hooknum = i; 1192 break; 1193 } 1194 } 1195 1196 if (WARN_ON_ONCE(i == ops_count)) 1197 return -EINVAL; 1198 1199 mutex_lock(&nf_nat_proto_mutex); 1200 if (!nat_proto_net->nat_hook_ops) { 1201 WARN_ON(nat_proto_net->users != 0); 1202 1203 nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL); 1204 if (!nat_ops) { 1205 mutex_unlock(&nf_nat_proto_mutex); 1206 return -ENOMEM; 1207 } 1208 1209 for (i = 0; i < ops_count; i++) { 1210 priv = kzalloc_obj(*priv); 1211 if (priv) { 1212 nat_ops[i].priv = priv; 1213 continue; 1214 } 1215 
mutex_unlock(&nf_nat_proto_mutex); 1216 while (i) 1217 kfree(nat_ops[--i].priv); 1218 kfree(nat_ops); 1219 return -ENOMEM; 1220 } 1221 1222 ret = nf_register_net_hooks(net, nat_ops, ops_count); 1223 if (ret < 0) { 1224 mutex_unlock(&nf_nat_proto_mutex); 1225 for (i = 0; i < ops_count; i++) 1226 kfree(nat_ops[i].priv); 1227 kfree(nat_ops); 1228 return ret; 1229 } 1230 1231 nat_proto_net->nat_hook_ops = nat_ops; 1232 } 1233 1234 nat_ops = nat_proto_net->nat_hook_ops; 1235 priv = nat_ops[hooknum].priv; 1236 if (WARN_ON_ONCE(!priv)) { 1237 mutex_unlock(&nf_nat_proto_mutex); 1238 return -EOPNOTSUPP; 1239 } 1240 1241 ret = nf_hook_entries_insert_raw(&priv->entries, ops); 1242 if (ret == 0) 1243 nat_proto_net->users++; 1244 1245 mutex_unlock(&nf_nat_proto_mutex); 1246 return ret; 1247 } 1248 1249 void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, 1250 unsigned int ops_count) 1251 { 1252 struct nat_net *nat_net = net_generic(net, nat_net_id); 1253 struct nf_nat_hooks_net *nat_proto_net; 1254 struct nf_nat_lookup_hook_priv *priv; 1255 struct nf_hook_ops *nat_ops; 1256 int hooknum = ops->hooknum; 1257 int i; 1258 1259 if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) 1260 return; 1261 1262 nat_proto_net = &nat_net->nat_proto_net[pf]; 1263 1264 mutex_lock(&nf_nat_proto_mutex); 1265 if (WARN_ON(nat_proto_net->users == 0)) 1266 goto unlock; 1267 1268 nat_proto_net->users--; 1269 1270 nat_ops = nat_proto_net->nat_hook_ops; 1271 for (i = 0; i < ops_count; i++) { 1272 if (nat_ops[i].hooknum == hooknum) { 1273 hooknum = i; 1274 break; 1275 } 1276 } 1277 if (WARN_ON_ONCE(i == ops_count)) 1278 goto unlock; 1279 priv = nat_ops[hooknum].priv; 1280 nf_hook_entries_delete_raw(&priv->entries, ops); 1281 1282 if (nat_proto_net->users == 0) { 1283 nf_unregister_net_hooks(net, nat_ops, ops_count); 1284 1285 for (i = 0; i < ops_count; i++) { 1286 priv = nat_ops[i].priv; 1287 kfree_rcu(priv, rcu_head); 1288 } 1289 1290 nat_proto_net->nat_hook_ops = NULL; 1291 
kfree(nat_ops); 1292 } 1293 unlock: 1294 mutex_unlock(&nf_nat_proto_mutex); 1295 } 1296 1297 static struct pernet_operations nat_net_ops = { 1298 .id = &nat_net_id, 1299 .size = sizeof(struct nat_net), 1300 }; 1301 1302 static const struct nf_nat_hook nat_hook = { 1303 .parse_nat_setup = nfnetlink_parse_nat_setup, 1304 #ifdef CONFIG_XFRM 1305 .decode_session = __nf_nat_decode_session, 1306 #endif 1307 .remove_nat_bysrc = nf_nat_cleanup_conntrack, 1308 }; 1309 1310 static int __init nf_nat_init(void) 1311 { 1312 int ret, i; 1313 1314 /* Leave them the same for the moment. */ 1315 nf_nat_htable_size = nf_conntrack_htable_size; 1316 if (nf_nat_htable_size < CONNTRACK_LOCKS) 1317 nf_nat_htable_size = CONNTRACK_LOCKS; 1318 1319 nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); 1320 if (!nf_nat_bysource) 1321 return -ENOMEM; 1322 1323 for (i = 0; i < CONNTRACK_LOCKS; i++) 1324 spin_lock_init(&nf_nat_locks[i]); 1325 1326 ret = register_pernet_subsys(&nat_net_ops); 1327 if (ret < 0) { 1328 kvfree(nf_nat_bysource); 1329 return ret; 1330 } 1331 1332 nf_ct_helper_expectfn_register(&follow_master_nat); 1333 1334 WARN_ON(nf_nat_hook != NULL); 1335 RCU_INIT_POINTER(nf_nat_hook, &nat_hook); 1336 1337 ret = register_nf_nat_bpf(); 1338 if (ret < 0) { 1339 RCU_INIT_POINTER(nf_nat_hook, NULL); 1340 nf_ct_helper_expectfn_unregister(&follow_master_nat); 1341 synchronize_net(); 1342 unregister_pernet_subsys(&nat_net_ops); 1343 kvfree(nf_nat_bysource); 1344 } 1345 1346 return ret; 1347 } 1348 1349 static void __exit nf_nat_cleanup(void) 1350 { 1351 struct nf_nat_proto_clean clean = {}; 1352 1353 nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); 1354 1355 nf_ct_helper_expectfn_unregister(&follow_master_nat); 1356 RCU_INIT_POINTER(nf_nat_hook, NULL); 1357 1358 synchronize_net(); 1359 kvfree(nf_nat_bysource); 1360 unregister_pernet_subsys(&nat_net_ops); 1361 } 1362 1363 MODULE_LICENSE("GPL"); 1364 MODULE_DESCRIPTION("Network address translation core"); 1365 1366 
module_init(nf_nat_init); 1367 module_exit(nf_nat_cleanup); 1368