// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/siphash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <uapi/linux/netfilter/nf_nat.h>

#include "nf_internals.h"

#define NF_NAT_MAX_ATTEMPTS	128
#define NF_NAT_HARDER_THRESH	(NF_NAT_MAX_ATTEMPTS / 4)

static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
static unsigned int nat_net_id __read_mostly;

static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static siphash_aligned_key_t nf_nat_hash_rnd;

struct nf_nat_lookup_hook_priv {
	struct nf_hook_entries __rcu *entries;

	struct rcu_head rcu_head;
};

struct nf_nat_hooks_net {
	struct nf_hook_ops *nat_hook_ops;
	unsigned int users;
};

struct nat_net {
	struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
};

#ifdef CONFIG_XFRM
static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi4 *fl4 = &fl->u.ip4;

	if (ct->status & statusbit) {
		fl4->daddr = t->dst.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_dport = t->dst.u.all;
	}

	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl4->saddr = t->src.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_sport = t->src.u.all;
	}
}

static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
#if IS_ENABLED(CONFIG_IPV6)
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi6 *fl6 = &fl->u.ip6;

	if (ct->status & statusbit) {
		fl6->daddr = t->dst.u3.in6;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl6->fl6_dport = t->dst.u.all;
	}

	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl6->saddr = t->src.u3.in6;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl6->fl6_sport = t->src.u.all;
	}
#endif
}

static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned long statusbit;
	u8 family;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return;

	family = nf_ct_l3num(ct);
	dir = CTINFO2DIR(ctinfo);
	if (dir == IP_CT_DIR_ORIGINAL)
		statusbit = IPS_DST_NAT;
	else
		statusbit = IPS_SRC_NAT;

	switch (family) {
	case NFPROTO_IPV4:
		nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl);
		return;
	case NFPROTO_IPV6:
		nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl);
		return;
	}
}
#endif /* CONFIG_XFRM */

/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
hash_by_src(const struct net *net,
	    const struct nf_conntrack_zone *zone,
	    const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;
	struct {
		struct nf_conntrack_man src;
		u32 net_mix;
		u32 protonum;
		u32 zone;
	} __aligned(SIPHASH_ALIGNMENT) combined;

	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));

	memset(&combined, 0, sizeof(combined));

	/* Original src, to ensure we map it consistently if poss. */
	combined.src = tuple->src;
	combined.net_mix = net_hash_mix(net);
	combined.protonum = tuple->dst.protonum;

	/* Zone ID can be used provided it's valid for both directions */
	if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
		combined.zone = zone->id;

	hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);

	return reciprocal_scale(hash, nf_nat_htable_size);
}

/**
 * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry
 * @tuple: proposed NAT binding
 * @ignored_conntrack: our (unconfirmed) conntrack entry
 *
 * A conntrack entry can be inserted into the connection tracking table
 * if there is no existing entry with an identical tuple in either direction.
 *
 * Example:
 * INITIATOR -> NAT/PAT -> RESPONDER
 *
 * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite).
 * Then, later, NAT/PAT itself also connects to RESPONDER.
 *
 * This will not work if the SNAT done earlier has the same IP:PORT source pair.
 *
 * Conntrack table has:
 * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT
 * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
 *
 * and the new locally originating connection wants:
 * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT
 * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
 *
 * ... which would mean incoming packets cannot be distinguished between
 * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple).
 *
 * @return: true if the proposed NAT mapping collides with an existing entry.
 */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
		  const struct nf_conn *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	 * incoming ones.  NAT means they don't have a fixed mapping,
	 * so we invert the tuple and look for the incoming reply.
	 *
	 * We could keep a separate hash if this proves too slow.
	 */
	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuple(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}

static bool nf_nat_allow_clash(const struct nf_conn *ct)
{
	return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash;
}

/**
 * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry
 * @tuple: proposed NAT binding
 * @ignored_ct: our (unconfirmed) conntrack entry
 *
 * Same as nf_nat_used_tuple, but also check for rare clash in reverse
 * direction. Should be called only when @tuple has not been altered, i.e.
 * @ignored_ct will not be subject to NAT.
 *
 * @return: true if the proposed NAT mapping collides with an existing entry.
 */
static noinline bool
nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple,
		      const struct nf_conn *ignored_ct)
{
	static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST;
	const struct nf_conntrack_tuple_hash *thash;
	const struct nf_conntrack_zone *zone;
	struct nf_conn *ct;
	bool taken = true;
	struct net *net;

	if (!nf_nat_used_tuple(tuple, ignored_ct))
		return false;

	if (!nf_nat_allow_clash(ignored_ct))
		return true;

	/* Initial choice clashes with existing conntrack.
	 * Check for (rare) reverse collision.
	 *
	 * This can happen when new packets are received in both directions
	 * at the exact same time on different CPUs.
	 *
	 * Without SMP, the first packet creates a new conntrack entry and the
	 * second packet is resolved as an established reply packet.
	 *
	 * With parallel processing, both packets could be picked up as
	 * new and both get their own ct entry allocated.
	 *
	 * If ignored_ct and the colliding ct are not subject to NAT then
	 * pretend the tuple is available and let later clash resolution
	 * handle this at insertion time.
	 *
	 * Without it, the 'reply' packet has its source port rewritten
	 * by the nat engine.
	 */
	if (READ_ONCE(ignored_ct->status) & uses_nat)
		return true;

	net = nf_ct_net(ignored_ct);
	zone = nf_ct_zone(ignored_ct);

	thash = nf_conntrack_find_get(net, zone, tuple);
	if (unlikely(!thash)) {
		struct nf_conntrack_tuple reply;

		nf_ct_invert_tuple(&reply, tuple);
		thash = nf_conntrack_find_get(net, zone, &reply);
		if (!thash) /* clashing entry went away */
			return false;
	}

	ct = nf_ct_tuplehash_to_ctrack(thash);

	/* clashing connection subject to NAT? Retry with new tuple. */
	if (READ_ONCE(ct->status) & uses_nat)
		goto out;

	if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
			      &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple))
		taken = false;
out:
	nf_ct_put(ct);
	return taken;
}

static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
{
	static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
						  IPS_DYING;
	static const unsigned long flags_needed = IPS_SRC_NAT;
	enum tcp_conntrack old_state;

	old_state = READ_ONCE(ct->proto.tcp.state);
	if (old_state < TCP_CONNTRACK_TIME_WAIT)
		return false;

	if (flags & flags_refuse)
		return false;

	return (flags & flags_needed) == flags_needed;
}

/* reverse direction will send packets to new source, so
 * make sure such packets are invalid.
 */
static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
{
	return (__s32)(new->proto.tcp.seen[0].td_end -
		       old->proto.tcp.seen[0].td_end) > 0;
}

static int
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack,
			 unsigned int attempts_left)
{
	static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
	struct nf_conntrack_tuple_hash *thash;
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple reply;
	unsigned long flags;
	struct nf_conn *ct;
	bool taken = true;
	struct net *net;

	nf_ct_invert_tuple(&reply, tuple);

	if (attempts_left > NF_NAT_HARDER_THRESH ||
	    tuple->dst.protonum != IPPROTO_TCP ||
	    ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
		return nf_conntrack_tuple_taken(&reply, ignored_conntrack);

	/* Last few attempts to find a free tcp port. Destructive
	 * action: evict the colliding entry if it's in timewait state and
	 * the tcp sequence number has advanced past the one used by the
	 * old entry.
	 */
	net = nf_ct_net(ignored_conntrack);
	zone = nf_ct_zone(ignored_conntrack);

	thash = nf_conntrack_find_get(net, zone, &reply);
	if (!thash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(thash);

	if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
		goto out;

	if (WARN_ON_ONCE(ct == ignored_conntrack))
		goto out;

	flags = READ_ONCE(ct->status);
	if (!nf_nat_may_kill(ct, flags))
		goto out;

	if (!nf_seq_has_advanced(ct, ignored_conntrack))
		goto out;

	/* Even if we can evict, do not reuse if the entry is offloaded. */
	if (nf_ct_kill(ct))
		taken = flags & flags_offload;
out:
	nf_ct_put(ct);
	return taken;
}

static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
				 const struct nf_nat_range2 *range)
{
	if (t->src.l3num == NFPROTO_IPV4)
		return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
		       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);

	return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
	       ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
}

/* Is the manipable part of the tuple between min and max incl? */
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
			     enum nf_nat_manip_type maniptype,
			     const union nf_conntrack_man_proto *min,
			     const union nf_conntrack_man_proto *max)
{
	__be16 port;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
		       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
	case IPPROTO_GRE: /* all fall through */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_SCTP:
		if (maniptype == NF_NAT_MANIP_SRC)
			port = tuple->src.u.all;
		else
			port = tuple->dst.u.all;

		return ntohs(port) >= ntohs(min->all) &&
		       ntohs(port) <= ntohs(max->all);
	default:
		return true;
	}
}

/* If we source map this tuple so reply looks like reply_tuple, will
 * that meet the constraints of range?
 */
static int nf_in_range(const struct nf_conntrack_tuple *tuple,
		       const struct nf_nat_range2 *range)
{
	/* If we are supposed to map IPs, then we must be in the
	 * range specified, otherwise let this drag us onto a new src IP.
	 */
	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
	    !nf_nat_inet_in_range(tuple, range))
		return 0;

	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
		return 1;

	return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
				&range->min_proto, &range->max_proto);
}

static inline int
same_src(const struct nf_conn *ct,
	 const struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_tuple *t;

	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	return (t->dst.protonum == tuple->dst.protonum &&
		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
		t->src.u.all == tuple->src.u.all);
}

/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range2 *range)
{
	unsigned int h = hash_by_src(net, zone, tuple);
	const struct nf_conn *ct;

	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
		if (same_src(ct, tuple) &&
		    net_eq(net, nf_ct_net(ct)) &&
		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuple(result,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (nf_in_range(result, range))
				return 1;
		}
	}
	return 0;
}

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
		    struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping? Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 */
	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
		   range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

	full_range = false;
	for (i = 0; i <= max; i++) {
		/* If first bytes of the address are at the maximum, use the
		 * distance. Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist = maxip - minip + 1;
		} else {
			minip = 0;
			dist = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + reciprocal_scale(j, dist));
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}

/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
					const struct nf_nat_range2 *range,
					enum nf_nat_manip_type maniptype,
					const struct nf_conn *ct)
{
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* id is same for either direction... */
		keyptr = &tuple->src.u.icmp.id;
		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 0;
			range_size = 65536;
		} else {
			min = ntohs(range->min_proto.icmp.id);
			range_size = ntohs(range->max_proto.icmp.id) -
				     ntohs(range->min_proto.icmp.id) + 1;
		}
		goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
	case IPPROTO_GRE:
		/* If there is no master conntrack we are not PPTP,
		   do not change tuples */
		if (!ct->master)
			return;

		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.gre.key;
		else
			keyptr = &tuple->dst.u.gre.key;

		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 1;
			range_size = 65535;
		} else {
			min = ntohs(range->min_proto.gre.key);
			range_size = ntohs(range->max_proto.gre.key) - min + 1;
		}
		goto find_free_id;
#endif
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.all;
		else
			keyptr = &tuple->dst.u.all;

		break;
	default:
		return;
	}

	/* If no range specified... */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
		/* If it's dst rewrite, can't change port */
		if (maniptype == NF_NAT_MANIP_DST)
			return;

		if (ntohs(*keyptr) < 1024) {
			/* Loose convention: >> 512 is credential passing */
			if (ntohs(*keyptr) < 512) {
				min = 1;
				range_size = 511 - min + 1;
			} else {
				min = 600;
				range_size = 1023 - min + 1;
			}
		} else {
			min = 1024;
			range_size = 65535 - 1024 + 1;
		}
	} else {
		min = ntohs(range->min_proto.all);
		max = ntohs(range->max_proto.all);
		if (unlikely(max < min))
			swap(max, min);
		range_size = max - min + 1;
	}

find_free_id:
	if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
		off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
	else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
		 maniptype != NF_NAT_MANIP_DST)
		off = get_random_u16();
	else
		off = 0;

	attempts = range_size;
	if (attempts > NF_NAT_MAX_ATTEMPTS)
		attempts = NF_NAT_MAX_ATTEMPTS;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
670 * 671 * If we can't find any free port from first offset, pick a new 672 * one and try again, with ever smaller search window. 673 */ 674 another_round: 675 for (i = 0; i < attempts; i++, off++) { 676 *keyptr = htons(min + off % range_size); 677 if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) 678 return; 679 } 680 681 if (attempts >= range_size || attempts < 16) 682 return; 683 attempts /= 2; 684 off = get_random_u16(); 685 goto another_round; 686 } 687 688 /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, 689 * we change the source to map into the range. For NF_INET_PRE_ROUTING 690 * and NF_INET_LOCAL_OUT, we change the destination to map into the 691 * range. It might not be possible to get a unique tuple, but we try. 692 * At worst (or if we race), we will end up with a final duplicate in 693 * __nf_conntrack_confirm and drop the packet. */ 694 static void 695 get_unique_tuple(struct nf_conntrack_tuple *tuple, 696 const struct nf_conntrack_tuple *orig_tuple, 697 const struct nf_nat_range2 *range, 698 struct nf_conn *ct, 699 enum nf_nat_manip_type maniptype) 700 { 701 const struct nf_conntrack_zone *zone; 702 struct net *net = nf_ct_net(ct); 703 704 zone = nf_ct_zone(ct); 705 706 /* 1) If this srcip/proto/src-proto-part is currently mapped, 707 * and that same mapping gives a unique tuple within the given 708 * range, use that. 709 * 710 * This is only required for source (ie. NAT/masq) mappings. 711 * So far, we don't do local source mappings, so multiple 712 * manips not an issue. 713 */ 714 if (maniptype == NF_NAT_MANIP_SRC && 715 !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 716 /* try the original tuple first */ 717 if (nf_in_range(orig_tuple, range)) { 718 if (!nf_nat_used_tuple_new(orig_tuple, ct)) { 719 *tuple = *orig_tuple; 720 return; 721 } 722 } else if (find_appropriate_src(net, zone, 723 orig_tuple, tuple, range)) { 724 pr_debug("get_unique_tuple: Found current src map\n"); 725 if (!nf_nat_used_tuple(tuple, ct)) 726 return; 727 } 728 } 729 730 /* 2) Select the least-used IP/proto combination in the given range */ 731 *tuple = *orig_tuple; 732 find_best_ips_proto(zone, tuple, range, ct, maniptype); 733 734 /* 3) The per-protocol part of the manip is made to map into 735 * the range to make a unique tuple. 736 */ 737 738 /* Only bother mapping if it's not already in range and unique */ 739 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 740 if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { 741 if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && 742 l4proto_in_range(tuple, maniptype, 743 &range->min_proto, 744 &range->max_proto) && 745 (range->min_proto.all == range->max_proto.all || 746 !nf_nat_used_tuple(tuple, ct))) 747 return; 748 } else if (!nf_nat_used_tuple(tuple, ct)) { 749 return; 750 } 751 } 752 753 /* Last chance: get protocol to try to obtain unique tuple. */ 754 nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); 755 } 756 757 struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) 758 { 759 struct nf_conn_nat *nat = nfct_nat(ct); 760 if (nat) 761 return nat; 762 763 if (!nf_ct_is_confirmed(ct)) 764 nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); 765 766 return nat; 767 } 768 EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); 769 770 unsigned int 771 nf_nat_setup_info(struct nf_conn *ct, 772 const struct nf_nat_range2 *range, 773 enum nf_nat_manip_type maniptype) 774 { 775 struct net *net = nf_ct_net(ct); 776 struct nf_conntrack_tuple curr_tuple, new_tuple; 777 778 /* Can't setup nat info for confirmed ct. 
	 */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
		maniptype != NF_NAT_MANIP_DST);

	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
		return NF_DROP;

	/* What we've got will look like inverse of reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuple(&curr_tuple,
			   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so it will recognize replies. */
		nf_ct_invert_tuple(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		if (nfct_help(ct) && !nfct_seqadj(ct))
			if (!nfct_seqadj_ext_add(ct))
				return NF_DROP;
	}

	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;
		spinlock_t *lock;

		srchash = hash_by_src(net, nf_ct_zone(ct),
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
		spin_lock_bh(lock);
		hlist_add_head_rcu(&ct->nat_bysource,
				   &nf_nat_bysource[srchash]);
		spin_unlock_bh(lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);

static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
	/* Force range to this IP; let proto decide mapping for
	 * per-proto parts (hence not NF_NAT_RANGE_PROTO_SPECIFIED).
	 * Use reply in case it's already been mangled (eg local packet).
	 */
	union nf_inet_addr ip =
		(manip == NF_NAT_MANIP_SRC ?
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
	struct nf_nat_range2 range = {
		.flags = NF_NAT_RANGE_MAP_IPS,
		.min_addr = ip,
		.max_addr = ip,
	};
	return nf_nat_setup_info(ct, &range, manip);
}

unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);

/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	unsigned int verdict = NF_ACCEPT;
	unsigned long statusbit;

	if (mtype == NF_NAT_MANIP_SRC)
		statusbit = IPS_SRC_NAT;
	else
		statusbit = IPS_DST_NAT;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (ct->status & statusbit)
		verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);

	return verdict;
}
EXPORT_SYMBOL_GPL(nf_nat_packet);

static bool in_vrf_postrouting(const struct nf_hook_state *state)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	if (state->hook == NF_INET_POST_ROUTING &&
	    netif_is_l3_master(state->out))
		return true;
#endif
	return false;
}

unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
	       const struct nf_hook_state *state)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

	ct = nf_ct_get(skb, &ctinfo);
	/* Can't track? It's not due to stress, or conntrack would
	 * have dropped it. Hence it's the user's responsibility to
	 * packet filter it out, or implement conntrack/NAT for that
	 * protocol. 8) --RR
	 */
	if (!ct || in_vrf_postrouting(state))
		return NF_ACCEPT;

	nat = nfct_nat(ct);

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		/* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */
	case IP_CT_NEW:
		/* Seen it before? This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, maniptype)) {
			struct nf_nat_lookup_hook_priv *lpriv = priv;
			struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
			unsigned int ret;
			int i;

			if (!e)
				goto null_bind;

			for (i = 0; i < e->num_hook_entries; i++) {
				ret = e->hooks[i].hook(e->hooks[i].priv, skb,
						       state);
				if (ret != NF_ACCEPT)
					return ret;
				if (nf_nat_initialized(ct, maniptype))
					goto do_nat;
			}
null_bind:
			ret = nf_nat_alloc_null_binding(ct, state->hook);
			if (ret != NF_ACCEPT)
				return ret;
		} else {
			pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
				 ct, ct->status);
			if (nf_nat_oif_changed(state->hook, ctinfo, nat,
					       state->out))
				goto oif_changed;
		}
		break;
	default:
		/* ESTABLISHED */
		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
			ctinfo != IP_CT_ESTABLISHED_REPLY);
		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
			goto oif_changed;
	}
do_nat:
	return nf_nat_packet(ct, ctinfo, state->hook, skb);

oif_changed:
	nf_ct_kill_acct(ct, ctinfo, skb);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_fn);

struct nf_nat_proto_clean {
	u8	l3proto;
	u8	l4proto;
};

/* kill conntracks with affected NAT section */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
	const struct nf_nat_proto_clean *clean = data;

	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
		return 0;

	return i->status & IPS_NAT_MASK ? 1 : 0;
}

static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	unsigned int h;

	h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
	hlist_del_rcu(&ct->nat_bysource);
	spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
}

static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
	if (nf_nat_proto_remove(ct, data))
		return 1;

	/* This module is being removed and the conntrack has a nat null
	 * binding. Remove it from the bysource hash, as the table will be
	 * freed soon.
	 *
	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
	 * will delete the entry from the already-freed table.
	 */
	if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
		nf_nat_cleanup_conntrack(ct);

	/* don't delete conntrack. Although that would make things a lot
	 * simpler, we'd end up flushing all conntracks on nat rmmod.
	 */
	return 0;
}

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
};

static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
					  struct nf_nat_range2 *range)
{
	if (tb[CTA_PROTONAT_PORT_MIN]) {
		range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
		range->max_proto.all = range->min_proto.all;
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
	}
	if (tb[CTA_PROTONAT_PORT_MAX]) {
		range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
	}
	return 0;
}

static int nfnetlink_parse_nat_proto(struct nlattr *attr,
				     const struct nf_conn *ct,
				     struct nf_nat_range2 *range)
{
	struct nlattr *tb[CTA_PROTONAT_MAX+1];
	int err;

	err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr,
					  protonat_nla_policy, NULL);
	if (err < 0)
		return err;

	return nf_nat_l4proto_nlattr_to_range(tb, range);
}

static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 },
	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 },
	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
};

static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
				       struct nf_nat_range2 *range)
{
	if (tb[CTA_NAT_V4_MINIP]) {
		range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
		range->flags |= NF_NAT_RANGE_MAP_IPS;
	}

	range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP],
						  range->min_addr.ip);

	return 0;
}

static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
				       struct nf_nat_range2 *range)
{
	if (tb[CTA_NAT_V6_MINIP]) {
		nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
			   sizeof(struct in6_addr));
		range->flags |= NF_NAT_RANGE_MAP_IPS;
	}

	if (tb[CTA_NAT_V6_MAXIP])
		nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP],
			   sizeof(struct in6_addr));
	else
		range->max_addr = range->min_addr;

	return 0;
}

static int
nfnetlink_parse_nat(const struct nlattr *nat,
		    const struct nf_conn *ct, struct nf_nat_range2 *range)
{
	struct nlattr *tb[CTA_NAT_MAX+1];
	int err;

	memset(range, 0, sizeof(*range));

	err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat,
					  nat_nla_policy, NULL);
	if (err < 0)
		return err;

	switch (nf_ct_l3num(ct)) {
	case NFPROTO_IPV4:
		err = nf_nat_ipv4_nlattr_to_range(tb, range);
		break;
	case NFPROTO_IPV6:
		err = nf_nat_ipv6_nlattr_to_range(tb, range);
		break;
	default:
		err = -EPROTONOSUPPORT;
		break;
	}

	if (err)
		return err;

	if (!tb[CTA_NAT_PROTO])
		return 0;

	return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
}

/* This function is called under rcu_read_lock() */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	struct nf_nat_range2 range;
	int err;

	/* Should not happen, restricted to creating new conntracks
	 * via ctnetlink.
	 */
	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
		return -EEXIST;

	/* No NAT information has been passed, allocate the null-binding */
	if (attr == NULL)
		return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0;

	err = nfnetlink_parse_nat(attr, ct, &range);
	if (err < 0)
		return err;

	return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
}
#else
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	return -EOPNOTSUPP;
}
#endif

static struct nf_ct_helper_expectfn follow_master_nat = {
	.name		= "nat-follow-master",
	.expectfn	= nf_nat_follow_master,
};

int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
		       const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	unsigned int hooknum = ops->hooknum;
	struct nf_hook_ops *nat_ops;
	int i, ret;

	if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
		return -EINVAL;

	nat_proto_net = &nat_net->nat_proto_net[pf];

	for (i = 0; i < ops_count; i++) {
		if (orig_nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}

	if (WARN_ON_ONCE(i == ops_count))
		return -EINVAL;

	mutex_lock(&nf_nat_proto_mutex);
	if (!nat_proto_net->nat_hook_ops) {
		WARN_ON(nat_proto_net->users != 0);

		nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL);
		if (!nat_ops) {
			mutex_unlock(&nf_nat_proto_mutex);
			return -ENOMEM;
		}

		for (i = 0; i < ops_count; i++) {
			priv = kzalloc(sizeof(*priv), GFP_KERNEL);
			if (priv) {
				nat_ops[i].priv = priv;
				continue;
			}
			mutex_unlock(&nf_nat_proto_mutex);
			while (i)
				kfree(nat_ops[--i].priv);
			kfree(nat_ops);
			return -ENOMEM;
		}

		ret = nf_register_net_hooks(net, nat_ops, ops_count);
		if (ret < 0) {
			mutex_unlock(&nf_nat_proto_mutex);
			for (i = 0; i < ops_count; i++)
				kfree(nat_ops[i].priv);
			kfree(nat_ops);
			return ret;
		}

		nat_proto_net->nat_hook_ops = nat_ops;
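
		/* nat_ops is shared by all NAT users of this family; it is
		 * freed in nf_nat_unregister_fn() once the last user has
		 * unregistered.
		 */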
	}

	nat_ops = nat_proto_net->nat_hook_ops;
	priv = nat_ops[hooknum].priv;
	if (WARN_ON_ONCE(!priv)) {
		mutex_unlock(&nf_nat_proto_mutex);
		return -EOPNOTSUPP;
	}

	ret = nf_hook_entries_insert_raw(&priv->entries, ops);
	if (ret == 0)
		nat_proto_net->users++;

	mutex_unlock(&nf_nat_proto_mutex);
	return ret;
}

void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
			  unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	struct nf_hook_ops *nat_ops;
	int hooknum = ops->hooknum;
	int i;

	if (pf >= ARRAY_SIZE(nat_net->nat_proto_net))
		return;

	nat_proto_net = &nat_net->nat_proto_net[pf];

	mutex_lock(&nf_nat_proto_mutex);
	if (WARN_ON(nat_proto_net->users == 0))
		goto unlock;

	nat_proto_net->users--;

	nat_ops = nat_proto_net->nat_hook_ops;
	for (i = 0; i < ops_count; i++) {
		if (nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}
	if (WARN_ON_ONCE(i == ops_count))
		goto unlock;
	priv = nat_ops[hooknum].priv;
	nf_hook_entries_delete_raw(&priv->entries, ops);

	if (nat_proto_net->users == 0) {
		nf_unregister_net_hooks(net, nat_ops, ops_count);

		for (i = 0; i < ops_count; i++) {
			priv = nat_ops[i].priv;
			kfree_rcu(priv, rcu_head);
		}

		nat_proto_net->nat_hook_ops = NULL;
		kfree(nat_ops);
	}
unlock:
	mutex_unlock(&nf_nat_proto_mutex);
}

static struct pernet_operations nat_net_ops = {
	.id = &nat_net_id,
	.size = sizeof(struct nat_net),
};

static const struct nf_nat_hook nat_hook = {
	.parse_nat_setup	= nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
	.decode_session		= __nf_nat_decode_session,
#endif
	.remove_nat_bysrc	= nf_nat_cleanup_conntrack,
};

static int __init nf_nat_init(void)
{
	int ret, i;

	/* Leave them the same for the moment.
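	 * (i.e. size the NAT bysource hash like the main conntrack table)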
	 */
	nf_nat_htable_size = nf_conntrack_htable_size;
	if (nf_nat_htable_size < CONNTRACK_LOCKS)
		nf_nat_htable_size = CONNTRACK_LOCKS;

	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
	if (!nf_nat_bysource)
		return -ENOMEM;

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_nat_locks[i]);

	ret = register_pernet_subsys(&nat_net_ops);
	if (ret < 0) {
		kvfree(nf_nat_bysource);
		return ret;
	}

	nf_ct_helper_expectfn_register(&follow_master_nat);

	WARN_ON(nf_nat_hook != NULL);
	RCU_INIT_POINTER(nf_nat_hook, &nat_hook);

	ret = register_nf_nat_bpf();
	if (ret < 0) {
		RCU_INIT_POINTER(nf_nat_hook, NULL);
		nf_ct_helper_expectfn_unregister(&follow_master_nat);
		synchronize_net();
		unregister_pernet_subsys(&nat_net_ops);
		kvfree(nf_nat_bysource);
	}

	return ret;
}

static void __exit nf_nat_cleanup(void)
{
	struct nf_nat_proto_clean clean = {};

	nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);

	nf_ct_helper_expectfn_unregister(&follow_master_nat);
	RCU_INIT_POINTER(nf_nat_hook, NULL);

	synchronize_net();
	kvfree(nf_nat_bysource);
	unregister_pernet_subsys(&nat_net_ops);
}

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Network address translation core");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);