/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>

static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
			const __u16 lport, const __be32 faddr,
			const __be16 fport)
{
	static u32 inet_ehash_secret __read_mostly;

	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash lock for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->port = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tb->num_owners++;
	inet_csk(sk)->icsk_bind_hash = tb;
}
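
/*
 * Example (illustrative sketch, not part of the build): callers that
 * want to own a local port look the bucket up under the bhash chain
 * lock, create it if it does not exist yet, and then attach the socket:
 *
 *	spin_lock(&head->lock);
 *	inet_bind_bucket_for_each(tb, &head->chain)
 *		if (net_eq(ib_net(tb), net) && tb->port == port)
 *			goto tb_found;
 *	tb = inet_bind_bucket_create(cachep, net, head, port);
 *	if (!tb)
 *		goto fail_unlock;
 * tb_found:
 *	inet_bind_hash(sk, tb, port);
 *	spin_unlock(&head->lock);
 *
 * __inet_inherit_port() below and inet_csk_get_port() in
 * inet_connection_sock.c both follow this shape.
 */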

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	tb->num_owners--;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

int __inet_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
	    !ipv6_only_sock(sk)) {
		__be32 rcv_saddr = inet->inet_rcv_saddr;

		score = sk->sk_family == PF_INET ? 2 : 1;
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;
			score += 4;
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				return -1;
			score += 4;
		}
	}
	return score;
}
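
/*
 * Worked example for compute_score() above (illustrative): for an IPv4
 * packet to 192.0.2.1:80 arriving on ifindex 3,
 *
 *	- an AF_INET listener bound to 0.0.0.0:80 scores 2
 *	- an AF_INET6 listener accepting v4-mapped traffic on port 80
 *	  scores 1
 *	- an AF_INET listener bound to 192.0.2.1:80 scores 2 + 4 = 6
 *	- the same listener also bound to ifindex 3 (SO_BINDTODEVICE)
 *	  scores 2 + 4 + 4 = 10
 *	- a listener bound to any other address or ifindex scores -1
 *	  and is skipped
 *
 * so the most specific match wins in __inet_lookup_listener().
 */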

/*
 * Don't inline this cruft. There are some nice properties to exploit
 * here. The BSD API does not allow a listening sock to specify the
 * remote port nor the remote address for the connection. So always
 * assume those are both wildcarded during the search since they can
 * never be otherwise.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore, matches = 0, reuseport = 0;
	u32 phash = 0;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = 0;
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
			reuseport = sk->sk_reuseport;
			if (reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				matches = 1;
			}
		} else if (score == hiscore && reuseport) {
			matches++;
			if (reciprocal_scale(phash, matches) == 0)
				result = sk;
			phash = next_pseudo_random32(phash);
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup. We probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
				  dif) < hiscore)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!atomic_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);
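
/*
 * Note on the lockless lookup pattern used above and below
 * (illustrative summary; see Documentation/RCU/rculist_nulls.txt):
 * sockets sit on hlist_nulls chains allocated from SLAB_DESTROY_BY_RCU
 * slabs, so an entry can be freed and recycled as a different socket
 * while a reader walks the chain. A reader therefore looks like:
 *
 *	rcu_read_lock();
 * begin:
 *	sk_nulls_for_each_rcu(sk, node, chain) {
 *		if (keys_match(sk))
 *			goto found;
 *	}
 *	if (get_nulls_value(node) != expected_slot)
 *		goto begin;	// moved to another chain, restart
 *
 * and on a hit it must take a reference with atomic_inc_not_zero() and
 * then re-check the keys, since the socket may have been recycled in
 * the meantime. keys_match() and expected_slot are placeholders for
 * INET_MATCH()/compute_score() and the per-slot nulls value used here.
 */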

struct sock *__inet_lookup_established(struct net *net,
				       struct inet_hashinfo *hashinfo,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!INET_MATCH(sk, net, acookie,
						 saddr, daddr, ports, dif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup. We probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	rcu_read_unlock();
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;
	int twrefcnt = 0;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(INET_MATCH(sk2, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see in the
	 * hash table a socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		twrefcnt = inet_twsk_unhash(tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	if (twrefcnt)
		inet_twsk_put(tw);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw);

		inet_twsk_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}
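
/*
 * Illustration (not compiled): __inet_check_established() guarantees
 * the 4-tuple (saddr, sport, daddr, dport) is unique in the established
 * table before connect() commits to a port. If two sockets both connect
 * from 10.0.0.1 to 10.0.0.2:80, the second one probing the same local
 * port finds the first under the ehash bucket lock, gets
 * -EADDRNOTAVAIL, and __inet_hash_connect() moves on to the next
 * candidate port. A TCP_TIME_WAIT occupant is the one exception: when
 * twsk_unique() says the old tuple is safe to reuse (e.g. TCP
 * timestamps keep sequence numbers monotonic), the timewait socket is
 * unhashed and the new socket takes its place.
 */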
static u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	struct inet_ehash_bucket *head;
	spinlock_t *lock;
	int twrefcnt = 0;

	WARN_ON(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	__sk_nulls_add_node_rcu(sk, list);
	if (tw) {
		WARN_ON(sk->sk_hash != tw->tw_hash);
		twrefcnt = inet_twsk_unhash(tw);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;

	if (sk->sk_state != TCP_LISTEN)
		return __inet_hash_nolisten(sk, tw);

	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	spin_unlock(&ilb->lock);
	return 0;
}
EXPORT_SYMBOL(__inet_hash);

void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk, NULL);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	spinlock_t *lock;
	int done;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN)
		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
	else
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock_bh(lock);
	done = __sk_nulls_del_node_init_rcu(sk);
	if (done)
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
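
/*
 * Worked example for the port search in __inet_hash_connect() below
 * (illustrative numbers): with ip_local_port_range = 32768..60999,
 * remaining = 28232. Candidate ports are probed as
 *
 *	port = low + (i + offset) % remaining;		i = 0, 1, 2, ...
 *
 * where offset = hint + inet_sk_port_offset(sk), rounded down to an
 * even number. Reserved ports and bind buckets created by bind() users
 * (fastreuse >= 0) are skipped, and the first port whose 4-tuple passes
 * check_established() wins. Because the search usually succeeds within
 * a few probes and the hint then advances by an even amount
 * ((i + 2) & ~1), successive connect()s tend to consume only every
 * other port, leaving roughly half the range for bind(0) users.
 */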
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);
	int twrefcnt = 1;

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(net, &low, &high);
		remaining = (high - low) + 1;

		/* By starting with offset being an even number,
		 * we tend to leave about 50% of ports for other uses,
		 * like bind(0).
		 */
		offset &= ~1;

		local_bh_disable();
		for (i = 0; i < remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_local_reserved_port(net, port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks, because
			 * the established check is already unique enough.
			 */
			inet_bind_bucket_for_each(tb, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					if (tb->fastreuse >= 0 ||
					    tb->fastreuseport >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			tb->fastreuseport = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += (i + 2) & ~1;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->inet_sport = htons(port);
			twrefcnt += __inet_hash_nolisten(sk, tw);
		}
		if (tw)
			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw);
			while (twrefcnt) {
				twrefcnt--;
				inet_twsk_put(tw);
			}
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__inet_hash_nolisten(sk, NULL);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
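
/*
 * Illustration (not compiled): the connect() path ends with roughly
 *
 *	...set up route and destination address...
 *	err = inet_hash_connect(&tcp_death_row, sk);
 *	if (err)
 *		goto failure;
 *
 * as in tcp_v4_connect(), which picks the ephemeral port (if the
 * socket is not already bound) and inserts the socket into the
 * established hash in one step, so the 4-tuple is visible to lookups
 * before the SYN is sent.
 */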
/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	u32 port_offset = 0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);
	return __inet_hash_connect(death_row, sk, port_offset,
				   __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
	}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);

int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;

	if (locksz != 0) {
		/* allocate 2 cache lines or at least one spinlock per cpu */
		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());

		/* no more locks than number of hash buckets */
		nblocks = min(nblocks, hashinfo->ehash_mask + 1);

		hashinfo->ehash_locks = kmalloc_array(nblocks, locksz,
						      GFP_KERNEL | __GFP_NOWARN);
		if (!hashinfo->ehash_locks)
			hashinfo->ehash_locks = vmalloc(nblocks * locksz);

		if (!hashinfo->ehash_locks)
			return -ENOMEM;

		for (i = 0; i < nblocks; i++)
			spin_lock_init(&hashinfo->ehash_locks[i]);
	}
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
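
/*
 * Worked example for inet_ehash_locks_alloc() (illustrative numbers):
 * with 64-byte cache lines and a 4-byte spinlock_t on a 16-CPU machine,
 *
 *	nblocks = max(2 * 64 / 4, 1)          = 32 locks per cpu
 *	nblocks = roundup_pow_of_two(32 * 16) = 512
 *
 * then clamped to the number of ehash buckets. The power-of-two count
 * lets inet_ehash_lockp() pick a lock with a simple mask:
 *
 *	&hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]
 *
 * When spinlock_t has zero size (SMP and all spinlock debugging
 * disabled), no lock array is allocated and the mask stays 0, since
 * the lock operations compile away to nothing.
 */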