/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>

static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
				 const __u16 lport, const __be32 faddr,
				 const __be16 fport)
{
	static u32 inet_ehash_secret __read_mostly;

	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}

static unsigned int inet_sk_ehashfn(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const __be32 laddr = inet->inet_rcv_saddr;
	const __u16 lport = inet->inet_num;
	const __be32 faddr = inet->inet_daddr;
	const __be16 fport = inet->inet_dport;
	struct net *net = sock_net(sk);

	return inet_ehashfn(net, laddr, lport, faddr, fport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		write_pnet(&tb->ib_net, hold_net(net));
		tb->port = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		release_net(ib_net(tb));
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;

	atomic_inc(&hashinfo->bsockets);

	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tb->num_owners++;
	inet_csk(sk)->icsk_bind_hash = tb;
}
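/*
 * Illustrative sketch (not part of this file): a bind(2)-style caller pairs
 * the two helpers above under the bind hash chain lock, much as
 * inet_csk_get_port() does.  "hashinfo", "net", "sk" and "snum" stand in
 * for the caller's context:
 *
 *	struct inet_bind_hashbucket *head;
 *	struct inet_bind_bucket *tb;
 *
 *	head = &hashinfo->bhash[inet_bhashfn(net, snum,
 *					     hashinfo->bhash_size)];
 *	spin_lock_bh(&head->lock);
 *	inet_bind_bucket_for_each(tb, &head->chain)
 *		if (net_eq(ib_net(tb), net) && tb->port == snum)
 *			break;
 *	if (!tb)
 *		tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
 *					     net, head, snum);
 *	if (!tb) {
 *		spin_unlock_bh(&head->lock);
 *		return -ENOMEM;
 *	}
 *	inet_bind_hash(sk, tb, snum);
 *	spin_unlock_bh(&head->lock);
 */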
/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	atomic_dec(&hashinfo->bsockets);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	tb->num_owners--;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

int __inet_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
	    !ipv6_only_sock(sk)) {
		__be32 rcv_saddr = inet->inet_rcv_saddr;

		score = sk->sk_family == PF_INET ? 2 : 1;
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;
			score += 4;
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				return -1;
			score += 4;
		}
	}
	return score;
}
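/*
 * Worked example of the scoring above, for a packet to daddr on device dif:
 * an AF_INET listener bound to that exact address and device scores
 * 2 + 4 + 4 = 10; the same listener bound to the address only scores 6; a
 * pure wildcard AF_INET listener scores 2.  An AF_INET6 socket that also
 * accepts IPv4 scores one less in each case, so an exact IPv4 listener
 * always wins.  Any mismatch on address or device disqualifies the socket
 * outright (score -1).
 */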
/*
 * Don't inline this cruft.  There are some nice properties to exploit here,
 * though: the BSD API does not allow a listening sock to specify the remote
 * port or the remote address for the connection, so always assume those are
 * both wildcarded during the search since they can never be otherwise.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore, matches = 0, reuseport = 0;
	u32 phash = 0;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = 0;
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
			reuseport = sk->sk_reuseport;
			if (reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				matches = 1;
			}
		} else if (score == hiscore && reuseport) {
			matches++;
			if (reciprocal_scale(phash, matches) == 0)
				result = sk;
			phash = next_pseudo_random32(phash);
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup.  We probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
						dif) < hiscore)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!atomic_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);
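/*
 * Illustrative sketch: a typical caller of the established-table lookup
 * below releases the reference it obtained with sock_gen_put(), which does
 * the right thing for both full sockets and TIME_WAIT minisockets.  The
 * addresses and ports here are placeholders taken from an incoming packet:
 *
 *	struct sock *sk;
 *
 *	sk = __inet_lookup_established(net, hashinfo, saddr, sport,
 *				       daddr, ntohs(dport), dif);
 *	if (sk) {
 *		... process the packet under this socket ...
 *		sock_gen_put(sk);
 *	}
 */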
struct sock *__inet_lookup_established(struct net *net,
				       struct inet_hashinfo *hashinfo,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!INET_MATCH(sk, net, acookie,
						 saddr, daddr, ports, dif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup.  We probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	rcu_read_unlock();
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
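/*
 * For reference, the __inet_lookup() helper in include/net/inet_hashtables.h
 * composes the two lookups above, trying the established table first and
 * falling back to listeners, roughly:
 *
 *	sk = __inet_lookup_established(net, hashinfo, saddr, sport,
 *				       daddr, hnum, dif);
 *	if (!sk)
 *		sk = __inet_lookup_listener(net, hashinfo, saddr, sport,
 *					    daddr, hnum, dif);
 */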
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;
	int twrefcnt = 0;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(INET_MATCH(sk2, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now.  Otherwise the socket will show
	 * up in the hash table with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		twrefcnt = inet_twsk_unhash(tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	if (twrefcnt)
		inet_twsk_put(tw);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);

		inet_twsk_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	struct inet_ehash_bucket *head;
	int twrefcnt = 0;

	WARN_ON(!sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	__sk_nulls_add_node_rcu(sk, list);
	if (tw) {
		WARN_ON(sk->sk_hash != tw->tw_hash);
		twrefcnt = inet_twsk_unhash(tw);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk, NULL);
		return;
	}

	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	spin_unlock(&ilb->lock);
}

void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	spinlock_t *lock;
	int done;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN)
		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
	else
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock_bh(lock);
	done = __sk_nulls_del_node_init_rcu(sk);
	if (done)
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
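/*
 * Worked example for the ephemeral port search in __inet_hash_connect()
 * below: with the default ip_local_port_range of 32768..61000,
 * remaining = 28233.  Starting from a per-destination offset
 * (hint + port_offset), the candidates are low + (i + offset) % remaining
 * for i = 1..remaining, so every port in the range is probed exactly once
 * before -EADDRNOTAVAIL is returned.
 */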
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);
	int twrefcnt = 1;

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(net, &low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_local_reserved_port(net, port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks, because
			 * the established check is already unique enough.
			 */
			inet_bind_bucket_for_each(tb, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					if (tb->fastreuse >= 0 ||
					    tb->fastreuseport >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			tb->fastreuseport = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->inet_sport = htons(port);
			twrefcnt += hash(sk, tw);
		}
		if (tw)
			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			while (twrefcnt) {
				twrefcnt--;
				inet_twsk_put(tw);
			}
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk, NULL);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
				   __inet_check_established,
				   __inet_hash_nolisten);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	atomic_set(&h->bsockets, 0);
	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
	}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
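/*
 * Illustrative sketch: a transport protocol initializes its hash tables
 * once at boot, roughly as tcp_init() does for tcp_hashinfo.  The names
 * "my_hashinfo" and "my_proto_init" are hypothetical, and the ehash/bhash
 * allocation is only indicated:
 *
 *	static struct inet_hashinfo my_hashinfo;
 *
 *	static int __init my_proto_init(void)
 *	{
 *		inet_hashinfo_init(&my_hashinfo);
 *		... then allocate my_hashinfo.ehash and my_hashinfo.bhash,
 *		    set ehash_mask and bhash_size, and create
 *		    bind_bucket_cachep before first use ...
 *		return 0;
 *	}
 */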