/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/ip.h>

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		write_pnet(&tb->ib_net, hold_net(net));
		tb->port      = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		release_net(ib_net(tb));
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
}
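/*
 * Illustrative sketch: a typical bind-time sequence over the helpers
 * above, assuming the caller already holds head->lock with BHs
 * disabled (cachep, net, head and snum stand in for the caller's own
 * values):
 *
 *	tb = inet_bind_bucket_create(cachep, net, head, snum);
 *	if (tb != NULL)
 *		inet_bind_hash(sk, tb, snum);
 *
 * Once the last owner has left tb->owners, a later call to
 * inet_bind_bucket_destroy() unlinks the bucket and returns it to the
 * cache.
 */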
/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}

EXPORT_SYMBOL(inet_put_port);

void __inet_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	sk_add_bind_node(child, &tb->owners);
	inet_csk(child)->icsk_bind_hash = tb;
	spin_unlock(&head->lock);
}

EXPORT_SYMBOL_GPL(__inet_inherit_port);

static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

	if (net_eq(sock_net(sk), net) && inet->num == hnum &&
			!ipv6_only_sock(sk)) {
		__be32 rcv_saddr = inet->rcv_saddr;
		score = sk->sk_family == PF_INET ? 1 : 0;
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;
			score += 2;
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				return -1;
			score += 2;
		}
	}
	return score;
}
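/*
 * Illustrative score table for the function above, assuming an IPv4
 * lookup for daddr/hnum arriving on interface dif (higher score wins;
 * -1 means the socket is skipped):
 *
 *	wildcard AF_INET6 listener                0
 *	wildcard AF_INET listener                 1
 *	AF_INET, bound to daddr only              3
 *	AF_INET, bound to dif only                3
 *	AF_INET, bound to daddr and dif           5
 *	bound to a different address or device   -1
 */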
/*
 * Don't inline this cruft. There are some nice properties to exploit
 * here. The BSD API does not allow a listening sock to specify the
 * remote port nor the remote address for the connection. So always
 * assume those are both wildcarded during the search since they can
 * never be otherwise.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = -1;
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup. We probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
				dif) < hiscore)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

struct sock *__inet_lookup_established(struct net *net,
				       struct inet_hashinfo *hashinfo,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & (hashinfo->ehash_size - 1);
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (INET_MATCH(sk, net, hash, acookie,
				saddr, daddr, ports, dif)) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto begintw;
			if (unlikely(!INET_MATCH(sk, net, hash, acookie,
					saddr, daddr, ports, dif))) {
				sock_put(sk);
				goto begin;
			}
			goto out;
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup. We probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;

begintw:
	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
		if (INET_TW_MATCH(sk, net, hash, acookie,
				saddr, daddr, ports, dif)) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
				sk = NULL;
				goto out;
			}
			if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
					saddr, daddr, ports, dif))) {
				sock_put(sk);
				goto begintw;
			}
			goto out;
		}
	}
	/*
	 * If the nulls value we got at the end of this lookup is not the
	 * expected one, we must restart the lookup. We probably met an
	 * item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begintw;
	sk = NULL;
out:
	rcu_read_unlock();
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
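/*
 * Illustrative sketch: how a caller resolving an incoming IPv4 segment
 * might chain the two lookups above (skb, iph and th stand in for the
 * caller's own packet state; both lookups return a referenced sock, so
 * the caller eventually drops it with sock_put()):
 *
 *	sk = __inet_lookup_established(net, &tcp_hashinfo,
 *				       iph->saddr, th->source,
 *				       iph->daddr, ntohs(th->dest),
 *				       inet_iif(skb));
 *	if (!sk)
 *		sk = __inet_lookup_listener(net, &tcp_hashinfo,
 *					    iph->daddr, ntohs(th->dest),
 *					    inet_iif(skb));
 *
 * The second INET_MATCH/compute_score check after atomic_inc_not_zero()
 * is what guards against the socket having been reused for another
 * identity while the reference was being taken.
 */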
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->rcv_saddr;
	__be32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw;

	spin_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_nulls_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, net, hash, acookie,
				saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_nulls_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
				saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * a socket with a funny identity in the hash table.
	 */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);

		inet_twsk_put(tw);
	}

	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
					  inet->dport);
}

void __inet_hash_nolisten(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	struct inet_ehash_bucket *head;

	WARN_ON(!sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	__sk_nulls_add_node_rcu(sk, list);
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);

static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk);
		return;
	}

	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	spin_unlock(&ilb->lock);
}

void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	spinlock_t *lock;
	int done;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN)
		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
	else
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock_bh(lock);
	done = __sk_nulls_del_node_init_rcu(sk);
	if (done)
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
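/*
 * Illustrative sketch of the ephemeral port walk performed by
 * __inet_hash_connect() below when no port is bound yet (low/high come
 * from inet_get_local_port_range(), offset mixes a static hint with the
 * caller's port_offset):
 *
 *	for (i = 1; i <= high - low + 1; i++) {
 *		port = low + (i + offset) % (high - low + 1);
 *		// port is usable if its bind bucket is free, or if its
 *		// owners pass the check_established() uniqueness test
 *	}
 *
 * On success the hint advances by i, so consecutive connects start
 * probing at different ports rather than contending on the same one.
 */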
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		void (*hash)(struct sock *sk))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;
		u32 offset = hint + port_offset;
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks, because
			 * the established check is already unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (ib_net(tb) == net && tb->port == port) {
					WARN_ON(hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			hash(sk);
		}
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			inet_twsk_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
			__inet_check_established, __inet_hash_nolisten);
}

EXPORT_SYMBOL_GPL(inet_hash_connect);

void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
	}
}

EXPORT_SYMBOL_GPL(inet_hashinfo_init);
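/*
 * Illustrative sketch: a transport protocol plugs these helpers into
 * its struct proto much as TCP does (tcp_prot is the assumed example
 * here):
 *
 *	struct proto tcp_prot = {
 *		...
 *		.hash		= inet_hash,
 *		.unhash		= inet_unhash,
 *		.get_port	= inet_csk_get_port,
 *		.h.hashinfo	= &tcp_hashinfo,
 *		...
 *	};
 *
 * with inet_hashinfo_init(&tcp_hashinfo) called once at boot, before
 * any socket can be hashed into the listening table.
 */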