1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Support for INET connection oriented protocols. 7 * 8 * Authors: See the TCP sources 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public License 12 * as published by the Free Software Foundation; either version 13 * 2 of the License, or(at your option) any later version. 14 */ 15 16 #include <linux/config.h> 17 #include <linux/module.h> 18 #include <linux/jhash.h> 19 20 #include <net/inet_connection_sock.h> 21 #include <net/inet_hashtables.h> 22 #include <net/inet_timewait_sock.h> 23 #include <net/ip.h> 24 #include <net/route.h> 25 #include <net/tcp_states.h> 26 #include <net/xfrm.h> 27 28 #ifdef INET_CSK_DEBUG 29 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; 30 EXPORT_SYMBOL(inet_csk_timer_bug_msg); 31 #endif 32 33 /* 34 * This array holds the first and last local port number. 35 * For high-usage systems, use sysctl to change this to 36 * 32768-61000 37 */ 38 int sysctl_local_port_range[2] = { 1024, 4999 }; 39 40 static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) 41 { 42 const u32 sk_rcv_saddr = inet_rcv_saddr(sk); 43 struct sock *sk2; 44 struct hlist_node *node; 45 int reuse = sk->sk_reuse; 46 47 sk_for_each_bound(sk2, node, &tb->owners) { 48 if (sk != sk2 && 49 !inet_v6_ipv6only(sk2) && 50 (!sk->sk_bound_dev_if || 51 !sk2->sk_bound_dev_if || 52 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 53 if (!reuse || !sk2->sk_reuse || 54 sk2->sk_state == TCP_LISTEN) { 55 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); 56 if (!sk2_rcv_saddr || !sk_rcv_saddr || 57 sk2_rcv_saddr == sk_rcv_saddr) 58 break; 59 } 60 } 61 } 62 return node != NULL; 63 } 64 65 /* Obtain a reference to a local port for the given sock, 66 * if snum is zero it means select any available local port. 67 */ 68 int inet_csk_get_port(struct inet_hashinfo *hashinfo, 69 struct sock *sk, unsigned short snum) 70 { 71 struct inet_bind_hashbucket *head; 72 struct hlist_node *node; 73 struct inet_bind_bucket *tb; 74 int ret; 75 76 local_bh_disable(); 77 if (!snum) { 78 int low = sysctl_local_port_range[0]; 79 int high = sysctl_local_port_range[1]; 80 int remaining = (high - low) + 1; 81 int rover = net_random() % (high - low) + low; 82 83 do { 84 head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; 85 spin_lock(&head->lock); 86 inet_bind_bucket_for_each(tb, node, &head->chain) 87 if (tb->port == rover) 88 goto next; 89 break; 90 next: 91 spin_unlock(&head->lock); 92 if (++rover > high) 93 rover = low; 94 } while (--remaining > 0); 95 96 /* Exhausted local port range during search? It is not 97 * possible for us to be holding one of the bind hash 98 * locks if this test triggers, because if 'remaining' 99 * drops to zero, we broke out of the do/while loop at 100 * the top level, not from the 'break;' statement. 101 */ 102 ret = 1; 103 if (remaining <= 0) 104 goto fail; 105 106 /* OK, here is the one we will use. HEAD is 107 * non-NULL and we hold it's mutex. 108 */ 109 snum = rover; 110 } else { 111 head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; 112 spin_lock(&head->lock); 113 inet_bind_bucket_for_each(tb, node, &head->chain) 114 if (tb->port == snum) 115 goto tb_found; 116 } 117 tb = NULL; 118 goto tb_not_found; 119 tb_found: 120 if (!hlist_empty(&tb->owners)) { 121 if (sk->sk_reuse > 1) 122 goto success; 123 if (tb->fastreuse > 0 && 124 sk->sk_reuse && sk->sk_state != TCP_LISTEN) { 125 goto success; 126 } else { 127 ret = 1; 128 if (inet_csk_bind_conflict(sk, tb)) 129 goto fail_unlock; 130 } 131 } 132 tb_not_found: 133 ret = 1; 134 if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) 135 goto fail_unlock; 136 if (hlist_empty(&tb->owners)) { 137 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) 138 tb->fastreuse = 1; 139 else 140 tb->fastreuse = 0; 141 } else if (tb->fastreuse && 142 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) 143 tb->fastreuse = 0; 144 success: 145 if (!inet_csk(sk)->icsk_bind_hash) 146 inet_bind_hash(sk, tb, snum); 147 BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); 148 ret = 0; 149 150 fail_unlock: 151 spin_unlock(&head->lock); 152 fail: 153 local_bh_enable(); 154 return ret; 155 } 156 157 EXPORT_SYMBOL_GPL(inet_csk_get_port); 158 159 /* 160 * Wait for an incoming connection, avoid race conditions. This must be called 161 * with the socket locked. 162 */ 163 static int inet_csk_wait_for_connect(struct sock *sk, long timeo) 164 { 165 struct inet_connection_sock *icsk = inet_csk(sk); 166 DEFINE_WAIT(wait); 167 int err; 168 169 /* 170 * True wake-one mechanism for incoming connections: only 171 * one process gets woken up, not the 'whole herd'. 172 * Since we do not 'race & poll' for established sockets 173 * anymore, the common case will execute the loop only once. 174 * 175 * Subtle issue: "add_wait_queue_exclusive()" will be added 176 * after any current non-exclusive waiters, and we know that 177 * it will always _stay_ after any new non-exclusive waiters 178 * because all non-exclusive waiters are added at the 179 * beginning of the wait-queue. As such, it's ok to "drop" 180 * our exclusiveness temporarily when we get woken up without 181 * having to remove and re-insert us on the wait queue. 182 */ 183 for (;;) { 184 prepare_to_wait_exclusive(sk->sk_sleep, &wait, 185 TASK_INTERRUPTIBLE); 186 release_sock(sk); 187 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) 188 timeo = schedule_timeout(timeo); 189 lock_sock(sk); 190 err = 0; 191 if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) 192 break; 193 err = -EINVAL; 194 if (sk->sk_state != TCP_LISTEN) 195 break; 196 err = sock_intr_errno(timeo); 197 if (signal_pending(current)) 198 break; 199 err = -EAGAIN; 200 if (!timeo) 201 break; 202 } 203 finish_wait(sk->sk_sleep, &wait); 204 return err; 205 } 206 207 /* 208 * This will accept the next outstanding connection. 209 */ 210 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) 211 { 212 struct inet_connection_sock *icsk = inet_csk(sk); 213 struct sock *newsk; 214 int error; 215 216 lock_sock(sk); 217 218 /* We need to make sure that this socket is listening, 219 * and that it has something pending. 220 */ 221 error = -EINVAL; 222 if (sk->sk_state != TCP_LISTEN) 223 goto out_err; 224 225 /* Find already established connection */ 226 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { 227 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); 228 229 /* If this is a non blocking socket don't sleep */ 230 error = -EAGAIN; 231 if (!timeo) 232 goto out_err; 233 234 error = inet_csk_wait_for_connect(sk, timeo); 235 if (error) 236 goto out_err; 237 } 238 239 newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); 240 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); 241 out: 242 release_sock(sk); 243 return newsk; 244 out_err: 245 newsk = NULL; 246 *err = error; 247 goto out; 248 } 249 250 EXPORT_SYMBOL(inet_csk_accept); 251 252 /* 253 * Using different timers for retransmit, delayed acks and probes 254 * We may wish use just one timer maintaining a list of expire jiffies 255 * to optimize. 256 */ 257 void inet_csk_init_xmit_timers(struct sock *sk, 258 void (*retransmit_handler)(unsigned long), 259 void (*delack_handler)(unsigned long), 260 void (*keepalive_handler)(unsigned long)) 261 { 262 struct inet_connection_sock *icsk = inet_csk(sk); 263 264 init_timer(&icsk->icsk_retransmit_timer); 265 init_timer(&icsk->icsk_delack_timer); 266 init_timer(&sk->sk_timer); 267 268 icsk->icsk_retransmit_timer.function = retransmit_handler; 269 icsk->icsk_delack_timer.function = delack_handler; 270 sk->sk_timer.function = keepalive_handler; 271 272 icsk->icsk_retransmit_timer.data = 273 icsk->icsk_delack_timer.data = 274 sk->sk_timer.data = (unsigned long)sk; 275 276 icsk->icsk_pending = icsk->icsk_ack.pending = 0; 277 } 278 279 EXPORT_SYMBOL(inet_csk_init_xmit_timers); 280 281 void inet_csk_clear_xmit_timers(struct sock *sk) 282 { 283 struct inet_connection_sock *icsk = inet_csk(sk); 284 285 icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; 286 287 sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 288 sk_stop_timer(sk, &icsk->icsk_delack_timer); 289 sk_stop_timer(sk, &sk->sk_timer); 290 } 291 292 EXPORT_SYMBOL(inet_csk_clear_xmit_timers); 293 294 void inet_csk_delete_keepalive_timer(struct sock *sk) 295 { 296 sk_stop_timer(sk, &sk->sk_timer); 297 } 298 299 EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); 300 301 void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) 302 { 303 sk_reset_timer(sk, &sk->sk_timer, jiffies + len); 304 } 305 306 EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); 307 308 struct dst_entry* inet_csk_route_req(struct sock *sk, 309 const struct request_sock *req) 310 { 311 struct rtable *rt; 312 const struct inet_request_sock *ireq = inet_rsk(req); 313 struct ip_options *opt = inet_rsk(req)->opt; 314 struct flowi fl = { .oif = sk->sk_bound_dev_if, 315 .nl_u = { .ip4_u = 316 { .daddr = ((opt && opt->srr) ? 317 opt->faddr : 318 ireq->rmt_addr), 319 .saddr = ireq->loc_addr, 320 .tos = RT_CONN_FLAGS(sk) } }, 321 .proto = sk->sk_protocol, 322 .uli_u = { .ports = 323 { .sport = inet_sk(sk)->sport, 324 .dport = ireq->rmt_port } } }; 325 326 if (ip_route_output_flow(&rt, &fl, sk, 0)) { 327 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); 328 return NULL; 329 } 330 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { 331 ip_rt_put(rt); 332 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); 333 return NULL; 334 } 335 return &rt->u.dst; 336 } 337 338 EXPORT_SYMBOL_GPL(inet_csk_route_req); 339 340 static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, 341 const u32 rnd, const u16 synq_hsize) 342 { 343 return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); 344 } 345 346 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 347 #define AF_INET_FAMILY(fam) ((fam) == AF_INET) 348 #else 349 #define AF_INET_FAMILY(fam) 1 350 #endif 351 352 struct request_sock *inet_csk_search_req(const struct sock *sk, 353 struct request_sock ***prevp, 354 const __u16 rport, const __u32 raddr, 355 const __u32 laddr) 356 { 357 const struct inet_connection_sock *icsk = inet_csk(sk); 358 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; 359 struct request_sock *req, **prev; 360 361 for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, 362 lopt->nr_table_entries)]; 363 (req = *prev) != NULL; 364 prev = &req->dl_next) { 365 const struct inet_request_sock *ireq = inet_rsk(req); 366 367 if (ireq->rmt_port == rport && 368 ireq->rmt_addr == raddr && 369 ireq->loc_addr == laddr && 370 AF_INET_FAMILY(req->rsk_ops->family)) { 371 BUG_TRAP(!req->sk); 372 *prevp = prev; 373 break; 374 } 375 } 376 377 return req; 378 } 379 380 EXPORT_SYMBOL_GPL(inet_csk_search_req); 381 382 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, 383 const unsigned timeout) 384 { 385 struct inet_connection_sock *icsk = inet_csk(sk); 386 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; 387 const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, 388 lopt->hash_rnd, lopt->nr_table_entries); 389 390 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); 391 inet_csk_reqsk_queue_added(sk, timeout); 392 } 393 394 /* Only thing we need from tcp.h */ 395 extern int sysctl_tcp_synack_retries; 396 397 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); 398 399 void inet_csk_reqsk_queue_prune(struct sock *parent, 400 const unsigned long interval, 401 const unsigned long timeout, 402 const unsigned long max_rto) 403 { 404 struct inet_connection_sock *icsk = inet_csk(parent); 405 struct request_sock_queue *queue = &icsk->icsk_accept_queue; 406 struct listen_sock *lopt = queue->listen_opt; 407 int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; 408 int thresh = max_retries; 409 unsigned long now = jiffies; 410 struct request_sock **reqp, *req; 411 int i, budget; 412 413 if (lopt == NULL || lopt->qlen == 0) 414 return; 415 416 /* Normally all the openreqs are young and become mature 417 * (i.e. converted to established socket) for first timeout. 418 * If synack was not acknowledged for 3 seconds, it means 419 * one of the following things: synack was lost, ack was lost, 420 * rtt is high or nobody planned to ack (i.e. synflood). 421 * When server is a bit loaded, queue is populated with old 422 * open requests, reducing effective size of queue. 423 * When server is well loaded, queue size reduces to zero 424 * after several minutes of work. It is not synflood, 425 * it is normal operation. The solution is pruning 426 * too old entries overriding normal timeout, when 427 * situation becomes dangerous. 428 * 429 * Essentially, we reserve half of room for young 430 * embrions; and abort old ones without pity, if old 431 * ones are about to clog our table. 432 */ 433 if (lopt->qlen>>(lopt->max_qlen_log-1)) { 434 int young = (lopt->qlen_young<<1); 435 436 while (thresh > 2) { 437 if (lopt->qlen < young) 438 break; 439 thresh--; 440 young <<= 1; 441 } 442 } 443 444 if (queue->rskq_defer_accept) 445 max_retries = queue->rskq_defer_accept; 446 447 budget = 2 * (lopt->nr_table_entries / (timeout / interval)); 448 i = lopt->clock_hand; 449 450 do { 451 reqp=&lopt->syn_table[i]; 452 while ((req = *reqp) != NULL) { 453 if (time_after_eq(now, req->expires)) { 454 if ((req->retrans < thresh || 455 (inet_rsk(req)->acked && req->retrans < max_retries)) 456 && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { 457 unsigned long timeo; 458 459 if (req->retrans++ == 0) 460 lopt->qlen_young--; 461 timeo = min((timeout << req->retrans), max_rto); 462 req->expires = now + timeo; 463 reqp = &req->dl_next; 464 continue; 465 } 466 467 /* Drop this request */ 468 inet_csk_reqsk_queue_unlink(parent, req, reqp); 469 reqsk_queue_removed(queue, req); 470 reqsk_free(req); 471 continue; 472 } 473 reqp = &req->dl_next; 474 } 475 476 i = (i + 1) & (lopt->nr_table_entries - 1); 477 478 } while (--budget > 0); 479 480 lopt->clock_hand = i; 481 482 if (lopt->qlen) 483 inet_csk_reset_keepalive_timer(parent, interval); 484 } 485 486 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); 487 488 struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, 489 const gfp_t priority) 490 { 491 struct sock *newsk = sk_clone(sk, priority); 492 493 if (newsk != NULL) { 494 struct inet_connection_sock *newicsk = inet_csk(newsk); 495 496 newsk->sk_state = TCP_SYN_RECV; 497 newicsk->icsk_bind_hash = NULL; 498 499 inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; 500 newsk->sk_write_space = sk_stream_write_space; 501 502 newicsk->icsk_retransmits = 0; 503 newicsk->icsk_backoff = 0; 504 newicsk->icsk_probes_out = 0; 505 506 /* Deinitialize accept_queue to trap illegal accesses. */ 507 memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); 508 } 509 return newsk; 510 } 511 512 EXPORT_SYMBOL_GPL(inet_csk_clone); 513 514 /* 515 * At this point, there should be no process reference to this 516 * socket, and thus no user references at all. Therefore we 517 * can assume the socket waitqueue is inactive and nobody will 518 * try to jump onto it. 519 */ 520 void inet_csk_destroy_sock(struct sock *sk) 521 { 522 BUG_TRAP(sk->sk_state == TCP_CLOSE); 523 BUG_TRAP(sock_flag(sk, SOCK_DEAD)); 524 525 /* It cannot be in hash table! */ 526 BUG_TRAP(sk_unhashed(sk)); 527 528 /* If it has not 0 inet_sk(sk)->num, it must be bound */ 529 BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); 530 531 sk->sk_prot->destroy(sk); 532 533 sk_stream_kill_queues(sk); 534 535 xfrm_sk_free_policy(sk); 536 537 sk_refcnt_debug_release(sk); 538 539 atomic_dec(sk->sk_prot->orphan_count); 540 sock_put(sk); 541 } 542 543 EXPORT_SYMBOL(inet_csk_destroy_sock); 544 545 int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) 546 { 547 struct inet_sock *inet = inet_sk(sk); 548 struct inet_connection_sock *icsk = inet_csk(sk); 549 int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); 550 551 if (rc != 0) 552 return rc; 553 554 sk->sk_max_ack_backlog = 0; 555 sk->sk_ack_backlog = 0; 556 inet_csk_delack_init(sk); 557 558 /* There is race window here: we announce ourselves listening, 559 * but this transition is still not validated by get_port(). 560 * It is OK, because this socket enters to hash table only 561 * after validation is complete. 562 */ 563 sk->sk_state = TCP_LISTEN; 564 if (!sk->sk_prot->get_port(sk, inet->num)) { 565 inet->sport = htons(inet->num); 566 567 sk_dst_reset(sk); 568 sk->sk_prot->hash(sk); 569 570 return 0; 571 } 572 573 sk->sk_state = TCP_CLOSE; 574 __reqsk_queue_destroy(&icsk->icsk_accept_queue); 575 return -EADDRINUSE; 576 } 577 578 EXPORT_SYMBOL_GPL(inet_csk_listen_start); 579 580 /* 581 * This routine closes sockets which have been at least partially 582 * opened, but not yet accepted. 583 */ 584 void inet_csk_listen_stop(struct sock *sk) 585 { 586 struct inet_connection_sock *icsk = inet_csk(sk); 587 struct request_sock *acc_req; 588 struct request_sock *req; 589 590 inet_csk_delete_keepalive_timer(sk); 591 592 /* make all the listen_opt local to us */ 593 acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); 594 595 /* Following specs, it would be better either to send FIN 596 * (and enter FIN-WAIT-1, it is normal close) 597 * or to send active reset (abort). 598 * Certainly, it is pretty dangerous while synflood, but it is 599 * bad justification for our negligence 8) 600 * To be honest, we are not able to make either 601 * of the variants now. --ANK 602 */ 603 reqsk_queue_destroy(&icsk->icsk_accept_queue); 604 605 while ((req = acc_req) != NULL) { 606 struct sock *child = req->sk; 607 608 acc_req = req->dl_next; 609 610 local_bh_disable(); 611 bh_lock_sock(child); 612 BUG_TRAP(!sock_owned_by_user(child)); 613 sock_hold(child); 614 615 sk->sk_prot->disconnect(child, O_NONBLOCK); 616 617 sock_orphan(child); 618 619 atomic_inc(sk->sk_prot->orphan_count); 620 621 inet_csk_destroy_sock(child); 622 623 bh_unlock_sock(child); 624 local_bh_enable(); 625 sock_put(child); 626 627 sk_acceptq_removed(sk); 628 __reqsk_free(req); 629 } 630 BUG_TRAP(!sk->sk_ack_backlog); 631 } 632 633 EXPORT_SYMBOL_GPL(inet_csk_listen_stop); 634