1 /* 2 * INETPEER - A storage for permanent information about peers 3 * 4 * This source is covered by the GNU GPL, the same as all kernel sources. 5 * 6 * Authors: Andrey V. Savochkin <saw@msu.ru> 7 */ 8 9 #include <linux/module.h> 10 #include <linux/types.h> 11 #include <linux/slab.h> 12 #include <linux/interrupt.h> 13 #include <linux/spinlock.h> 14 #include <linux/random.h> 15 #include <linux/timer.h> 16 #include <linux/time.h> 17 #include <linux/kernel.h> 18 #include <linux/mm.h> 19 #include <linux/net.h> 20 #include <net/ip.h> 21 #include <net/inetpeer.h> 22 23 /* 24 * Theory of operations. 25 * We keep one entry for each peer IP address. The nodes contains long-living 26 * information about the peer which doesn't depend on routes. 27 * At this moment this information consists only of ID field for the next 28 * outgoing IP packet. This field is incremented with each packet as encoded 29 * in inet_getid() function (include/net/inetpeer.h). 30 * At the moment of writing this notes identifier of IP packets is generated 31 * to be unpredictable using this code only for packets subjected 32 * (actually or potentially) to defragmentation. I.e. DF packets less than 33 * PMTU in size uses a constant ID and do not use this code (see 34 * ip_select_ident() in include/net/ip.h). 35 * 36 * Route cache entries hold references to our nodes. 37 * New cache entries get references via lookup by destination IP address in 38 * the avl tree. The reference is grabbed only when it's needed i.e. only 39 * when we try to output IP packet which needs an unpredictable ID (see 40 * __ip_select_ident() in net/ipv4/route.c). 41 * Nodes are removed only when reference counter goes to 0. 42 * When it's happened the node may be removed when a sufficient amount of 43 * time has been passed since its last use. The less-recently-used entry can 44 * also be removed if the pool is overloaded i.e. if the total amount of 45 * entries is greater-or-equal than the threshold. 
 *
 * Node pool is organised as an AVL tree.
 * Such an implementation has been chosen not just for fun.  It's a way to
 * prevent easy and efficient DoS attacks by creating hash collisions.  A huge
 * amount of long living nodes in a single hash slot would significantly delay
 * lookups performed with disabled BHs.
 *
 * Serialisation issues.
 * 1.  Nodes may appear in the tree only with the pool lock held.
 * 2.  Nodes may disappear from the tree only with the pool lock held
 *     AND reference count being 0.
 * 3.  Nodes appear and disappear from the unused node list only under
 *     "inet_peer_unused_lock".
 * 4.  Global variable peer_total is modified under the pool lock.
 * 5.  struct inet_peer fields modification:
 *		avl_left, avl_right, avl_parent, avl_height: pool lock
 *		unused: unused node list lock
 *		refcnt: atomically against modifications on other CPU;
 *		   usually under some other lock to prevent node disappearing
 *		dtime: unused node list lock
 *		v4daddr: unchangeable
 *		ip_id_count: atomic value (no lock needed)
 */

/* Slab cache from which all struct inet_peer nodes are allocated. */
static struct kmem_cache *peer_cachep __read_mostly;

#define node_height(x) x->avl_height

/* Sentinel used instead of NULL for empty children.  Its height of 0
 * lets node_height() be applied unconditionally, so the rebalancing
 * code needs no NULL checks. */
#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
static const struct inet_peer peer_fake_node = {
	.avl_left	= peer_avl_empty,
	.avl_right	= peer_avl_empty,
	.avl_height	= 0
};

/* The pool itself: AVL root, the pool lock serialising all writers,
 * and the current node count ("peer_total" in the comments above). */
static struct {
	struct inet_peer *root;
	spinlock_t	lock;
	int		total;
} peers = {
	.root		= peer_avl_empty,
	.lock		= __SPIN_LOCK_UNLOCKED(peers.lock),
	.total		= 0,
};

/* Maximum descent depth of a lookup; sufficient for about 2^27 nodes. */
#define PEER_MAXDEPTH 40

/* Exported for sysctl_net_ipv4. */
int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries more
							 * aggressively at this stage */
int inet_peer_minttl __read_mostly = 120 * HZ;		/* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */
int inet_peer_gc_mintime __read_mostly = 10 * HZ;
int inet_peer_gc_maxtime __read_mostly = 120 * HZ;

/* LRU list of nodes whose refcnt has dropped to zero: reclaim candidates.
 * Guarded by its own lock so that inet_putpeer() need not take the pool
 * lock (serialisation rule 3 in the header comment). */
static struct {
	struct list_head	list;
	spinlock_t		lock;
} unused_peers = {
	.list			= LIST_HEAD_INIT(unused_peers.list),
	.lock			= __SPIN_LOCK_UNLOCKED(unused_peers.lock),
};

/* Periodic garbage-collection timer; rearms itself in peer_check_expire(). */
static void peer_check_expire(unsigned long dummy);
static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);


/* Called from ip_output.c:ip_init */
void __init inet_initpeers(void)
{
	struct sysinfo si;

	/* Use the straight interface to information about memory. */
	si_meminfo(&si);
	/* The values below were suggested by Alexey Kuznetsov
	 * <kuznet@ms2.inr.ac.ru>.  I don't have any opinion about the values
	 * myself.  --SAW
	 */
	/* Scale the pool threshold down on low-memory machines. */
	if (si.totalram <= (32768*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
	if (si.totalram <= (16384*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 1; /* about 512KB */
	if (si.totalram <= (8192*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 2; /* about 128KB */

	peer_cachep = kmem_cache_create("inet_peer_cache",
			sizeof(struct inet_peer),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
			NULL);

	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	peer_periodic_timer.expires = jiffies
		+ net_random() % inet_peer_gc_maxtime
		+ inet_peer_gc_maxtime;
	add_timer(&peer_periodic_timer);
}
/* Called with or without local BH being disabled. */
static void unlink_from_unused(struct inet_peer *p)
{
	/* The lockless list_empty() check avoids taking the lock when the
	 * node was never queued (or has already been removed); a racing
	 * false positive only costs an extra lock round-trip. */
	if (!list_empty(&p->unused)) {
		spin_lock_bh(&unused_peers.lock);
		list_del_init(&p->unused);
		spin_unlock_bh(&unused_peers.lock);
	}
}

/*
 * Called with local BH disabled and the pool lock held.
 * Implemented as a macro (not a function) because it records the descent
 * path into the caller's "stackptr" variable, for use by link_to_pool()
 * and peer_avl_rebalance() afterwards.
 * Evaluates to the matching node, or peer_avl_empty if none.
 */
#define lookup(_daddr, _stack) 					\
({								\
	struct inet_peer *u, **v;				\
								\
	stackptr = _stack;					\
	*stackptr++ = &peers.root;				\
	for (u = peers.root; u != peer_avl_empty; ) {		\
		if (_daddr == u->v4daddr)			\
			break;					\
		if ((__force __u32)_daddr < (__force __u32)u->v4daddr)	\
			v = &u->avl_left;			\
		else						\
			v = &u->avl_right;			\
		*stackptr++ = v;				\
		u = *v;						\
	}							\
	u;							\
})

/*
 * Called with rcu_read_lock_bh()
 * Because we hold no lock against a writer, its quite possible we fall
 * in an endless loop.
 * But every pointer we follow is guaranteed to be valid thanks to RCU.
 * We exit from this function if number of links exceeds PEER_MAXDEPTH
 * Returns the node with a reference taken, or NULL on failure.
 */
static struct inet_peer *lookup_rcu_bh(__be32 daddr)
{
	struct inet_peer *u = rcu_dereference_bh(peers.root);
	int count = 0;

	while (u != peer_avl_empty) {
		if (daddr == u->v4daddr) {
			/* Before taking a reference, check if this entry was
			 * deleted, unlink_from_pool() sets refcnt=-1 to make
			 * distinction between an unused entry (refcnt=0) and
			 * a freed one.
			 */
			if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
				u = NULL;
			return u;
		}
		if ((__force __u32)daddr < (__force __u32)u->v4daddr)
			u = rcu_dereference_bh(u->avl_left);
		else
			u = rcu_dereference_bh(u->avl_right);
		if (unlikely(++count == PEER_MAXDEPTH))
			break;
	}
	return NULL;
}
/* Called with local BH disabled and the pool lock held.
 * Descends once left from "start", then right as far as possible while
 * recording the path; evaluates to the rightmost node of start's left
 * subtree (the in-order predecessor, used as replacement on deletion). */
#define lookup_rightempty(start)				\
({								\
	struct inet_peer *u, **v;				\
	*stackptr++ = &start->avl_left;				\
	v = &start->avl_left;					\
	for (u = *v; u->avl_right != peer_avl_empty; ) {	\
		v = &u->avl_right;				\
		*stackptr++ = v;				\
		u = *v;						\
	}							\
	u;							\
})

/* Called with local BH disabled and the pool lock held.
 * Variable names are the proof of operation correctness.
 * Look into mm/map_avl.c for more detail description of the ideas.
 * Walks back up the descent path recorded in stack[], refreshing heights
 * and performing single/double rotations where a subtree got out of
 * balance.  The height annotations on each line carry the invariant. */
static void peer_avl_rebalance(struct inet_peer **stack[],
		struct inet_peer ***stackend)
{
	struct inet_peer **nodep, *node, *l, *r;
	int lh, rh;

	while (stackend > stack) {
		nodep = *--stackend;
		node = *nodep;
		l = node->avl_left;
		r = node->avl_right;
		lh = node_height(l);
		rh = node_height(r);
		if (lh > rh + 1) { /* l: RH+2 */
			struct inet_peer *ll, *lr, *lrl, *lrr;
			int lrh;
			ll = l->avl_left;
			lr = l->avl_right;
			lrh = node_height(lr);
			if (lrh <= node_height(ll)) {	/* ll: RH+1 */
				/* Single right rotation. */
				node->avl_left = lr;	/* lr: RH or RH+1 */
				node->avl_right = r;	/* r: RH */
				node->avl_height = lrh + 1; /* RH+1 or RH+2 */
				l->avl_left = ll;	/* ll: RH+1 */
				l->avl_right = node;	/* node: RH+1 or RH+2 */
				l->avl_height = node->avl_height + 1;
				*nodep = l;
			} else { /* ll: RH, lr: RH+1 */
				/* Double (left-right) rotation. */
				lrl = lr->avl_left;	/* lrl: RH or RH-1 */
				lrr = lr->avl_right;	/* lrr: RH or RH-1 */
				node->avl_left = lrr;	/* lrr: RH or RH-1 */
				node->avl_right = r;	/* r: RH */
				node->avl_height = rh + 1; /* node: RH+1 */
				l->avl_left = ll;	/* ll: RH */
				l->avl_right = lrl;	/* lrl: RH or RH-1 */
				l->avl_height = rh + 1;	/* l: RH+1 */
				lr->avl_left = l;	/* l: RH+1 */
				lr->avl_right = node;	/* node: RH+1 */
				lr->avl_height = rh + 2;
				*nodep = lr;
			}
		} else if (rh > lh + 1) { /* r: LH+2 */
			struct inet_peer *rr, *rl, *rlr, *rll;
			int rlh;
			rr = r->avl_right;
			rl = r->avl_left;
			rlh = node_height(rl);
			if (rlh <= node_height(rr)) {	/* rr: LH+1 */
				/* Single left rotation (mirror of above). */
				node->avl_right = rl;	/* rl: LH or LH+1 */
				node->avl_left = l;	/* l: LH */
				node->avl_height = rlh + 1; /* LH+1 or LH+2 */
				r->avl_right = rr;	/* rr: LH+1 */
				r->avl_left = node;	/* node: LH+1 or LH+2 */
				r->avl_height = node->avl_height + 1;
				*nodep = r;
			} else { /* rr: RH, rl: RH+1 */
				/* Double (right-left) rotation (mirror). */
				rlr = rl->avl_right;	/* rlr: LH or LH-1 */
				rll = rl->avl_left;	/* rll: LH or LH-1 */
				node->avl_right = rll;	/* rll: LH or LH-1 */
				node->avl_left = l;	/* l: LH */
				node->avl_height = lh + 1; /* node: LH+1 */
				r->avl_right = rr;	/* rr: LH */
				r->avl_left = rlr;	/* rlr: LH or LH-1 */
				r->avl_height = lh + 1;	/* r: LH+1 */
				rl->avl_right = r;	/* r: LH+1 */
				rl->avl_left = node;	/* node: LH+1 */
				rl->avl_height = lh + 2;
				*nodep = rl;
			}
		} else { /* Balanced: just refresh the cached height. */
			node->avl_height = (lh > rh ? lh : rh) + 1;
		}
	}
}

/* Called with local BH disabled and the pool lock held.
 * Inserts n at the position recorded by a preceding lookup() miss. */
#define link_to_pool(n)						\
do {								\
	n->avl_height = 1;					\
	n->avl_left = peer_avl_empty;				\
	n->avl_right = peer_avl_empty;				\
	smp_wmb(); /* lockless readers can catch us now */	\
	**--stackptr = n;					\
	peer_avl_rebalance(stack, stackptr);			\
} while (0)

/* RCU callback: frees a node once no BH reader can still hold a pointer. */
static void inetpeer_free_rcu(struct rcu_head *head)
{
	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
}

/* May be called with local BH enabled. */
static void unlink_from_pool(struct inet_peer *p)
{
	int do_free;

	do_free = 0;

	spin_lock_bh(&peers.lock);
	/* Check the reference counter.  It was artificially incremented by 1
	 * in cleanup() function to prevent sudden disappearing.  If we can
	 * atomically (because of lockless readers) take this last reference,
	 * it's safe to remove the node and free it later.
	 * We use refcnt=-1 to alert lockless readers this entry is deleted.
	 */
	if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
		struct inet_peer **stack[PEER_MAXDEPTH];
		struct inet_peer ***stackptr, ***delp;
		if (lookup(p->v4daddr, stack) != p)
			BUG();
		delp = stackptr - 1; /* *delp[0] == p */
		if (p->avl_left == peer_avl_empty) {
			/* No left subtree: splice the right child into
			 * p's place directly. */
			*delp[0] = p->avl_right;
			--stackptr;
		} else {
			/* look for a node to insert instead of p */
			struct inet_peer *t;
			t = lookup_rightempty(p);
			BUG_ON(*stackptr[-1] != t);
			**--stackptr = t->avl_left;
			/* t is removed, t->v4daddr > x->v4daddr for any
			 * x in p->avl_left subtree.
			 * Put t in the old place of p. */
			*delp[0] = t;
			t->avl_left = p->avl_left;
			t->avl_right = p->avl_right;
			t->avl_height = p->avl_height;
			BUG_ON(delp[1] != &p->avl_left);
			delp[1] = &t->avl_left; /* was &p->avl_left */
		}
		peer_avl_rebalance(stack, stackptr);
		peers.total--;
		do_free = 1;
	}
	spin_unlock_bh(&peers.lock);

	if (do_free)
		call_rcu_bh(&p->rcu, inetpeer_free_rcu);
	else
		/* The node is used again.  Decrease the reference counter
		 * back.  The loop "cleanup -> unlink_from_unused
		 *   -> unlink_from_pool -> putpeer -> link_to_unused
		 *   -> cleanup (for the same node)"
		 * doesn't really exist because the entry will have a
		 * recent deletion time and will not be cleaned again soon.
		 */
		inet_putpeer(p);
}

/* May be called with local BH enabled.
 * Tries to reclaim one node from the head of the unused list; returns 0
 * on success, -1 when nothing could be reclaimed (list empty, or head
 * entry is fresher than @ttl jiffies). */
static int cleanup_once(unsigned long ttl)
{
	struct inet_peer *p = NULL;

	/* Remove the first entry from the list of unused nodes. */
	spin_lock_bh(&unused_peers.lock);
	if (!list_empty(&unused_peers.list)) {
		__u32 delta;

		p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
		/* Unsigned arithmetic copes with jiffies wraparound. */
		delta = (__u32)jiffies - p->dtime;

		if (delta < ttl) {
			/* Do not prune fresh entries. */
			spin_unlock_bh(&unused_peers.lock);
			return -1;
		}

		list_del_init(&p->unused);

		/* Grab an extra reference to prevent node disappearing
		 * before unlink_from_pool() call. */
		atomic_inc(&p->refcnt);
	}
	spin_unlock_bh(&unused_peers.lock);

	if (p == NULL)
		/* It means that the total number of USED entries has
		 * grown over inet_peer_threshold.  It shouldn't really
		 * happen because of entry limits in route cache. */
		return -1;

	unlink_from_pool(p);
	return 0;
}

/* Called with or without local BH being disabled.
 * Finds (or, if @create, allocates and inserts) the peer entry for
 * @daddr and returns it with a reference held; NULL on miss/OOM. */
struct inet_peer *inet_getpeer(__be32 daddr, int create)
{
	struct inet_peer *p;
	struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;

	/* Look up for the address quickly, lockless.
	 * Because of a concurrent writer, we might not find an existing entry.
	 */
	rcu_read_lock_bh();
	p = lookup_rcu_bh(daddr);
	rcu_read_unlock_bh();

	if (p) {
		/* The existing node has been found.
		 * Remove the entry from unused list if it was there.
		 */
		unlink_from_unused(p);
		return p;
	}

	/* retry an exact lookup, taking the lock before.
	 * At least, nodes should be hot in our cache.
	 */
	spin_lock_bh(&peers.lock);
	p = lookup(daddr, stack);
	if (p != peer_avl_empty) {
		atomic_inc(&p->refcnt);
		spin_unlock_bh(&peers.lock);
		/* Remove the entry from unused list if it was there. */
		unlink_from_unused(p);
		return p;
	}
	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
	if (p) {
		/* Initialise before linking: lockless readers may see the
		 * node as soon as link_to_pool() publishes it. */
		p->v4daddr = daddr;
		atomic_set(&p->refcnt, 1);
		atomic_set(&p->rid, 0);
		atomic_set(&p->ip_id_count, secure_ip_id(daddr));
		p->tcp_ts_stamp = 0;
		INIT_LIST_HEAD(&p->unused);


		/* Link the node. */
		link_to_pool(p);
		peers.total++;
	}
	spin_unlock_bh(&peers.lock);

	if (peers.total >= inet_peer_threshold)
		/* Remove one less-recently-used entry. */
		cleanup_once(0);

	return p;
}

/* Called with local BH disabled. */
static void peer_check_expire(unsigned long dummy)
{
	unsigned long now = jiffies;
	int ttl;

	/* Under pressure use the minimum TTL; otherwise interpolate
	 * linearly between maxttl and minttl by pool occupancy. */
	if (peers.total >= inet_peer_threshold)
		ttl = inet_peer_minttl;
	else
		ttl = inet_peer_maxttl
				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
					peers.total / inet_peer_threshold * HZ;
	/* Reclaim expired entries, but give up after one jiffy of work. */
	while (!cleanup_once(ttl)) {
		if (jiffies != now)
			break;
	}

	/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
	 * interval depending on the total number of entries (more entries,
	 * less interval). */
	if (peers.total >= inet_peer_threshold)
		peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
	else
		peer_periodic_timer.expires = jiffies
			+ inet_peer_gc_maxtime
			- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
				peers.total / inet_peer_threshold * HZ;
	add_timer(&peer_periodic_timer);
}

/* Drop a reference to @p.  When the count reaches zero the node is
 * timestamped and queued on the unused list for the GC to reclaim. */
void inet_putpeer(struct inet_peer *p)
{
	local_bh_disable();

	/* atomic_dec_and_lock() takes unused_peers.lock only when the
	 * counter actually hits zero. */
	if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
		list_add_tail(&p->unused, &unused_peers.list);
		p->dtime = (__u32)jiffies;
		spin_unlock(&unused_peers.lock);
	}

	local_bh_enable();
}