| inetpeer.c (f9181f4ffc71d7b7dd1906c9a11d51d6659220ae) | inetpeer.c (aa1039e73cc2cf834e99c09d2033d5d2675357b9) |
| --- | --- |
1/* 2 * INETPEER - A storage for permanent information about peers 3 * 4 * This source is covered by the GNU GPL, the same as all kernel sources. 5 * 6 * Authors: Andrey V. Savochkin <saw@msu.ru> 7 */ 8 --- 37 unchanged lines hidden (view full) --- 46 * 47 * Node pool is organised as an AVL tree. 48 * Such an implementation has been chosen not just for fun. It's a way to 49 * prevent easy and efficient DoS attacks by creating hash collisions. A huge 50 * amount of long living nodes in a single hash slot would significantly delay 51 * lookups performed with disabled BHs. 52 * 53 * Serialisation issues. | 1/* 2 * INETPEER - A storage for permanent information about peers 3 * 4 * This source is covered by the GNU GPL, the same as all kernel sources. 5 * 6 * Authors: Andrey V. Savochkin <saw@msu.ru> 7 */ 8 --- 37 unchanged lines hidden (view full) --- 46 * 47 * Node pool is organised as an AVL tree. 48 * Such an implementation has been chosen not just for fun. It's a way to 49 * prevent easy and efficient DoS attacks by creating hash collisions. A huge 50 * amount of long living nodes in a single hash slot would significantly delay 51 * lookups performed with disabled BHs. 52 * 53 * Serialisation issues. |
54 * 1. Nodes may appear in the tree only with the pool write lock held. 55 * 2. Nodes may disappear from the tree only with the pool write lock held | 54 * 1. Nodes may appear in the tree only with the pool lock held. 55 * 2. Nodes may disappear from the tree only with the pool lock held |
56 * AND reference count being 0. 57 * 3. Nodes appears and disappears from unused node list only under 58 * "inet_peer_unused_lock". 59 * 4. Global variable peer_total is modified under the pool lock. 60 * 5. struct inet_peer fields modification: 61 * avl_left, avl_right, avl_parent, avl_height: pool lock 62 * unused: unused node list lock 63 * refcnt: atomically against modifications on other CPU; --- 11 unchanged lines hidden (view full) --- 75static const struct inet_peer peer_fake_node = { 76 .avl_left = peer_avl_empty, 77 .avl_right = peer_avl_empty, 78 .avl_height = 0 79}; 80 81static struct { 82 struct inet_peer *root; | 56 * AND reference count being 0. 57 * 3. Nodes appears and disappears from unused node list only under 58 * "inet_peer_unused_lock". 59 * 4. Global variable peer_total is modified under the pool lock. 60 * 5. struct inet_peer fields modification: 61 * avl_left, avl_right, avl_parent, avl_height: pool lock 62 * unused: unused node list lock 63 * refcnt: atomically against modifications on other CPU; --- 11 unchanged lines hidden (view full) --- 75static const struct inet_peer peer_fake_node = { 76 .avl_left = peer_avl_empty, 77 .avl_right = peer_avl_empty, 78 .avl_height = 0 79}; 80 81static struct { 82 struct inet_peer *root; |
83 rwlock_t lock; | 83 spinlock_t lock; |
84 int total; 85} peers = { 86 .root = peer_avl_empty, | 84 int total; 85} peers = { 86 .root = peer_avl_empty, |
87 .lock = __RW_LOCK_UNLOCKED(peers.lock), | 87 .lock = __SPIN_LOCK_UNLOCKED(peers.lock), |
88 .total = 0, 89}; 90#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 91 92/* Exported for sysctl_net_ipv4. */ 93int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more 94 * aggressively at this stage */ 95int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ --- 28 unchanged lines hidden (view full) --- 124 inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */ 125 if (si.totalram <= (16384*1024)/PAGE_SIZE) 126 inet_peer_threshold >>= 1; /* about 512KB */ 127 if (si.totalram <= (8192*1024)/PAGE_SIZE) 128 inet_peer_threshold >>= 2; /* about 128KB */ 129 130 peer_cachep = kmem_cache_create("inet_peer_cache", 131 sizeof(struct inet_peer), | 88 .total = 0, 89}; 90#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 91 92/* Exported for sysctl_net_ipv4. */ 93int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more 94 * aggressively at this stage */ 95int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ --- 28 unchanged lines hidden (view full) --- 124 inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */ 125 if (si.totalram <= (16384*1024)/PAGE_SIZE) 126 inet_peer_threshold >>= 1; /* about 512KB */ 127 if (si.totalram <= (8192*1024)/PAGE_SIZE) 128 inet_peer_threshold >>= 2; /* about 128KB */ 129 130 peer_cachep = kmem_cache_create("inet_peer_cache", 131 sizeof(struct inet_peer), |
132 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, | 132 0, SLAB_PANIC, |
133 NULL); 134 135 /* All the timers, started at system startup tend 136 to synchronize. Perturb it a bit. 137 */ 138 peer_periodic_timer.expires = jiffies 139 + net_random() % inet_peer_gc_maxtime 140 + inet_peer_gc_maxtime; --- 7 unchanged lines hidden (view full) --- 148 spin_lock_bh(&unused_peers.lock); 149 list_del_init(&p->unused); 150 spin_unlock_bh(&unused_peers.lock); 151 } 152} 153 154/* 155 * Called with local BH disabled and the pool lock held. | 133 NULL); 134 135 /* All the timers, started at system startup tend 136 to synchronize. Perturb it a bit. 137 */ 138 peer_periodic_timer.expires = jiffies 139 + net_random() % inet_peer_gc_maxtime 140 + inet_peer_gc_maxtime; --- 7 unchanged lines hidden (view full) --- 148 spin_lock_bh(&unused_peers.lock); 149 list_del_init(&p->unused); 150 spin_unlock_bh(&unused_peers.lock); 151 } 152} 153 154/* 155 * Called with local BH disabled and the pool lock held. |
156 * _stack is known to be NULL or not at compile time, 157 * so compiler will optimize the if (_stack) tests. | |
158 */ 159#define lookup(_daddr, _stack) \ 160({ \ 161 struct inet_peer *u, **v; \ | 156 */ 157#define lookup(_daddr, _stack) \ 158({ \ 159 struct inet_peer *u, **v; \ |
162 if (_stack != NULL) { \ 163 stackptr = _stack; \ 164 *stackptr++ = &peers.root; \ 165 } \ | 160 \ 161 stackptr = _stack; \ 162 *stackptr++ = &peers.root; \ |
166 for (u = peers.root; u != peer_avl_empty; ) { \ 167 if (_daddr == u->v4daddr) \ 168 break; \ 169 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ 170 v = &u->avl_left; \ 171 else \ 172 v = &u->avl_right; \ | 163 for (u = peers.root; u != peer_avl_empty; ) { \ 164 if (_daddr == u->v4daddr) \ 165 break; \ 166 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ 167 v = &u->avl_left; \ 168 else \ 169 v = &u->avl_right; \ |
173 if (_stack != NULL) \ 174 *stackptr++ = v; \ | 170 *stackptr++ = v; \ |
175 u = *v; \ 176 } \ 177 u; \ 178}) 179 | 171 u = *v; \ 172 } \ 173 u; \ 174}) 175 |
180/* Called with local BH disabled and the pool write lock held. */ | 176/* 177 * Called with rcu_read_lock_bh() 178 * Because we hold no lock against a writer, its quite possible we fall 179 * in an endless loop. 180 * But every pointer we follow is guaranteed to be valid thanks to RCU. 181 * We exit from this function if number of links exceeds PEER_MAXDEPTH 182 */ 183static struct inet_peer *lookup_rcu_bh(__be32 daddr) 184{ 185 struct inet_peer *u = rcu_dereference_bh(peers.root); 186 int count = 0; 187 188 while (u != peer_avl_empty) { 189 if (daddr == u->v4daddr) { 190 if (unlikely(!atomic_inc_not_zero(&u->refcnt))) 191 u = NULL; 192 return u; 193 } 194 if ((__force __u32)daddr < (__force __u32)u->v4daddr) 195 u = rcu_dereference_bh(u->avl_left); 196 else 197 u = rcu_dereference_bh(u->avl_right); 198 if (unlikely(++count == PEER_MAXDEPTH)) 199 break; 200 } 201 return NULL; 202} 203 204/* Called with local BH disabled and the pool lock held. */ |
181#define lookup_rightempty(start) \ 182({ \ 183 struct inet_peer *u, **v; \ 184 *stackptr++ = &start->avl_left; \ 185 v = &start->avl_left; \ 186 for (u = *v; u->avl_right != peer_avl_empty; ) { \ 187 v = &u->avl_right; \ 188 *stackptr++ = v; \ 189 u = *v; \ 190 } \ 191 u; \ 192}) 193 | 205#define lookup_rightempty(start) \ 206({ \ 207 struct inet_peer *u, **v; \ 208 *stackptr++ = &start->avl_left; \ 209 v = &start->avl_left; \ 210 for (u = *v; u->avl_right != peer_avl_empty; ) { \ 211 v = &u->avl_right; \ 212 *stackptr++ = v; \ 213 u = *v; \ 214 } \ 215 u; \ 216}) 217 |
194/* Called with local BH disabled and the pool write lock held. | 218/* Called with local BH disabled and the pool lock held. |
195 * Variable names are the proof of operation correctness. | 219 * Variable names are the proof of operation correctness. |
196 * Look into mm/map_avl.c for more detail description of the ideas. */ | 220 * Look into mm/map_avl.c for more detail description of the ideas. 221 */ |
197static void peer_avl_rebalance(struct inet_peer **stack[], 198 struct inet_peer ***stackend) 199{ 200 struct inet_peer **nodep, *node, *l, *r; 201 int lh, rh; 202 203 while (stackend > stack) { 204 nodep = *--stackend; --- 59 unchanged lines hidden (view full) --- 264 *nodep = rl; 265 } 266 } else { 267 node->avl_height = (lh > rh ? lh : rh) + 1; 268 } 269 } 270} 271 | 222static void peer_avl_rebalance(struct inet_peer **stack[], 223 struct inet_peer ***stackend) 224{ 225 struct inet_peer **nodep, *node, *l, *r; 226 int lh, rh; 227 228 while (stackend > stack) { 229 nodep = *--stackend; --- 59 unchanged lines hidden (view full) --- 289 *nodep = rl; 290 } 291 } else { 292 node->avl_height = (lh > rh ? lh : rh) + 1; 293 } 294 } 295} 296 |
272/* Called with local BH disabled and the pool write lock held. */ | 297/* Called with local BH disabled and the pool lock held. */ |
273#define link_to_pool(n) \ 274do { \ 275 n->avl_height = 1; \ 276 n->avl_left = peer_avl_empty; \ 277 n->avl_right = peer_avl_empty; \ | 298#define link_to_pool(n) \ 299do { \ 300 n->avl_height = 1; \ 301 n->avl_left = peer_avl_empty; \ 302 n->avl_right = peer_avl_empty; \ |
303 smp_wmb(); /* lockless readers can catch us now */ \ |
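
The `smp_wmb()` added to `link_to_pool()` in the right-hand version is the write side of the lockless-reader contract: every field of the new node must be initialised before the node becomes reachable from `peers.root` via the `**--stackptr = n;` statement that follows below. A minimal sketch of that publish pattern, assuming the usual kernel build environment; `struct my_node`, `my_publish()` and its parameters are hypothetical names used only for illustration, not part of the patch:

```c
/* Illustrative only -- not the kernel code itself. */
struct my_node {
	__be32		key;
	atomic_t	refcnt;
	struct my_node	*avl_left, *avl_right;
};

static void my_publish(struct my_node *n, __be32 key, struct my_node **slot)
{
	n->key       = key;		/* initialise every field ...          */
	n->avl_left  = NULL;
	n->avl_right = NULL;
	atomic_set(&n->refcnt, 1);

	smp_wmb();			/* order the stores above before the store below */
	*slot = n;			/* node is now reachable by lockless readers     */
}
```

Readers pair this barrier with `rcu_dereference_bh()` when following the pointers, as the new `lookup_rcu_bh()` does; `rcu_assign_pointer()` is the more common way to bundle the barrier with the store, while the patch uses an explicit `smp_wmb()` covering the whole just-initialised node.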
|
278 **--stackptr = n; \ 279 peer_avl_rebalance(stack, stackptr); \ 280} while (0) 281 | 304 **--stackptr = n; \ 305 peer_avl_rebalance(stack, stackptr); \ 306} while (0) 307 |
308static void inetpeer_free_rcu(struct rcu_head *head) 309{ 310 kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); 311} 312 |
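
`inetpeer_free_rcu()` together with the `atomic_cmpxchg()` further down in `unlink_from_pool()` forms the delete side of the scheme: the last reference is claimed atomically, the node is unlinked under the pool lock, and the memory is handed to `call_rcu_bh()` so it outlives any reader still walking the tree inside `rcu_read_lock_bh()`. A hedged sketch of that pairing; `struct my_peer`, `my_cachep`, `my_pool_lock` and the function names are made-up, only the pattern mirrors the patch:

```c
/* Sketch only; all my_* identifiers are hypothetical. */
struct my_peer {
	__be32		key;
	atomic_t	refcnt;
	struct rcu_head	rcu;
};

static struct kmem_cache *my_cachep;		/* assumed created at init time */
static DEFINE_SPINLOCK(my_pool_lock);

static void my_peer_free_rcu(struct rcu_head *head)
{
	kmem_cache_free(my_cachep, container_of(head, struct my_peer, rcu));
}

static void my_peer_try_delete(struct my_peer *p)
{
	spin_lock_bh(&my_pool_lock);
	/* The 1 -> 0 transition succeeds only if ours is the last reference;
	 * a lockless reader using atomic_inc_not_zero() can then no longer
	 * take p. */
	if (atomic_cmpxchg(&p->refcnt, 1, 0) == 1) {
		/* ... unlink p from the search tree (still under the lock) ... */
		call_rcu_bh(&p->rcu, my_peer_free_rcu);
	}
	/* otherwise someone else still uses p; leave it in the tree */
	spin_unlock_bh(&my_pool_lock);
}
```

Freeing through `call_rcu_bh()` rather than calling `kmem_cache_free()` directly is what allows `lookup_rcu_bh()` to chase child pointers without holding any lock.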
|
282/* May be called with local BH enabled. */ 283static void unlink_from_pool(struct inet_peer *p) 284{ 285 int do_free; 286 287 do_free = 0; 288 | 313/* May be called with local BH enabled. */ 314static void unlink_from_pool(struct inet_peer *p) 315{ 316 int do_free; 317 318 do_free = 0; 319 |
289 write_lock_bh(&peers.lock); | 320 spin_lock_bh(&peers.lock); |
290 /* Check the reference counter. It was artificially incremented by 1 | 321 /* Check the reference counter. It was artificially incremented by 1 |
291 * in cleanup() function to prevent sudden disappearing. If the 292 * reference count is still 1 then the node is referenced only as `p' 293 * here and from the pool. So under the exclusive pool lock it's safe 294 * to remove the node and free it later. */ 295 if (atomic_read(&p->refcnt) == 1) { | 322 * in cleanup() function to prevent sudden disappearing. If we can 323 * atomically (because of lockless readers) take this last reference, 324 * it's safe to remove the node and free it later. 325 */ 326 if (atomic_cmpxchg(&p->refcnt, 1, 0) == 1) { |
296 struct inet_peer **stack[PEER_MAXDEPTH]; 297 struct inet_peer ***stackptr, ***delp; 298 if (lookup(p->v4daddr, stack) != p) 299 BUG(); 300 delp = stackptr - 1; /* *delp[0] == p */ 301 if (p->avl_left == peer_avl_empty) { 302 *delp[0] = p->avl_right; 303 --stackptr; --- 12 unchanged lines hidden (view full) --- 316 t->avl_height = p->avl_height; 317 BUG_ON(delp[1] != &p->avl_left); 318 delp[1] = &t->avl_left; /* was &p->avl_left */ 319 } 320 peer_avl_rebalance(stack, stackptr); 321 peers.total--; 322 do_free = 1; 323 } | 327 struct inet_peer **stack[PEER_MAXDEPTH]; 328 struct inet_peer ***stackptr, ***delp; 329 if (lookup(p->v4daddr, stack) != p) 330 BUG(); 331 delp = stackptr - 1; /* *delp[0] == p */ 332 if (p->avl_left == peer_avl_empty) { 333 *delp[0] = p->avl_right; 334 --stackptr; --- 12 unchanged lines hidden (view full) --- 347 t->avl_height = p->avl_height; 348 BUG_ON(delp[1] != &p->avl_left); 349 delp[1] = &t->avl_left; /* was &p->avl_left */ 350 } 351 peer_avl_rebalance(stack, stackptr); 352 peers.total--; 353 do_free = 1; 354 } |
324 write_unlock_bh(&peers.lock); | 355 spin_unlock_bh(&peers.lock); |
325 326 if (do_free) | 356 357 if (do_free) |
327 kmem_cache_free(peer_cachep, p); | 358 call_rcu_bh(&p->rcu, inetpeer_free_rcu); |
328 else 329 /* The node is used again. Decrease the reference counter 330 * back. The loop "cleanup -> unlink_from_unused 331 * -> unlink_from_pool -> putpeer -> link_to_unused 332 * -> cleanup (for the same node)" 333 * doesn't really exist because the entry will have a | 359 else 360 /* The node is used again. Decrease the reference counter 361 * back. The loop "cleanup -> unlink_from_unused 362 * -> unlink_from_pool -> putpeer -> link_to_unused 363 * -> cleanup (for the same node)" 364 * doesn't really exist because the entry will have a |
334 * recent deletion time and will not be cleaned again soon. */ | 365 * recent deletion time and will not be cleaned again soon. 366 */ |
335 inet_putpeer(p); 336} 337 338/* May be called with local BH enabled. */ 339static int cleanup_once(unsigned long ttl) 340{ 341 struct inet_peer *p = NULL; 342 --- 27 unchanged lines hidden (view full) --- 370 371 unlink_from_pool(p); 372 return 0; 373} 374 375/* Called with or without local BH being disabled. */ 376struct inet_peer *inet_getpeer(__be32 daddr, int create) 377{ | 367 inet_putpeer(p); 368} 369 370/* May be called with local BH enabled. */ 371static int cleanup_once(unsigned long ttl) 372{ 373 struct inet_peer *p = NULL; 374 --- 27 unchanged lines hidden (view full) --- 402 403 unlink_from_pool(p); 404 return 0; 405} 406 407/* Called with or without local BH being disabled. */ 408struct inet_peer *inet_getpeer(__be32 daddr, int create) 409{ |
378 struct inet_peer *p, *n; | 410 struct inet_peer *p; |
379 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; 380 | 411 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; 412 |
381 /* Look up for the address quickly. */ 382 read_lock_bh(&peers.lock); 383 p = lookup(daddr, NULL); 384 if (p != peer_avl_empty) 385 atomic_inc(&p->refcnt); 386 read_unlock_bh(&peers.lock); | 413 /* Look up for the address quickly, lockless. 414 * Because of a concurrent writer, we might not find an existing entry. 415 */ 416 rcu_read_lock_bh(); 417 p = lookup_rcu_bh(daddr); 418 rcu_read_unlock_bh(); |
387 | 419 |
420 if (p) { 421 /* The existing node has been found. 422 * Remove the entry from unused list if it was there. 423 */ 424 unlink_from_unused(p); 425 return p; 426 } 427 428 /* retry an exact lookup, taking the lock before. 429 * At least, nodes should be hot in our cache. 430 */ 431 spin_lock_bh(&peers.lock); 432 p = lookup(daddr, stack); |
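
The hunk above is the heart of the conversion in `inet_getpeer()`: a lockless BH-RCU walk first, and only on a miss an exact lookup under `peers.lock`, because the lockless walk may legitimately miss an entry while a concurrent writer inserts or rebalances. A reduced sketch of that two-phase lookup, reusing the hypothetical `struct my_peer`, `my_cachep` and `my_pool_lock` from the previous sketch; `my_lookup_rcu()`, `my_lookup_locked()` and `my_link_locked()` are likewise hypothetical stand-ins for `lookup_rcu_bh()`, `lookup()` and `link_to_pool()`:

```c
/* my_lookup_rcu() is assumed to behave like lookup_rcu_bh(): it returns
 * NULL or a node on which it already took a reference via
 * atomic_inc_not_zero(). */
static struct my_peer *my_lookup_rcu(__be32 key);
static struct my_peer *my_lookup_locked(__be32 key);
static void my_link_locked(struct my_peer *p, __be32 key);

static struct my_peer *my_getpeer(__be32 key, int create)
{
	struct my_peer *p;

	rcu_read_lock_bh();
	p = my_lookup_rcu(key);		/* may miss during concurrent updates */
	rcu_read_unlock_bh();
	if (p)
		return p;		/* reference already taken in the walk */

	spin_lock_bh(&my_pool_lock);
	p = my_lookup_locked(key);	/* authoritative answer under the lock */
	if (p != NULL) {
		atomic_inc(&p->refcnt);	/* safe: deletion also takes the lock */
	} else if (create) {
		p = kmem_cache_alloc(my_cachep, GFP_ATOMIC);
		if (p) {
			atomic_set(&p->refcnt, 1);
			/* initialise the rest of *p, then publish it */
			my_link_locked(p, key);
		}
	}
	spin_unlock_bh(&my_pool_lock);
	return p;
}
```

Note the shift from the old version, which preallocated the node before taking the lock and needed an `out_free:` path when another CPU had inserted the same address in the meantime; with the spinlock held across the whole miss path, the new code allocates with `GFP_ATOMIC` inside the critical section and drops that path.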
|
388 if (p != peer_avl_empty) { | 433 if (p != peer_avl_empty) { |
389 /* The existing node has been found. */ | 434 atomic_inc(&p->refcnt); 435 spin_unlock_bh(&peers.lock); |
390 /* Remove the entry from unused list if it was there. */ 391 unlink_from_unused(p); 392 return p; 393 } | 436 /* Remove the entry from unused list if it was there. */ 437 unlink_from_unused(p); 438 return p; 439 } |
440 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; 441 if (p) { 442 p->v4daddr = daddr; 443 atomic_set(&p->refcnt, 1); 444 atomic_set(&p->rid, 0); 445 atomic_set(&p->ip_id_count, secure_ip_id(daddr)); 446 p->tcp_ts_stamp = 0; 447 INIT_LIST_HEAD(&p->unused); |
|
394 | 448 |
395 if (!create) 396 return NULL; | |
397 | 449 |
398 /* Allocate the space outside the locked region. */ 399 n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); 400 if (n == NULL) 401 return NULL; 402 n->v4daddr = daddr; 403 atomic_set(&n->refcnt, 1); 404 atomic_set(&n->rid, 0); 405 atomic_set(&n->ip_id_count, secure_ip_id(daddr)); 406 n->tcp_ts_stamp = 0; | 450 /* Link the node. */ 451 link_to_pool(p); 452 peers.total++; 453 } 454 spin_unlock_bh(&peers.lock); |
407 | 455 |
408 write_lock_bh(&peers.lock); 409 /* Check if an entry has suddenly appeared. */ 410 p = lookup(daddr, stack); 411 if (p != peer_avl_empty) 412 goto out_free; 413 414 /* Link the node. */ 415 link_to_pool(n); 416 INIT_LIST_HEAD(&n->unused); 417 peers.total++; 418 write_unlock_bh(&peers.lock); 419 | |
420 if (peers.total >= inet_peer_threshold) 421 /* Remove one less-recently-used entry. */ 422 cleanup_once(0); 423 | 456 if (peers.total >= inet_peer_threshold) 457 /* Remove one less-recently-used entry. */ 458 cleanup_once(0); 459 |
424 return n; 425 426out_free: 427 /* The appropriate node is already in the pool. */ 428 atomic_inc(&p->refcnt); 429 write_unlock_bh(&peers.lock); 430 /* Remove the entry from unused list if it was there. */ 431 unlink_from_unused(p); 432 /* Free preallocated the preallocated node. */ 433 kmem_cache_free(peer_cachep, n); | |
434 return p; 435} 436 437/* Called with local BH disabled. */ 438static void peer_check_expire(unsigned long dummy) 439{ 440 unsigned long now = jiffies; 441 int ttl; --- 37 unchanged lines hidden --- | 460 return p; 461} 462 463/* Called with local BH disabled. */ 464static void peer_check_expire(unsigned long dummy) 465{ 466 unsigned long now = jiffies; 467 int ttl; --- 37 unchanged lines hidden --- |
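
For context, the external interface is unchanged by the conversion: callers still obtain a referenced `struct inet_peer` from `inet_getpeer()` and drop it with `inet_putpeer()`; whether the reference came from the lockless `atomic_inc_not_zero()` path or from the locked path is invisible to them. A purely illustrative caller-side sketch (the function name and surrounding logic are hypothetical, not part of this file or patch):

```c
#include <net/inetpeer.h>

/* Hypothetical caller, for illustration only. */
static void example_touch_peer(__be32 daddr)
{
	/* create == 1: allocate an entry if none exists yet */
	struct inet_peer *peer = inet_getpeer(daddr, 1);

	if (!peer)
		return;			/* GFP_ATOMIC allocation failed */

	/* e.g. consult peer->tcp_ts_stamp or peer->ip_id_count here */

	inet_putpeer(peer);		/* release the reference taken by inet_getpeer() */
}
```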