inetpeer.c: f9181f4ffc71d7b7dd1906c9a11d51d6659220ae (old, rwlock-based pool) vs aa1039e73cc2cf834e99c09d2033d5d2675357b9 (new, RCU conversion)
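In the newer version the pool's reader-writer lock becomes a plain spinlock taken only by writers; lookups gain a lockless lookup_rcu_bh() path run under rcu_read_lock_bh(); new nodes are published behind an smp_wmb() barrier; the last reference to a dying node is claimed with atomic_cmpxchg() and the node is freed through call_rcu_bh() instead of kmem_cache_free(); and inet_getpeer() is restructured to try the lockless lookup first and to allocate new entries while holding the lock instead of preallocating them.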
1/*
2 * INETPEER - A storage for permanent information about peers
3 *
4 * This source is covered by the GNU GPL, the same as all kernel sources.
5 *
6 * Authors: Andrey V. Savochkin <saw@msu.ru>
7 */
8

--- 37 unchanged lines hidden (view full) ---

46 *
47 * Node pool is organised as an AVL tree.
48 * Such an implementation has been chosen not just for fun. It's a way to
49 * prevent easy and efficient DoS attacks by creating hash collisions. A huge
50 * amount of long living nodes in a single hash slot would significantly delay
51 * lookups performed with disabled BHs.
52 *
53 * Serialisation issues.
54 * 1. Nodes may appear in the tree only with the pool write lock held.
55 * 2. Nodes may disappear from the tree only with the pool write lock held
54 * 1. Nodes may appear in the tree only with the pool lock held.
55 * 2. Nodes may disappear from the tree only with the pool lock held
56 * AND reference count being 0.
57 * 3. Nodes appears and disappears from unused node list only under
58 * "inet_peer_unused_lock".
59 * 4. Global variable peer_total is modified under the pool lock.
60 * 5. struct inet_peer fields modification:
61 * avl_left, avl_right, avl_parent, avl_height: pool lock
62 * unused: unused node list lock
63 * refcnt: atomically against modifications on other CPU;

--- 11 unchanged lines hidden (view full) ---

75static const struct inet_peer peer_fake_node = {
76 .avl_left = peer_avl_empty,
77 .avl_right = peer_avl_empty,
78 .avl_height = 0
79};
80
81static struct {
82 struct inet_peer *root;
83 rwlock_t lock;
83 spinlock_t lock;
84 int total;
85} peers = {
86 .root = peer_avl_empty,
87 .lock = __RW_LOCK_UNLOCKED(peers.lock),
87 .lock = __SPIN_LOCK_UNLOCKED(peers.lock),
88 .total = 0,
89};
90#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
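Two details worth noting at this point. First, the pool lock above changes type: the old tree is guarded by an rwlock_t (readers take it shared), while the new tree keeps only a spinlock_t for writers, because readers now walk the tree under RCU (see lookup_rcu_bh() further down). Second, PEER_MAXDEPTH sizes the on-stack array of tree positions used by insert and delete; an AVL tree of height h contains at least N(h) = N(h-1) + N(h-2) + 1 nodes, so height 41 already requires over 4 * 10^8 nodes, and a pool of the roughly 2^27 entries mentioned in the comment can never need more than 40 slots. A small standalone check of that recurrence (illustrative only, not part of inetpeer.c):

#include <stdio.h>

int main(void)
{
	/* n[h]: minimum node count of an AVL tree of height h */
	unsigned long long n[42];
	int h;

	n[0] = 0;
	n[1] = 1;
	for (h = 2; h <= 41; h++)
		n[h] = n[h - 1] + n[h - 2] + 1;

	printf("min nodes at height 40: %llu\n", n[40]);	/* ~2.7e8 */
	printf("min nodes at height 41: %llu\n", n[41]);	/* ~4.3e8 */
	printf("2^27                  : %llu\n", 1ULL << 27);
	return 0;
}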
91
92/* Exported for sysctl_net_ipv4. */
93int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
94 * aggressively at this stage */
95int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */

--- 28 unchanged lines hidden (view full) ---

124 inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
125 if (si.totalram <= (16384*1024)/PAGE_SIZE)
126 inet_peer_threshold >>= 1; /* about 512KB */
127 if (si.totalram <= (8192*1024)/PAGE_SIZE)
128 inet_peer_threshold >>= 2; /* about 128KB */
129
130 peer_cachep = kmem_cache_create("inet_peer_cache",
131 sizeof(struct inet_peer),
132 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
132 0, SLAB_PANIC,
133 NULL);
134
135 /* All the timers, started at system startup tend
136 to synchronize. Perturb it a bit.
137 */
138 peer_periodic_timer.expires = jiffies
139 + net_random() % inet_peer_gc_maxtime
140 + inet_peer_gc_maxtime;
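The random offset is simple jitter: the first garbage-collection run fires somewhere in [jiffies + inet_peer_gc_maxtime, jiffies + 2 * inet_peer_gc_maxtime), so machines booted together do not all scan their pools in the same tick. Expressed as a tiny helper (hypothetical name, illustrative only):

/* First expiry is uniform in [now + max, now + 2 * max). */
static unsigned long jittered_expiry(unsigned long now, unsigned long max,
				     unsigned long rnd)
{
	return now + max + rnd % max;
}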

--- 7 unchanged lines hidden (view full) ---

148 spin_lock_bh(&unused_peers.lock);
149 list_del_init(&p->unused);
150 spin_unlock_bh(&unused_peers.lock);
151 }
152}
153
154/*
155 * Called with local BH disabled and the pool lock held.
156 * _stack is known to be NULL or not at compile time,
157 * so compiler will optimize the if (_stack) tests.
158 */
159#define lookup(_daddr, _stack) \
160({ \
161 struct inet_peer *u, **v; \
162 if (_stack != NULL) { \
163 stackptr = _stack; \
164 *stackptr++ = &peers.root; \
165 } \
166 for (u = peers.root; u != peer_avl_empty; ) { \
167 if (_daddr == u->v4daddr) \
168 break; \
169 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \
170 v = &u->avl_left; \
171 else \
172 v = &u->avl_right; \
173 if (_stack != NULL) \
174 *stackptr++ = v; \
175 u = *v; \
176 } \
177 u; \
178})
179
154/*
155 * Called with local BH disabled and the pool lock held.
156 */
157#define lookup(_daddr, _stack) \
158({ \
159 struct inet_peer *u, **v; \
160 \
161 stackptr = _stack; \
162 *stackptr++ = &peers.root; \
163 for (u = peers.root; u != peer_avl_empty; ) { \
164 if (_daddr == u->v4daddr) \
165 break; \
166 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \
167 v = &u->avl_left; \
168 else \
169 v = &u->avl_right; \
170 *stackptr++ = v; \
171 u = *v; \
172 } \
173 u; \
174})
175
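Both versions of lookup() descend from peers.root comparing addresses as unsigned 32-bit values, and push the address of every child slot they pass through into the caller-supplied stack; peer_avl_rebalance() later replays that stack bottom-up to fix heights and rotate. The only real difference is that the old macro accepted a NULL stack for its read-only fast path, which the RCU version replaces with lookup_rcu_bh() below. A minimal userspace sketch of the "remember the slots you walked through" idea (hypothetical names; NULL stands in for peer_avl_empty):

#include <stddef.h>

struct node {
	unsigned int key;
	struct node *avl_left, *avl_right;
};

/* Find key, filling slots[] with the child-pointer slots followed on the
 * way down; the last entry is where a new node would be linked.  The caller
 * provides one slot per tree level (PEER_MAXDEPTH in the kernel). */
static struct node *lookup_path(struct node **root, unsigned int key,
				struct node ***slots, int *depth)
{
	struct node **v = root, *u;

	*depth = 0;
	slots[(*depth)++] = v;
	while ((u = *v) != NULL) {
		if (key == u->key)
			return u;
		v = (key < u->key) ? &u->avl_left : &u->avl_right;
		slots[(*depth)++] = v;
	}
	return NULL;
}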
180/* Called with local BH disabled and the pool write lock held. */
176/*
177 * Called with rcu_read_lock_bh()
178 * Because we hold no lock against a writer, its quite possible we fall
179 * in an endless loop.
180 * But every pointer we follow is guaranteed to be valid thanks to RCU.
181 * We exit from this function if number of links exceeds PEER_MAXDEPTH
182 */
183static struct inet_peer *lookup_rcu_bh(__be32 daddr)
184{
185 struct inet_peer *u = rcu_dereference_bh(peers.root);
186 int count = 0;
187
188 while (u != peer_avl_empty) {
189 if (daddr == u->v4daddr) {
190 if (unlikely(!atomic_inc_not_zero(&u->refcnt)))
191 u = NULL;
192 return u;
193 }
194 if ((__force __u32)daddr < (__force __u32)u->v4daddr)
195 u = rcu_dereference_bh(u->avl_left);
196 else
197 u = rcu_dereference_bh(u->avl_right);
198 if (unlikely(++count == PEER_MAXDEPTH))
199 break;
200 }
201 return NULL;
202}
203
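lookup_rcu_bh() above is the reader half of the conversion: it traverses the tree with no lock at all, relying on RCU to keep every reachable node's memory valid, it gives up after PEER_MAXDEPTH links because a concurrent rebalance could otherwise walk it in circles, and it takes a reference only through atomic_inc_not_zero(), so it can never resurrect an entry whose last reference has already been dropped. The refcount part of that protocol, sketched with C11 atomics (hypothetical helper name; RCU itself is not modelled):

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only if at least one is still held elsewhere;
 * a userspace analogue of atomic_inc_not_zero(). */
static bool get_ref_not_zero(atomic_int *refcnt)
{
	int old = atomic_load(refcnt);

	while (old != 0) {
		/* on failure the CAS reloads 'old' and we retry */
		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
			return true;
	}
	return false;	/* object is already being torn down */
}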
204/* Called with local BH disabled and the pool lock held. */
181#define lookup_rightempty(start) \
182({ \
183 struct inet_peer *u, **v; \
184 *stackptr++ = &start->avl_left; \
185 v = &start->avl_left; \
186 for (u = *v; u->avl_right != peer_avl_empty; ) { \
187 v = &u->avl_right; \
188 *stackptr++ = v; \
189 u = *v; \
190 } \
191 u; \
192})
193
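lookup_rightempty(start) records the path to the rightmost node under start->avl_left, i.e. the in-order predecessor of start; unlink_from_pool() splices that node into the position of the entry being deleted, so the tree stays ordered and only the recorded slots need rebalancing.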
194/* Called with local BH disabled and the pool write lock held.
195 * Variable names are the proof of operation correctness.
196 * Look into mm/map_avl.c for more detail description of the ideas. */
218/* Called with local BH disabled and the pool lock held.
219 * Variable names are the proof of operation correctness.
220 * Look into mm/map_avl.c for more detail description of the ideas.
221 */
197static void peer_avl_rebalance(struct inet_peer **stack[],
198 struct inet_peer ***stackend)
199{
200 struct inet_peer **nodep, *node, *l, *r;
201 int lh, rh;
202
203 while (stackend > stack) {
204 nodep = *--stackend;

--- 59 unchanged lines hidden (view full) ---

264 *nodep = rl;
265 }
266 } else {
267 node->avl_height = (lh > rh ? lh : rh) + 1;
268 }
269 }
270}
271
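peer_avl_rebalance() walks the recorded slot stack from the deepest change back towards the root, recomputing each node's height and rotating wherever the left and right subtree heights differ by more than one; the visible fragments show the plain height update and the tail of a double-rotation case. For orientation, a single right rotation written in the same slot-based style (a hedged sketch with hypothetical names, not the kernel code):

struct avl_node {
	struct avl_node *avl_left, *avl_right;
	int avl_height;
};

static int height(struct avl_node *n)
{
	return n ? n->avl_height : 0;
}

/* The left child 'l' becomes the new root of the subtree stored in *slot. */
static void rotate_right(struct avl_node **slot)
{
	struct avl_node *node = *slot, *l = node->avl_left;
	int lh, rh;

	node->avl_left = l->avl_right;
	lh = height(node->avl_left);
	rh = height(node->avl_right);
	node->avl_height = (lh > rh ? lh : rh) + 1;

	l->avl_right = node;
	lh = height(l->avl_left);
	rh = height(l->avl_right);
	l->avl_height = (lh > rh ? lh : rh) + 1;

	*slot = l;
}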
272/* Called with local BH disabled and the pool write lock held. */
297/* Called with local BH disabled and the pool lock held. */
273#define link_to_pool(n) \
274do { \
275 n->avl_height = 1; \
276 n->avl_left = peer_avl_empty; \
277 n->avl_right = peer_avl_empty; \
303 smp_wmb(); /* lockless readers can catch us now */ \
278 **--stackptr = n; \
279 peer_avl_rebalance(stack, stackptr); \
280} while (0)
281
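The one functional addition to link_to_pool() is the smp_wmb() slipped in before the node is stored into its slot: the writer must make v4daddr, the child pointers and the height visible before any lockless reader can reach the node through the tree. In C11 terms the same publish step is a release store of the linking pointer, for example (a sketch under that analogy, hypothetical names):

#include <stdatomic.h>

struct peer {
	unsigned int v4daddr;
	int avl_height;
	/* ... */
};

/* Initialise the node fully, then publish it with release semantics so a
 * reader that loads the slot with acquire sees the initialised fields. */
static void publish_peer(_Atomic(struct peer *) *slot, struct peer *p,
			 unsigned int daddr)
{
	p->v4daddr = daddr;
	p->avl_height = 1;
	atomic_store_explicit(slot, p, memory_order_release);
}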
308static void inetpeer_free_rcu(struct rcu_head *head)
309{
310 kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
311}
312
282/* May be called with local BH enabled. */
283static void unlink_from_pool(struct inet_peer *p)
284{
285 int do_free;
286
287 do_free = 0;
288
289 write_lock_bh(&peers.lock);
320 spin_lock_bh(&peers.lock);
290 /* Check the reference counter. It was artificially incremented by 1
291 * in cleanup() function to prevent sudden disappearing. If the
292 * reference count is still 1 then the node is referenced only as `p'
293 * here and from the pool. So under the exclusive pool lock it's safe
294 * to remove the node and free it later. */
295 if (atomic_read(&p->refcnt) == 1) {
321 /* Check the reference counter. It was artificially incremented by 1
322 * in cleanup() function to prevent sudden disappearing. If we can
323 * atomically (because of lockless readers) take this last reference,
324 * it's safe to remove the node and free it later.
325 */
326 if (atomic_cmpxchg(&p->refcnt, 1, 0) == 1) {
296 struct inet_peer **stack[PEER_MAXDEPTH];
297 struct inet_peer ***stackptr, ***delp;
298 if (lookup(p->v4daddr, stack) != p)
299 BUG();
300 delp = stackptr - 1; /* *delp[0] == p */
301 if (p->avl_left == peer_avl_empty) {
302 *delp[0] = p->avl_right;
303 --stackptr;

--- 12 unchanged lines hidden (view full) ---

316 t->avl_height = p->avl_height;
317 BUG_ON(delp[1] != &p->avl_left);
318 delp[1] = &t->avl_left; /* was &p->avl_left */
319 }
320 peer_avl_rebalance(stack, stackptr);
321 peers.total--;
322 do_free = 1;
323 }
324 write_unlock_bh(&peers.lock);
355 spin_unlock_bh(&peers.lock);
325
326 if (do_free)
327 kmem_cache_free(peer_cachep, p);
358 call_rcu_bh(&p->rcu, inetpeer_free_rcu);
328 else
329 /* The node is used again. Decrease the reference counter
330 * back. The loop "cleanup -> unlink_from_unused
331 * -> unlink_from_pool -> putpeer -> link_to_unused
332 * -> cleanup (for the same node)"
333 * doesn't really exist because the entry will have a
334 * recent deletion time and will not be cleaned again soon. */
335 inet_putpeer(p);
336}
337
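unlink_from_pool() is the writer half of the refcount protocol. The old code could simply atomic_read() the counter because readers also took the pool lock; the new code must instead claim the last reference with atomic_cmpxchg(&p->refcnt, 1, 0), so a concurrent lookup_rcu_bh() either bumps the count first (and the unlink is abandoned via inet_putpeer()) or finds it already zero and backs off. The node is then handed to call_rcu_bh() rather than freed immediately, giving readers still inside the tree time to finish. The claim step in C11 form (hypothetical helper name, illustrative only):

#include <stdatomic.h>
#include <stdbool.h>

/* Succeeds only if we held the one remaining reference, mirroring
 * atomic_cmpxchg(&p->refcnt, 1, 0) == 1 above. */
static bool claim_last_ref(atomic_int *refcnt)
{
	int expected = 1;

	return atomic_compare_exchange_strong(refcnt, &expected, 0);
}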
338/* May be called with local BH enabled. */
339static int cleanup_once(unsigned long ttl)
340{
341 struct inet_peer *p = NULL;
342

--- 27 unchanged lines hidden (view full) ---

370
371 unlink_from_pool(p);
372 return 0;
373}
374
375/* Called with or without local BH being disabled. */
376struct inet_peer *inet_getpeer(__be32 daddr, int create)
377{
378 struct inet_peer *p, *n;
410 struct inet_peer *p;
379 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
380
381 /* Look up for the address quickly. */
382 read_lock_bh(&peers.lock);
383 p = lookup(daddr, NULL);
384 if (p != peer_avl_empty)
385 atomic_inc(&p->refcnt);
386 read_unlock_bh(&peers.lock);
387
388 if (p != peer_avl_empty) {
389 /* The existing node has been found. */
390 /* Remove the entry from unused list if it was there. */
391 unlink_from_unused(p);
392 return p;
393 }
394
395 if (!create)
396 return NULL;
397
398 /* Allocate the space outside the locked region. */
399 n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
400 if (n == NULL)
401 return NULL;
402 n->v4daddr = daddr;
403 atomic_set(&n->refcnt, 1);
404 atomic_set(&n->rid, 0);
405 atomic_set(&n->ip_id_count, secure_ip_id(daddr));
406 n->tcp_ts_stamp = 0;
407
408 write_lock_bh(&peers.lock);
409 /* Check if an entry has suddenly appeared. */
410 p = lookup(daddr, stack);
411 if (p != peer_avl_empty)
412 goto out_free;
413
414 /* Link the node. */
415 link_to_pool(n);
416 INIT_LIST_HEAD(&n->unused);
417 peers.total++;
418 write_unlock_bh(&peers.lock);
419
420 if (peers.total >= inet_peer_threshold)
421 /* Remove one less-recently-used entry. */
422 cleanup_once(0);
423
424 return n;
425
426out_free:
427 /* The appropriate node is already in the pool. */
428 atomic_inc(&p->refcnt);
429 write_unlock_bh(&peers.lock);
430 /* Remove the entry from unused list if it was there. */
431 unlink_from_unused(p);
432 /* Free preallocated the preallocated node. */
433 kmem_cache_free(peer_cachep, n);
434 return p;
435}
436
413 /* Look up for the address quickly, lockless.
414 * Because of a concurrent writer, we might not find an existing entry.
415 */
416 rcu_read_lock_bh();
417 p = lookup_rcu_bh(daddr);
418 rcu_read_unlock_bh();
419
420 if (p) {
421 /* The existing node has been found.
422 * Remove the entry from unused list if it was there.
423 */
424 unlink_from_unused(p);
425 return p;
426 }
427
428 /* retry an exact lookup, taking the lock before.
429 * At least, nodes should be hot in our cache.
430 */
431 spin_lock_bh(&peers.lock);
432 p = lookup(daddr, stack);
433 if (p != peer_avl_empty) {
434 atomic_inc(&p->refcnt);
435 spin_unlock_bh(&peers.lock);
436 /* Remove the entry from unused list if it was there. */
437 unlink_from_unused(p);
438 return p;
439 }
440 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
441 if (p) {
442 p->v4daddr = daddr;
443 atomic_set(&p->refcnt, 1);
444 atomic_set(&p->rid, 0);
445 atomic_set(&p->ip_id_count, secure_ip_id(daddr));
446 p->tcp_ts_stamp = 0;
447 INIT_LIST_HEAD(&p->unused);
448
449
450 /* Link the node. */
451 link_to_pool(p);
452 peers.total++;
453 }
454 spin_unlock_bh(&peers.lock);
455
456 if (peers.total >= inet_peer_threshold)
457 /* Remove one less-recently-used entry. */
458 cleanup_once(0);
459
460 return p;
461}
462
463/* Called with local BH disabled. */
464static void peer_check_expire(unsigned long dummy)
465{
466 unsigned long now = jiffies;
467 int ttl;

--- 37 unchanged lines hidden ---
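The rewritten inet_getpeer() is the shape the whole conversion aims for: try a completely lockless lookup, and only on a miss take the spinlock, repeat the lookup (another CPU may have inserted the entry in the meantime), then allocate and link the new node while still holding the lock, which removes the old preallocate-then-maybe-free dance and its out_free label. A self-contained userspace sketch of that lookup, then lock, then recheck, then publish pattern, using a sorted insert-only linked list in place of the AVL tree and C11 acquire/release in place of RCU (all names hypothetical; nodes are never freed here, which is what lets the sketch omit RCU and reference counting):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct entry {
	unsigned int key;
	_Atomic(struct entry *) next;
};

static _Atomic(struct entry *) list_head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Lockless fast path: readers follow acquire-loaded pointers only, so they
 * always see fully initialised nodes published by the writer below. */
static struct entry *list_lookup(unsigned int key)
{
	struct entry *e = atomic_load_explicit(&list_head, memory_order_acquire);

	while (e && e->key < key)
		e = atomic_load_explicit(&e->next, memory_order_acquire);
	return (e && e->key == key) ? e : NULL;
}

/* Slow path in the spirit of the new inet_getpeer(): recheck under the
 * lock, then allocate and publish the node while still holding it. */
static struct entry *list_getentry(unsigned int key, int create)
{
	_Atomic(struct entry *) *slot;
	struct entry *e, *n;

	e = list_lookup(key);			/* lockless attempt */
	if (e || !create)
		return e;

	pthread_mutex_lock(&list_lock);
	slot = &list_head;
	e = atomic_load_explicit(slot, memory_order_relaxed);
	while (e && e->key < key) {
		slot = &e->next;
		e = atomic_load_explicit(slot, memory_order_relaxed);
	}
	if (e && e->key == key) {		/* lost a race with another inserter */
		pthread_mutex_unlock(&list_lock);
		return e;
	}
	n = malloc(sizeof(*n));
	if (n) {
		n->key = key;
		atomic_init(&n->next, e);
		/* release: the fields above are visible before the node is */
		atomic_store_explicit(slot, n, memory_order_release);
	}
	pthread_mutex_unlock(&list_lock);
	return n;
}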