xref: /linux/kernel/bpf/hashtab.c (revision 3f2fb9a834cb1fcddbae22deca7fde136944dc89)
1 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2  *
3  * This program is free software; you can redistribute it and/or
4  * modify it under the terms of version 2 of the GNU General Public
5  * License as published by the Free Software Foundation.
6  *
7  * This program is distributed in the hope that it will be useful, but
8  * WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10  * General Public License for more details.
11  */
12 #include <linux/bpf.h>
13 #include <linux/jhash.h>
14 #include <linux/filter.h>
15 #include <linux/vmalloc.h>
16 
17 struct bucket {
18 	struct hlist_head head;
19 	raw_spinlock_t lock;
20 };
21 
22 struct bpf_htab {
23 	struct bpf_map map;
24 	struct bucket *buckets;
25 	atomic_t count;	/* number of elements in this hashtable */
26 	u32 n_buckets;	/* number of hash buckets */
27 	u32 elem_size;	/* size of each element in bytes */
28 };
29 
30 /* each htab element is struct htab_elem + key + value */
31 struct htab_elem {
32 	struct hlist_node hash_node;
33 	struct rcu_head rcu;
34 	union {
35 		u32 hash;
36 		u32 key_size;
37 	};
38 	char key[0] __aligned(8);
39 };
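
/* Illustrative sketch (not part of the original file): the element layout
 * implied by alloc_htab_elem() and the lookup helpers below.
 *
 *	regular hash map element:
 *	+------------------+-------------------------+-------------+
 *	| struct htab_elem | key (padded to 8 bytes) | value       |
 *	+------------------+-------------------------+-------------+
 *
 *	per-cpu hash map element:
 *	+------------------+-------------------------+---------------------+
 *	| struct htab_elem | key (padded to 8 bytes) | void __percpu *pptr |
 *	+------------------+-------------------------+---------------------+
 *
 * so for a regular map the value of element 'l' lives at
 *
 *	l->key + round_up(map->key_size, 8)
 *
 * while a per-cpu map stores a pointer to per-cpu storage right after the
 * key (see htab_elem_set_ptr()).  The 'hash' member is used while the
 * element is linked into a bucket; 'key_size' reuses the same word only on
 * the per-cpu free path (see free_htab_elem()).
 */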
40 
41 /* Called from syscall */
42 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
43 {
44 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
45 	struct bpf_htab *htab;
46 	int err, i;
47 	u64 cost;
48 
49 	htab = kzalloc(sizeof(*htab), GFP_USER);
50 	if (!htab)
51 		return ERR_PTR(-ENOMEM);
52 
53 	/* mandatory map attributes */
54 	htab->map.map_type = attr->map_type;
55 	htab->map.key_size = attr->key_size;
56 	htab->map.value_size = attr->value_size;
57 	htab->map.max_entries = attr->max_entries;
58 
59 	/* check sanity of attributes.
60 	 * value_size == 0 may be allowed in the future to use map as a set
61 	 */
62 	err = -EINVAL;
63 	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
64 	    htab->map.value_size == 0)
65 		goto free_htab;
66 
67 	/* hash table size must be power of 2 */
68 	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
69 
70 	err = -E2BIG;
71 	if (htab->map.key_size > MAX_BPF_STACK)
72 		/* eBPF programs initialize keys on stack, so they cannot be
73 		 * larger than max stack size
74 		 */
75 		goto free_htab;
76 
77 	if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
78 	    MAX_BPF_STACK - sizeof(struct htab_elem))
79 		/* if value_size is bigger, the user space won't be able to
80 		 * access the elements via bpf syscall. This check also makes
81 		 * sure that the elem_size doesn't overflow and it's
82 		 * kmalloc-able later in htab_map_update_elem()
83 		 */
84 		goto free_htab;
85 
86 	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
87 		/* make sure the size for pcpu_alloc() is reasonable */
88 		goto free_htab;
89 
90 	htab->elem_size = sizeof(struct htab_elem) +
91 			  round_up(htab->map.key_size, 8);
92 	if (percpu)
93 		htab->elem_size += sizeof(void *);
94 	else
95 		htab->elem_size += htab->map.value_size;
96 
97 	/* prevent zero size kmalloc and check for u32 overflow */
98 	if (htab->n_buckets == 0 ||
99 	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
100 		goto free_htab;
101 
102 	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
103 	       (u64) htab->elem_size * htab->map.max_entries;
104 
105 	if (percpu)
106 		cost += (u64) round_up(htab->map.value_size, 8) *
107 			num_possible_cpus() * htab->map.max_entries;
108 
109 	if (cost >= U32_MAX - PAGE_SIZE)
110 		/* make sure page count doesn't overflow */
111 		goto free_htab;
112 
113 	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
114 
115 	err = -ENOMEM;
116 	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
117 				      GFP_USER | __GFP_NOWARN);
118 
119 	if (!htab->buckets) {
120 		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
121 		if (!htab->buckets)
122 			goto free_htab;
123 	}
124 
125 	for (i = 0; i < htab->n_buckets; i++) {
126 		INIT_HLIST_HEAD(&htab->buckets[i].head);
127 		raw_spin_lock_init(&htab->buckets[i].lock);
128 	}
129 
130 	atomic_set(&htab->count, 0);
131 
132 	return &htab->map;
133 
134 free_htab:
135 	kfree(htab);
136 	return ERR_PTR(err);
137 }
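
/* Worked example of the sizing arithmetic above (illustrative; the struct
 * sizes assume a typical 64-bit build without spinlock debugging, where
 * sizeof(struct htab_elem) == 40 and sizeof(struct bucket) == 16):
 *
 *	attr: key_size = 4, value_size = 8, max_entries = 1000
 *
 *	n_buckets = roundup_pow_of_two(1000)                  = 1024
 *	elem_size = 40 + round_up(4, 8) + 8                   = 56 bytes
 *	cost      = 1024 * 16 + 56 * 1000                     = 72384 bytes
 *	map.pages = round_up(72384, PAGE_SIZE) >> PAGE_SHIFT  = 18 (4 KiB pages)
 */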
138 
139 static inline u32 htab_map_hash(const void *key, u32 key_len)
140 {
141 	return jhash(key, key_len, 0);
142 }
143 
144 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
145 {
146 	return &htab->buckets[hash & (htab->n_buckets - 1)];
147 }
148 
149 static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
150 {
151 	return &__select_bucket(htab, hash)->head;
152 }
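
/* n_buckets is rounded up to a power of two in htab_map_alloc(), so
 * 'hash & (htab->n_buckets - 1)' is equivalent to 'hash % n_buckets' while
 * avoiding a division.  For example, with n_buckets == 1024 the mask is
 * 0x3ff and only the low 10 bits of the jhash value select the bucket.
 */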
153 
154 static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
155 					 void *key, u32 key_size)
156 {
157 	struct htab_elem *l;
158 
159 	hlist_for_each_entry_rcu(l, head, hash_node)
160 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
161 			return l;
162 
163 	return NULL;
164 }
165 
166 /* Called from syscall or from eBPF program */
167 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
168 {
169 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
170 	struct hlist_head *head;
171 	struct htab_elem *l;
172 	u32 hash, key_size;
173 
174 	/* Must be called with rcu_read_lock. */
175 	WARN_ON_ONCE(!rcu_read_lock_held());
176 
177 	key_size = map->key_size;
178 
179 	hash = htab_map_hash(key, key_size);
180 
181 	head = select_bucket(htab, hash);
182 
183 	l = lookup_elem_raw(head, hash, key, key_size);
184 
185 	return l;
186 }
187 
188 static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
189 {
190 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
191 
192 	if (l)
193 		return l->key + round_up(map->key_size, 8);
194 
195 	return NULL;
196 }
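
/* Minimal usage sketch (illustrative only): from an eBPF program the
 * bpf_map_lookup_elem() helper lands here, and the pointer it returns is the
 * value area computed above, which the program may read and modify in place
 * after the mandatory NULL check:
 *
 *	u32 key = 1;
 *	long *val = bpf_map_lookup_elem(&my_hash_map, &key);
 *
 *	if (val)
 *		__sync_fetch_and_add(val, 1);
 *
 * 'my_hash_map' is a hypothetical BPF_MAP_TYPE_HASH map declared by the
 * program (value_size == sizeof(long) assumed).
 */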
197 
198 /* Called from syscall */
199 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
200 {
201 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
202 	struct hlist_head *head;
203 	struct htab_elem *l, *next_l;
204 	u32 hash, key_size;
205 	int i;
206 
207 	WARN_ON_ONCE(!rcu_read_lock_held());
208 
209 	key_size = map->key_size;
210 
211 	hash = htab_map_hash(key, key_size);
212 
213 	head = select_bucket(htab, hash);
214 
215 	/* lookup the key */
216 	l = lookup_elem_raw(head, hash, key, key_size);
217 
218 	if (!l) {
219 		i = 0;
220 		goto find_first_elem;
221 	}
222 
223 	/* key was found, get next key in the same bucket */
224 	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
225 				  struct htab_elem, hash_node);
226 
227 	if (next_l) {
228 		/* if next elem in this hash list is non-NULL, just return it */
229 		memcpy(next_key, next_l->key, key_size);
230 		return 0;
231 	}
232 
233 	/* no more elements in this hash list, go to the next bucket */
234 	i = hash & (htab->n_buckets - 1);
235 	i++;
236 
237 find_first_elem:
238 	/* iterate over buckets */
239 	for (; i < htab->n_buckets; i++) {
240 		head = select_bucket(htab, i);
241 
242 		/* pick first element in the bucket */
243 		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
244 					  struct htab_elem, hash_node);
245 		if (next_l) {
246 			/* if the bucket is not empty, return its first element */
247 			memcpy(next_key, next_l->key, key_size);
248 			return 0;
249 		}
250 	}
251 
252 	/* iterated over all buckets and all elements */
253 	return -ENOENT;
254 }
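
/* Userspace iteration sketch (illustrative, error handling omitted):
 * BPF_MAP_GET_NEXT_KEY is the syscall command that reaches this function.
 * 'map_fd', bpf_get_next_key() and bpf_lookup_elem() are hypothetical; the
 * latter two stand for thin wrappers around the bpf(2) syscall's
 * BPF_MAP_GET_NEXT_KEY and BPF_MAP_LOOKUP_ELEM commands:
 *
 *	u32 key = -1, next_key;		start from a key that should not exist
 *	long value;
 *
 *	while (bpf_get_next_key(map_fd, &key, &next_key) == 0) {
 *		bpf_lookup_elem(map_fd, &next_key, &value);
 *		printf("key %u -> %ld\n", next_key, value);
 *		key = next_key;
 *	}
 *
 * When the starting key is not found, the walk begins at the first bucket,
 * which is how iteration is bootstrapped; elements added or removed
 * concurrently may be missed or visited twice.
 */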
255 
256 
257 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
258 				     void __percpu *pptr)
259 {
260 	*(void __percpu **)(l->key + key_size) = pptr;
261 }
262 
263 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
264 {
265 	return *(void __percpu **)(l->key + key_size);
266 }
267 
268 static void htab_percpu_elem_free(struct htab_elem *l)
269 {
270 	free_percpu(htab_elem_get_ptr(l, l->key_size));
271 	kfree(l);
272 }
273 
274 static void htab_percpu_elem_free_rcu(struct rcu_head *head)
275 {
276 	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
277 
278 	htab_percpu_elem_free(l);
279 }
280 
281 static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
282 {
283 	if (percpu) {
284 		l->key_size = key_size;
285 		call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
286 	} else {
287 		kfree_rcu(l, rcu);
288 	}
289 }
290 
291 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
292 					 void *value, u32 key_size, u32 hash,
293 					 bool percpu, bool onallcpus)
294 {
295 	u32 size = htab->map.value_size;
296 	struct htab_elem *l_new;
297 	void __percpu *pptr;
298 
299 	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
300 	if (!l_new)
301 		return NULL;
302 
303 	memcpy(l_new->key, key, key_size);
304 	if (percpu) {
305 		/* round up value_size to a multiple of 8 bytes */
306 		size = round_up(size, 8);
307 
308 		/* alloc_percpu zero-fills */
309 		pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN);
310 		if (!pptr) {
311 			kfree(l_new);
312 			return NULL;
313 		}
314 
315 		if (!onallcpus) {
316 			/* copy true value_size bytes */
317 			memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
318 		} else {
319 			int off = 0, cpu;
320 
321 			for_each_possible_cpu(cpu) {
322 				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
323 						value + off, size);
324 				off += size;
325 			}
326 		}
327 		htab_elem_set_ptr(l_new, key_size, pptr);
328 	} else {
329 		memcpy(l_new->key + round_up(key_size, 8), value, size);
330 	}
331 
332 	l_new->hash = hash;
333 	return l_new;
334 }
335 
336 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
337 		       u64 map_flags)
338 {
339 	if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
340 		/* if elem with this 'key' doesn't exist and we've reached
341 		 * max_entries limit, fail insertion of new elem
342 		 */
343 		return -E2BIG;
344 
345 	if (l_old && map_flags == BPF_NOEXIST)
346 		/* elem already exists */
347 		return -EEXIST;
348 
349 	if (!l_old && map_flags == BPF_EXIST)
350 		/* elem doesn't exist, cannot update it */
351 		return -ENOENT;
352 
353 	return 0;
354 }
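
/* Summary of the flag semantics enforced above (BPF_ANY == 0):
 *
 *	flag		key exists		key absent
 *	BPF_ANY		replace/update		insert
 *	BPF_NOEXIST	-EEXIST			insert
 *	BPF_EXIST	replace/update		-ENOENT
 *
 * Any insert additionally fails with -E2BIG once htab->count has reached
 * map.max_entries.
 */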
355 
356 /* Called from syscall or from eBPF program */
357 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
358 				u64 map_flags)
359 {
360 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
361 	struct htab_elem *l_new = NULL, *l_old;
362 	struct hlist_head *head;
363 	unsigned long flags;
364 	struct bucket *b;
365 	u32 key_size, hash;
366 	int ret;
367 
368 	if (unlikely(map_flags > BPF_EXIST))
369 		/* unknown flags */
370 		return -EINVAL;
371 
372 	WARN_ON_ONCE(!rcu_read_lock_held());
373 
374 	key_size = map->key_size;
375 
376 	hash = htab_map_hash(key, key_size);
377 
378 	/* allocate new element outside of the lock, since
379 	 * we're most likely going to insert it
380 	 */
381 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
382 	if (!l_new)
383 		return -ENOMEM;
384 
385 	b = __select_bucket(htab, hash);
386 	head = &b->head;
387 
388 	/* bpf_map_update_elem() can be called in_irq() */
389 	raw_spin_lock_irqsave(&b->lock, flags);
390 
391 	l_old = lookup_elem_raw(head, hash, key, key_size);
392 
393 	ret = check_flags(htab, l_old, map_flags);
394 	if (ret)
395 		goto err;
396 
397 	/* add new element to the head of the list, so that
398 	 * concurrent search will find it before old elem
399 	 */
400 	hlist_add_head_rcu(&l_new->hash_node, head);
401 	if (l_old) {
402 		hlist_del_rcu(&l_old->hash_node);
403 		kfree_rcu(l_old, rcu);
404 	} else {
405 		atomic_inc(&htab->count);
406 	}
407 	raw_spin_unlock_irqrestore(&b->lock, flags);
408 	return 0;
409 err:
410 	raw_spin_unlock_irqrestore(&b->lock, flags);
411 	kfree(l_new);
412 	return ret;
413 }
414 
415 static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
416 					 void *value, u64 map_flags,
417 					 bool onallcpus)
418 {
419 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
420 	struct htab_elem *l_new = NULL, *l_old;
421 	struct hlist_head *head;
422 	unsigned long flags;
423 	struct bucket *b;
424 	u32 key_size, hash;
425 	int ret;
426 
427 	if (unlikely(map_flags > BPF_EXIST))
428 		/* unknown flags */
429 		return -EINVAL;
430 
431 	WARN_ON_ONCE(!rcu_read_lock_held());
432 
433 	key_size = map->key_size;
434 
435 	hash = htab_map_hash(key, key_size);
436 
437 	b = __select_bucket(htab, hash);
438 	head = &b->head;
439 
440 	/* bpf_map_update_elem() can be called in_irq() */
441 	raw_spin_lock_irqsave(&b->lock, flags);
442 
443 	l_old = lookup_elem_raw(head, hash, key, key_size);
444 
445 	ret = check_flags(htab, l_old, map_flags);
446 	if (ret)
447 		goto err;
448 
449 	if (l_old) {
450 		void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
451 		u32 size = htab->map.value_size;
452 
453 		/* per-cpu hash map can update value in-place */
454 		if (!onallcpus) {
455 			memcpy(this_cpu_ptr(pptr), value, size);
456 		} else {
457 			int off = 0, cpu;
458 
459 			size = round_up(size, 8);
460 			for_each_possible_cpu(cpu) {
461 				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
462 						value + off, size);
463 				off += size;
464 			}
465 		}
466 	} else {
467 		l_new = alloc_htab_elem(htab, key, value, key_size,
468 					hash, true, onallcpus);
469 		if (!l_new) {
470 			ret = -ENOMEM;
471 			goto err;
472 		}
473 		hlist_add_head_rcu(&l_new->hash_node, head);
474 		atomic_inc(&htab->count);
475 	}
476 	ret = 0;
477 err:
478 	raw_spin_unlock_irqrestore(&b->lock, flags);
479 	return ret;
480 }
481 
482 static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
483 				       void *value, u64 map_flags)
484 {
485 	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
486 }
487 
488 /* Called from syscall or from eBPF program */
489 static int htab_map_delete_elem(struct bpf_map *map, void *key)
490 {
491 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
492 	bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
493 	struct hlist_head *head;
494 	struct bucket *b;
495 	struct htab_elem *l;
496 	unsigned long flags;
497 	u32 hash, key_size;
498 	int ret = -ENOENT;
499 
500 	WARN_ON_ONCE(!rcu_read_lock_held());
501 
502 	key_size = map->key_size;
503 
504 	hash = htab_map_hash(key, key_size);
505 	b = __select_bucket(htab, hash);
506 	head = &b->head;
507 
508 	raw_spin_lock_irqsave(&b->lock, flags);
509 
510 	l = lookup_elem_raw(head, hash, key, key_size);
511 
512 	if (l) {
513 		hlist_del_rcu(&l->hash_node);
514 		atomic_dec(&htab->count);
515 		free_htab_elem(l, percpu, key_size);
516 		ret = 0;
517 	}
518 
519 	raw_spin_unlock_irqrestore(&b->lock, flags);
520 	return ret;
521 }
522 
523 static void delete_all_elements(struct bpf_htab *htab)
524 {
525 	int i;
526 
527 	for (i = 0; i < htab->n_buckets; i++) {
528 		struct hlist_head *head = select_bucket(htab, i);
529 		struct hlist_node *n;
530 		struct htab_elem *l;
531 
532 		hlist_for_each_entry_safe(l, n, head, hash_node) {
533 			hlist_del_rcu(&l->hash_node);
534 			atomic_dec(&htab->count);
535 			if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
536 				l->key_size = htab->map.key_size;
537 				htab_percpu_elem_free(l);
538 			} else {
539 				kfree(l);
540 			}
541 		}
542 	}
543 }
544 
545 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
546 static void htab_map_free(struct bpf_map *map)
547 {
548 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
549 
550 	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
551 	 * so the programs (there can be more than one that used this map) have
552 	 * been disconnected from events. Wait for outstanding critical sections in
553 	 * these programs to complete
554 	 */
555 	synchronize_rcu();
556 
557 	/* some of the kfree_rcu() callbacks for elements of this map may not have
558 	 * executed. It's ok. Proceed to free residual elements and map itself
559 	 */
560 	delete_all_elements(htab);
561 	kvfree(htab->buckets);
562 	kfree(htab);
563 }
564 
565 static const struct bpf_map_ops htab_ops = {
566 	.map_alloc = htab_map_alloc,
567 	.map_free = htab_map_free,
568 	.map_get_next_key = htab_map_get_next_key,
569 	.map_lookup_elem = htab_map_lookup_elem,
570 	.map_update_elem = htab_map_update_elem,
571 	.map_delete_elem = htab_map_delete_elem,
572 };
573 
574 static struct bpf_map_type_list htab_type __read_mostly = {
575 	.ops = &htab_ops,
576 	.type = BPF_MAP_TYPE_HASH,
577 };
578 
579 /* Called from eBPF program */
580 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
581 {
582 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
583 
584 	if (l)
585 		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
586 	else
587 		return NULL;
588 }
589 
590 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
591 {
592 	struct htab_elem *l;
593 	void __percpu *pptr;
594 	int ret = -ENOENT;
595 	int cpu, off = 0;
596 	u32 size;
597 
598 	/* per_cpu areas are zero-filled and bpf programs can only
599 	 * access 'value_size' of them, so copying rounded areas
600 	 * will not leak any kernel data
601 	 */
602 	size = round_up(map->value_size, 8);
603 	rcu_read_lock();
604 	l = __htab_map_lookup_elem(map, key);
605 	if (!l)
606 		goto out;
607 	pptr = htab_elem_get_ptr(l, map->key_size);
608 	for_each_possible_cpu(cpu) {
609 		bpf_long_memcpy(value + off,
610 				per_cpu_ptr(pptr, cpu), size);
611 		off += size;
612 	}
613 	ret = 0;
614 out:
615 	rcu_read_unlock();
616 	return ret;
617 }
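
/* Userspace sketch for reading a per-cpu element (illustrative): the value
 * buffer passed to BPF_MAP_LOOKUP_ELEM must hold one rounded-up value per
 * possible CPU, matching the copy loop above.  'map_fd' and the hypothetical
 * bpf_lookup_elem() wrapper are as in the iteration example; value_size == 8
 * is assumed, so no extra rounding is needed:
 *
 *	unsigned int ncpus = sysconf(_SC_NPROCESSORS_CONF);
 *	__u64 values[ncpus];
 *	__u32 key = 1;
 *
 *	if (bpf_lookup_elem(map_fd, &key, values) == 0) {
 *		for (unsigned int cpu = 0; cpu < ncpus; cpu++)
 *			printf("cpu %u: %llu\n", cpu, values[cpu]);
 *	}
 *
 * Strictly, the kernel copies num_possible_cpus() entries; userspace can read
 * that count from /sys/devices/system/cpu/possible rather than relying on
 * sysconf().
 */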
618 
619 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
620 			   u64 map_flags)
621 {
622 	int ret;
623 
624 	rcu_read_lock();
625 	ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
626 	rcu_read_unlock();
627 
628 	return ret;
629 }
630 
631 static const struct bpf_map_ops htab_percpu_ops = {
632 	.map_alloc = htab_map_alloc,
633 	.map_free = htab_map_free,
634 	.map_get_next_key = htab_map_get_next_key,
635 	.map_lookup_elem = htab_percpu_map_lookup_elem,
636 	.map_update_elem = htab_percpu_map_update_elem,
637 	.map_delete_elem = htab_map_delete_elem,
638 };
639 
640 static struct bpf_map_type_list htab_percpu_type __read_mostly = {
641 	.ops = &htab_percpu_ops,
642 	.type = BPF_MAP_TYPE_PERCPU_HASH,
643 };
644 
645 static int __init register_htab_map(void)
646 {
647 	bpf_register_map_type(&htab_type);
648 	bpf_register_map_type(&htab_percpu_type);
649 	return 0;
650 }
651 late_initcall(register_htab_map);
652