xref: /linux/kernel/bpf/hashtab.c (revision eedb3c44313ae0785e1dc62c6910557953887388)
1 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2  * Copyright (c) 2016 Facebook
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  */
13 #include <linux/bpf.h>
14 #include <linux/jhash.h>
15 #include <linux/filter.h>
16 #include <linux/rculist_nulls.h>
17 #include "percpu_freelist.h"
18 #include "bpf_lru_list.h"
19 
20 struct bucket {
21 	struct hlist_nulls_head head;
22 	raw_spinlock_t lock;
23 };
24 
25 struct bpf_htab {
26 	struct bpf_map map;
27 	struct bucket *buckets;
28 	void *elems;
29 	union {
30 		struct pcpu_freelist freelist;
31 		struct bpf_lru lru;
32 	};
33 	void __percpu *extra_elems;
34 	atomic_t count;	/* number of elements in this hashtable */
35 	u32 n_buckets;	/* number of hash buckets */
36 	u32 elem_size;	/* size of each element in bytes */
37 };
38 
39 enum extra_elem_state {
40 	HTAB_NOT_AN_EXTRA_ELEM = 0,
41 	HTAB_EXTRA_ELEM_FREE,
42 	HTAB_EXTRA_ELEM_USED
43 };
44 
45 /* each htab element is struct htab_elem + key + value */
46 struct htab_elem {
47 	union {
48 		struct hlist_nulls_node hash_node;
49 		struct {
50 			void *padding;
51 			union {
52 				struct bpf_htab *htab;
53 				struct pcpu_freelist_node fnode;
54 			};
55 		};
56 	};
57 	union {
58 		struct rcu_head rcu;
59 		enum extra_elem_state state;
60 		struct bpf_lru_node lru_node;
61 	};
62 	u32 hash;
63 	char key[0] __aligned(8);
64 };
65 
66 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
67 
68 static bool htab_is_lru(const struct bpf_htab *htab)
69 {
70 	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
71 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
72 }
73 
74 static bool htab_is_percpu(const struct bpf_htab *htab)
75 {
76 	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
77 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
78 }
79 
80 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
81 				     void __percpu *pptr)
82 {
83 	*(void __percpu **)(l->key + key_size) = pptr;
84 }
85 
86 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
87 {
88 	return *(void __percpu **)(l->key + key_size);
89 }
90 
91 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
92 {
93 	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
94 }
95 
96 static void htab_free_elems(struct bpf_htab *htab)
97 {
98 	int i;
99 
100 	if (!htab_is_percpu(htab))
101 		goto free_elems;
102 
103 	for (i = 0; i < htab->map.max_entries; i++) {
104 		void __percpu *pptr;
105 
106 		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
107 					 htab->map.key_size);
108 		free_percpu(pptr);
109 	}
110 free_elems:
111 	bpf_map_area_free(htab->elems);
112 }
113 
114 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
115 					  u32 hash)
116 {
117 	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
118 	struct htab_elem *l;
119 
120 	if (node) {
121 		l = container_of(node, struct htab_elem, lru_node);
122 		memcpy(l->key, key, htab->map.key_size);
123 		return l;
124 	}
125 
126 	return NULL;
127 }
128 
129 static int prealloc_init(struct bpf_htab *htab)
130 {
131 	int err = -ENOMEM, i;
132 
133 	htab->elems = bpf_map_area_alloc(htab->elem_size *
134 					 htab->map.max_entries);
135 	if (!htab->elems)
136 		return -ENOMEM;
137 
138 	if (!htab_is_percpu(htab))
139 		goto skip_percpu_elems;
140 
141 	for (i = 0; i < htab->map.max_entries; i++) {
142 		u32 size = round_up(htab->map.value_size, 8);
143 		void __percpu *pptr;
144 
145 		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
146 		if (!pptr)
147 			goto free_elems;
148 		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
149 				  pptr);
150 	}
151 
152 skip_percpu_elems:
153 	if (htab_is_lru(htab))
154 		err = bpf_lru_init(&htab->lru,
155 				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
156 				   offsetof(struct htab_elem, hash) -
157 				   offsetof(struct htab_elem, lru_node),
158 				   htab_lru_map_delete_node,
159 				   htab);
160 	else
161 		err = pcpu_freelist_init(&htab->freelist);
162 
163 	if (err)
164 		goto free_elems;
165 
166 	if (htab_is_lru(htab))
167 		bpf_lru_populate(&htab->lru, htab->elems,
168 				 offsetof(struct htab_elem, lru_node),
169 				 htab->elem_size, htab->map.max_entries);
170 	else
171 		pcpu_freelist_populate(&htab->freelist,
172 				       htab->elems + offsetof(struct htab_elem, fnode),
173 				       htab->elem_size, htab->map.max_entries);
174 
175 	return 0;
176 
177 free_elems:
178 	htab_free_elems(htab);
179 	return err;
180 }
181 
182 static void prealloc_destroy(struct bpf_htab *htab)
183 {
184 	htab_free_elems(htab);
185 
186 	if (htab_is_lru(htab))
187 		bpf_lru_destroy(&htab->lru);
188 	else
189 		pcpu_freelist_destroy(&htab->freelist);
190 }
191 
192 static int alloc_extra_elems(struct bpf_htab *htab)
193 {
194 	void __percpu *pptr;
195 	int cpu;
196 
197 	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
198 	if (!pptr)
199 		return -ENOMEM;
200 
201 	for_each_possible_cpu(cpu) {
202 		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
203 			HTAB_EXTRA_ELEM_FREE;
204 	}
205 	htab->extra_elems = pptr;
206 	return 0;
207 }
208 
209 /* Called from syscall */
210 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
211 {
212 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
213 		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
214 	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
215 		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
216 	/* percpu_lru means each cpu has its own LRU list.
217 	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
218 	 * the map's value itself is percpu.  percpu_lru has
219 	 * nothing to do with the map's value.
220 	 */
221 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
222 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
223 	struct bpf_htab *htab;
224 	int err, i;
225 	u64 cost;
226 
227 	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
228 		     offsetof(struct htab_elem, hash_node.pprev));
229 	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
230 		     offsetof(struct htab_elem, hash_node.pprev));
231 
232 	if (lru && !capable(CAP_SYS_ADMIN))
233 		/* LRU implementation is much more complicated than other
234 		 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
235 		 */
236 		return ERR_PTR(-EPERM);
237 
238 	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
239 		/* reserved bits should not be used */
240 		return ERR_PTR(-EINVAL);
241 
242 	if (!lru && percpu_lru)
243 		return ERR_PTR(-EINVAL);
244 
245 	if (lru && !prealloc)
246 		return ERR_PTR(-ENOTSUPP);
247 
248 	htab = kzalloc(sizeof(*htab), GFP_USER);
249 	if (!htab)
250 		return ERR_PTR(-ENOMEM);
251 
252 	/* mandatory map attributes */
253 	htab->map.map_type = attr->map_type;
254 	htab->map.key_size = attr->key_size;
255 	htab->map.value_size = attr->value_size;
256 	htab->map.max_entries = attr->max_entries;
257 	htab->map.map_flags = attr->map_flags;
258 
259 	/* check sanity of attributes.
260 	 * value_size == 0 may be allowed in the future to use map as a set
261 	 */
262 	err = -EINVAL;
263 	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
264 	    htab->map.value_size == 0)
265 		goto free_htab;
266 
267 	if (percpu_lru) {
268 		/* ensure each CPU's lru list has >= 1 element.
269 		 * While we are at it, make each lru list have the same
270 		 * number of elements.
271 		 */
272 		htab->map.max_entries = roundup(attr->max_entries,
273 						num_possible_cpus());
274 		if (htab->map.max_entries < attr->max_entries)
275 			htab->map.max_entries = rounddown(attr->max_entries,
276 							  num_possible_cpus());
277 	}
278 
279 	/* hash table size must be power of 2 */
280 	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
281 
282 	err = -E2BIG;
283 	if (htab->map.key_size > MAX_BPF_STACK)
284 		/* eBPF programs initialize keys on stack, so they cannot be
285 		 * larger than max stack size
286 		 */
287 		goto free_htab;
288 
289 	if (htab->map.value_size >= KMALLOC_MAX_SIZE -
290 	    MAX_BPF_STACK - sizeof(struct htab_elem))
291 		/* if value_size is bigger, the user space won't be able to
292 		 * access the elements via bpf syscall. This check also makes
293 		 * sure that the elem_size doesn't overflow and it's
294 		 * kmalloc-able later in htab_map_update_elem()
295 		 */
296 		goto free_htab;
297 
298 	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
299 		/* make sure the size for pcpu_alloc() is reasonable */
300 		goto free_htab;
301 
302 	htab->elem_size = sizeof(struct htab_elem) +
303 			  round_up(htab->map.key_size, 8);
304 	if (percpu)
305 		htab->elem_size += sizeof(void *);
306 	else
307 		htab->elem_size += round_up(htab->map.value_size, 8);
308 
309 	/* prevent zero size kmalloc and check for u32 overflow */
310 	if (htab->n_buckets == 0 ||
311 	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
312 		goto free_htab;
313 
314 	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
315 	       (u64) htab->elem_size * htab->map.max_entries;
316 
317 	if (percpu)
318 		cost += (u64) round_up(htab->map.value_size, 8) *
319 			num_possible_cpus() * htab->map.max_entries;
320 	else
321 		cost += (u64) htab->elem_size * num_possible_cpus();
322 
323 	if (cost >= U32_MAX - PAGE_SIZE)
324 		/* make sure page count doesn't overflow */
325 		goto free_htab;
326 
327 	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
328 
329 	/* if map size is larger than memlock limit, reject it early */
330 	err = bpf_map_precharge_memlock(htab->map.pages);
331 	if (err)
332 		goto free_htab;
333 
334 	err = -ENOMEM;
335 	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
336 					   sizeof(struct bucket));
337 	if (!htab->buckets)
338 		goto free_htab;
339 
340 	for (i = 0; i < htab->n_buckets; i++) {
341 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
342 		raw_spin_lock_init(&htab->buckets[i].lock);
343 	}
344 
345 	if (!percpu && !lru) {
346 		/* The LRU map can evict the least recently used element itself,
347 		 * so there is no need for an extra elem during map_update.
348 		 */
349 		err = alloc_extra_elems(htab);
350 		if (err)
351 			goto free_buckets;
352 	}
353 
354 	if (prealloc) {
355 		err = prealloc_init(htab);
356 		if (err)
357 			goto free_extra_elems;
358 	}
359 
360 	return &htab->map;
361 
362 free_extra_elems:
363 	free_percpu(htab->extra_elems);
364 free_buckets:
365 	bpf_map_area_free(htab->buckets);
366 free_htab:
367 	kfree(htab);
368 	return ERR_PTR(err);
369 }
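
/* Illustrative sketch, not part of this file: how user space reaches
 * htab_map_alloc() through the bpf(2) syscall.  Assumes <linux/bpf.h> and
 * __NR_bpf are available to the user-space build; error handling is omitted.
 * Wrapped in #if 0 because it is user-space code and must not be built here.
 */
#if 0
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int create_hash_map(unsigned int key_size, unsigned int value_size,
			   unsigned int max_entries, unsigned int map_flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_HASH;	/* or *_PERCPU_HASH, *_LRU_HASH */
	attr.key_size = key_size;		/* > 0 and <= MAX_BPF_STACK */
	attr.value_size = value_size;		/* > 0 */
	attr.max_entries = max_entries;
	attr.map_flags = map_flags;		/* e.g. BPF_F_NO_PREALLOC */

	/* returns a new map fd on success, -1 with errno set on failure */
	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
#endif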
370 
371 static inline u32 htab_map_hash(const void *key, u32 key_len)
372 {
373 	return jhash(key, key_len, 0);
374 }
375 
376 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
377 {
378 	return &htab->buckets[hash & (htab->n_buckets - 1)];
379 }
380 
381 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
382 {
383 	return &__select_bucket(htab, hash)->head;
384 }
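
/* Worked example: with max_entries = 1000, n_buckets is rounded up to the
 * next power of two, 1024, so the mask is 0x3ff.  A hash of 0x9e3779b9 then
 * lands in bucket 0x9e3779b9 & 0x3ff = 0x1b9 (441).  The power-of-two bucket
 * count is what lets the cheap mask stand in for a modulo.
 */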
385 
386 /* this lookup function can only be called with bucket lock taken */
387 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
388 					 void *key, u32 key_size)
389 {
390 	struct hlist_nulls_node *n;
391 	struct htab_elem *l;
392 
393 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
394 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
395 			return l;
396 
397 	return NULL;
398 }
399 
400 /* can be called without the bucket lock. It will repeat the loop in
401  * the unlikely event that elements moved from one bucket into another
402  * while the linked list is being walked
403  */
404 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
405 					       u32 hash, void *key,
406 					       u32 key_size, u32 n_buckets)
407 {
408 	struct hlist_nulls_node *n;
409 	struct htab_elem *l;
410 
411 again:
412 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
413 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
414 			return l;
415 
416 	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
417 		goto again;
418 
419 	return NULL;
420 }
421 
422 /* Called from syscall or from eBPF program */
423 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
424 {
425 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
426 	struct hlist_nulls_head *head;
427 	struct htab_elem *l;
428 	u32 hash, key_size;
429 
430 	/* Must be called with rcu_read_lock. */
431 	WARN_ON_ONCE(!rcu_read_lock_held());
432 
433 	key_size = map->key_size;
434 
435 	hash = htab_map_hash(key, key_size);
436 
437 	head = select_bucket(htab, hash);
438 
439 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
440 
441 	return l;
442 }
443 
444 static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
445 {
446 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
447 
448 	if (l)
449 		return l->key + round_up(map->key_size, 8);
450 
451 	return NULL;
452 }
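
/* Illustrative sketch, not part of this file: how a BPF program consumes the
 * value pointer that htab_map_lookup_elem() returns (the bpf_map_lookup_elem()
 * helper hands it through as-is).  Assumes the samples/bpf build environment,
 * i.e. "bpf_helpers.h" providing SEC(), struct bpf_map_def and the helper
 * wrappers; names like count_map are made up for the example.
 */
#if 0
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") count_map = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u64),
	.max_entries = 1024,
};

SEC("kprobe/sys_write")
int count_writes(struct pt_regs *ctx)
{
	__u32 key = 0;
	__u64 init = 1, *val;

	/* returns a pointer into the element's value area, or NULL;
	 * the verifier insists on the NULL check before dereferencing.
	 */
	val = bpf_map_lookup_elem(&count_map, &key);
	if (val)
		__sync_fetch_and_add(val, 1);
	else
		bpf_map_update_elem(&count_map, &key, &init, BPF_NOEXIST);
	return 0;
}

char _license[] SEC("license") = "GPL";
#endif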
453 
454 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
455 {
456 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
457 
458 	if (l) {
459 		bpf_lru_node_set_ref(&l->lru_node);
460 		return l->key + round_up(map->key_size, 8);
461 	}
462 
463 	return NULL;
464 }
465 
466 /* It is called from the bpf_lru_list when the LRU needs to delete
467  * older elements from the htab.
468  */
469 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
470 {
471 	struct bpf_htab *htab = (struct bpf_htab *)arg;
472 	struct htab_elem *l = NULL, *tgt_l;
473 	struct hlist_nulls_head *head;
474 	struct hlist_nulls_node *n;
475 	unsigned long flags;
476 	struct bucket *b;
477 
478 	tgt_l = container_of(node, struct htab_elem, lru_node);
479 	b = __select_bucket(htab, tgt_l->hash);
480 	head = &b->head;
481 
482 	raw_spin_lock_irqsave(&b->lock, flags);
483 
484 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
485 		if (l == tgt_l) {
486 			hlist_nulls_del_rcu(&l->hash_node);
487 			break;
488 		}
489 
490 	raw_spin_unlock_irqrestore(&b->lock, flags);
491 
492 	return l == tgt_l;
493 }
494 
495 /* Called from syscall */
496 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
497 {
498 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
499 	struct hlist_nulls_head *head;
500 	struct htab_elem *l, *next_l;
501 	u32 hash, key_size;
502 	int i;
503 
504 	WARN_ON_ONCE(!rcu_read_lock_held());
505 
506 	key_size = map->key_size;
507 
508 	hash = htab_map_hash(key, key_size);
509 
510 	head = select_bucket(htab, hash);
511 
512 	/* lookup the key */
513 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
514 
515 	if (!l) {
516 		i = 0;
517 		goto find_first_elem;
518 	}
519 
520 	/* key was found, get next key in the same bucket */
521 	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
522 				  struct htab_elem, hash_node);
523 
524 	if (next_l) {
525 		/* if next elem in this hash list is non-NULL, just return it */
526 		memcpy(next_key, next_l->key, key_size);
527 		return 0;
528 	}
529 
530 	/* no more elements in this hash list, go to the next bucket */
531 	i = hash & (htab->n_buckets - 1);
532 	i++;
533 
534 find_first_elem:
535 	/* iterate over buckets */
536 	for (; i < htab->n_buckets; i++) {
537 		head = select_bucket(htab, i);
538 
539 		/* pick first element in the bucket */
540 		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
541 					  struct htab_elem, hash_node);
542 		if (next_l) {
543 			/* if it's not empty, just return it */
544 			memcpy(next_key, next_l->key, key_size);
545 			return 0;
546 		}
547 	}
548 
549 	/* iterated over all buckets and all elements */
550 	return -ENOENT;
551 }
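
/* Illustrative sketch, not part of this file: iterating all keys from user
 * space with BPF_MAP_GET_NEXT_KEY.  Starting from a key that is not in the
 * map makes the kernel fall through to find_first_elem above and return the
 * first key; -1 with errno == ENOENT signals the end.  Assumes <linux/bpf.h>
 * and __NR_bpf; walk_map() and its 0xff start key are made up for the example.
 */
#if 0
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int get_next_key(int map_fd, const void *key, void *next_key)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)key;
	attr.next_key = (__u64)(unsigned long)next_key;

	return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
}

static void walk_map(int map_fd, unsigned int key_size)
{
	char cur[key_size], next[key_size];

	memset(cur, 0xff, key_size);	/* hopefully not a live key */
	while (get_next_key(map_fd, cur, next) == 0) {
		/* ... look up or print 'next' here ... */
		memcpy(cur, next, key_size);
	}
}
#endif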
552 
553 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
554 {
555 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
556 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
557 	kfree(l);
558 }
559 
560 static void htab_elem_free_rcu(struct rcu_head *head)
561 {
562 	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
563 	struct bpf_htab *htab = l->htab;
564 
565 	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
566 	 * we're calling kfree, otherwise deadlock is possible if kprobes
567 	 * are placed somewhere inside of slub
568 	 */
569 	preempt_disable();
570 	__this_cpu_inc(bpf_prog_active);
571 	htab_elem_free(htab, l);
572 	__this_cpu_dec(bpf_prog_active);
573 	preempt_enable();
574 }
575 
576 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
577 {
578 	if (l->state == HTAB_EXTRA_ELEM_USED) {
579 		l->state = HTAB_EXTRA_ELEM_FREE;
580 		return;
581 	}
582 
583 	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
584 		pcpu_freelist_push(&htab->freelist, &l->fnode);
585 	} else {
586 		atomic_dec(&htab->count);
587 		l->htab = htab;
588 		call_rcu(&l->rcu, htab_elem_free_rcu);
589 	}
590 }
591 
592 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
593 			    void *value, bool onallcpus)
594 {
595 	if (!onallcpus) {
596 		/* copy only the true value_size bytes, not the rounded-up size */
597 		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
598 	} else {
599 		u32 size = round_up(htab->map.value_size, 8);
600 		int off = 0, cpu;
601 
602 		for_each_possible_cpu(cpu) {
603 			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
604 					value + off, size);
605 			off += size;
606 		}
607 	}
608 }
609 
610 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
611 					 void *value, u32 key_size, u32 hash,
612 					 bool percpu, bool onallcpus,
613 					 bool old_elem_exists)
614 {
615 	u32 size = htab->map.value_size;
616 	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
617 	struct htab_elem *l_new;
618 	void __percpu *pptr;
619 	int err = 0;
620 
621 	if (prealloc) {
622 		struct pcpu_freelist_node *l;
623 
624 		l = pcpu_freelist_pop(&htab->freelist);
625 		if (!l)
626 			err = -E2BIG;
627 		else
628 			l_new = container_of(l, struct htab_elem, fnode);
629 	} else {
630 		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
631 			atomic_dec(&htab->count);
632 			err = -E2BIG;
633 		} else {
634 			l_new = kmalloc(htab->elem_size,
635 					GFP_ATOMIC | __GFP_NOWARN);
636 			if (!l_new)
637 				return ERR_PTR(-ENOMEM);
638 		}
639 	}
640 
641 	if (err) {
642 		if (!old_elem_exists)
643 			return ERR_PTR(err);
644 
645 		/* if we're updating an existing element and the hash table
646 		 * is full, use the per-cpu extra elems
647 		 */
648 		l_new = this_cpu_ptr(htab->extra_elems);
649 		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
650 			return ERR_PTR(-E2BIG);
651 		l_new->state = HTAB_EXTRA_ELEM_USED;
652 	} else {
653 		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
654 	}
655 
656 	memcpy(l_new->key, key, key_size);
657 	if (percpu) {
658 		/* round up value_size to a multiple of 8 bytes */
659 		size = round_up(size, 8);
660 
661 		if (prealloc) {
662 			pptr = htab_elem_get_ptr(l_new, key_size);
663 		} else {
664 			/* alloc_percpu zero-fills */
665 			pptr = __alloc_percpu_gfp(size, 8,
666 						  GFP_ATOMIC | __GFP_NOWARN);
667 			if (!pptr) {
668 				kfree(l_new);
669 				return ERR_PTR(-ENOMEM);
670 			}
671 		}
672 
673 		pcpu_copy_value(htab, pptr, value, onallcpus);
674 
675 		if (!prealloc)
676 			htab_elem_set_ptr(l_new, key_size, pptr);
677 	} else {
678 		memcpy(l_new->key + round_up(key_size, 8), value, size);
679 	}
680 
681 	l_new->hash = hash;
682 	return l_new;
683 }
684 
685 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
686 		       u64 map_flags)
687 {
688 	if (l_old && map_flags == BPF_NOEXIST)
689 		/* elem already exists */
690 		return -EEXIST;
691 
692 	if (!l_old && map_flags == BPF_EXIST)
693 		/* elem doesn't exist, cannot update it */
694 		return -ENOENT;
695 
696 	return 0;
697 }
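
/* Illustrative sketch, not part of this file: the update flags that
 * check_flags() enforces, driven from user space.  BPF_ANY always updates,
 * BPF_NOEXIST fails with EEXIST if the key is already present, and BPF_EXIST
 * fails with ENOENT if it is not.  Assumes <linux/bpf.h> and __NR_bpf.
 */
#if 0
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int update_elem(int map_fd, const void *key, const void *value,
		       __u64 flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)key;
	attr.value = (__u64)(unsigned long)value;
	attr.flags = flags;	/* BPF_ANY, BPF_NOEXIST or BPF_EXIST */

	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
#endif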
698 
699 /* Called from syscall or from eBPF program */
700 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
701 				u64 map_flags)
702 {
703 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
704 	struct htab_elem *l_new = NULL, *l_old;
705 	struct hlist_nulls_head *head;
706 	unsigned long flags;
707 	struct bucket *b;
708 	u32 key_size, hash;
709 	int ret;
710 
711 	if (unlikely(map_flags > BPF_EXIST))
712 		/* unknown flags */
713 		return -EINVAL;
714 
715 	WARN_ON_ONCE(!rcu_read_lock_held());
716 
717 	key_size = map->key_size;
718 
719 	hash = htab_map_hash(key, key_size);
720 
721 	b = __select_bucket(htab, hash);
722 	head = &b->head;
723 
724 	/* bpf_map_update_elem() can be called in_irq() */
725 	raw_spin_lock_irqsave(&b->lock, flags);
726 
727 	l_old = lookup_elem_raw(head, hash, key, key_size);
728 
729 	ret = check_flags(htab, l_old, map_flags);
730 	if (ret)
731 		goto err;
732 
733 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
734 				!!l_old);
735 	if (IS_ERR(l_new)) {
736 		/* all pre-allocated elements are in use or memory exhausted */
737 		ret = PTR_ERR(l_new);
738 		goto err;
739 	}
740 
741 	/* add new element to the head of the list, so that
742 	 * concurrent search will find it before old elem
743 	 */
744 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
745 	if (l_old) {
746 		hlist_nulls_del_rcu(&l_old->hash_node);
747 		free_htab_elem(htab, l_old);
748 	}
749 	ret = 0;
750 err:
751 	raw_spin_unlock_irqrestore(&b->lock, flags);
752 	return ret;
753 }
754 
755 static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
756 				    u64 map_flags)
757 {
758 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
759 	struct htab_elem *l_new, *l_old = NULL;
760 	struct hlist_nulls_head *head;
761 	unsigned long flags;
762 	struct bucket *b;
763 	u32 key_size, hash;
764 	int ret;
765 
766 	if (unlikely(map_flags > BPF_EXIST))
767 		/* unknown flags */
768 		return -EINVAL;
769 
770 	WARN_ON_ONCE(!rcu_read_lock_held());
771 
772 	key_size = map->key_size;
773 
774 	hash = htab_map_hash(key, key_size);
775 
776 	b = __select_bucket(htab, hash);
777 	head = &b->head;
778 
779 	/* For LRU, we need to alloc before taking bucket's
780 	 * spinlock because getting free nodes from LRU may need
781 	 * to remove older elements from htab and this removal
782 	 * operation will need a bucket lock.
783 	 */
784 	l_new = prealloc_lru_pop(htab, key, hash);
785 	if (!l_new)
786 		return -ENOMEM;
787 	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
788 
789 	/* bpf_map_update_elem() can be called in_irq() */
790 	raw_spin_lock_irqsave(&b->lock, flags);
791 
792 	l_old = lookup_elem_raw(head, hash, key, key_size);
793 
794 	ret = check_flags(htab, l_old, map_flags);
795 	if (ret)
796 		goto err;
797 
798 	/* add new element to the head of the list, so that
799 	 * concurrent search will find it before old elem
800 	 */
801 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
802 	if (l_old) {
803 		bpf_lru_node_set_ref(&l_new->lru_node);
804 		hlist_nulls_del_rcu(&l_old->hash_node);
805 	}
806 	ret = 0;
807 
808 err:
809 	raw_spin_unlock_irqrestore(&b->lock, flags);
810 
811 	if (ret)
812 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
813 	else if (l_old)
814 		bpf_lru_push_free(&htab->lru, &l_old->lru_node);
815 
816 	return ret;
817 }
818 
819 static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
820 					 void *value, u64 map_flags,
821 					 bool onallcpus)
822 {
823 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
824 	struct htab_elem *l_new = NULL, *l_old;
825 	struct hlist_nulls_head *head;
826 	unsigned long flags;
827 	struct bucket *b;
828 	u32 key_size, hash;
829 	int ret;
830 
831 	if (unlikely(map_flags > BPF_EXIST))
832 		/* unknown flags */
833 		return -EINVAL;
834 
835 	WARN_ON_ONCE(!rcu_read_lock_held());
836 
837 	key_size = map->key_size;
838 
839 	hash = htab_map_hash(key, key_size);
840 
841 	b = __select_bucket(htab, hash);
842 	head = &b->head;
843 
844 	/* bpf_map_update_elem() can be called in_irq() */
845 	raw_spin_lock_irqsave(&b->lock, flags);
846 
847 	l_old = lookup_elem_raw(head, hash, key, key_size);
848 
849 	ret = check_flags(htab, l_old, map_flags);
850 	if (ret)
851 		goto err;
852 
853 	if (l_old) {
854 		/* per-cpu hash map can update value in-place */
855 		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
856 				value, onallcpus);
857 	} else {
858 		l_new = alloc_htab_elem(htab, key, value, key_size,
859 					hash, true, onallcpus, false);
860 		if (IS_ERR(l_new)) {
861 			ret = PTR_ERR(l_new);
862 			goto err;
863 		}
864 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
865 	}
866 	ret = 0;
867 err:
868 	raw_spin_unlock_irqrestore(&b->lock, flags);
869 	return ret;
870 }
871 
872 static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
873 					     void *value, u64 map_flags,
874 					     bool onallcpus)
875 {
876 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
877 	struct htab_elem *l_new = NULL, *l_old;
878 	struct hlist_nulls_head *head;
879 	unsigned long flags;
880 	struct bucket *b;
881 	u32 key_size, hash;
882 	int ret;
883 
884 	if (unlikely(map_flags > BPF_EXIST))
885 		/* unknown flags */
886 		return -EINVAL;
887 
888 	WARN_ON_ONCE(!rcu_read_lock_held());
889 
890 	key_size = map->key_size;
891 
892 	hash = htab_map_hash(key, key_size);
893 
894 	b = __select_bucket(htab, hash);
895 	head = &b->head;
896 
897 	/* For LRU, we need to alloc before taking bucket's
898 	 * spinlock because LRU's elem alloc may need
899 	 * to remove older elem from htab and this removal
900 	 * operation will need a bucket lock.
901 	 */
902 	if (map_flags != BPF_EXIST) {
903 		l_new = prealloc_lru_pop(htab, key, hash);
904 		if (!l_new)
905 			return -ENOMEM;
906 	}
907 
908 	/* bpf_map_update_elem() can be called in_irq() */
909 	raw_spin_lock_irqsave(&b->lock, flags);
910 
911 	l_old = lookup_elem_raw(head, hash, key, key_size);
912 
913 	ret = check_flags(htab, l_old, map_flags);
914 	if (ret)
915 		goto err;
916 
917 	if (l_old) {
918 		bpf_lru_node_set_ref(&l_old->lru_node);
919 
920 		/* per-cpu hash map can update value in-place */
921 		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
922 				value, onallcpus);
923 	} else {
924 		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
925 				value, onallcpus);
926 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
927 		l_new = NULL;
928 	}
929 	ret = 0;
930 err:
931 	raw_spin_unlock_irqrestore(&b->lock, flags);
932 	if (l_new)
933 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
934 	return ret;
935 }
936 
937 static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
938 				       void *value, u64 map_flags)
939 {
940 	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
941 }
942 
943 static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
944 					   void *value, u64 map_flags)
945 {
946 	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
947 						 false);
948 }
949 
950 /* Called from syscall or from eBPF program */
951 static int htab_map_delete_elem(struct bpf_map *map, void *key)
952 {
953 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
954 	struct hlist_nulls_head *head;
955 	struct bucket *b;
956 	struct htab_elem *l;
957 	unsigned long flags;
958 	u32 hash, key_size;
959 	int ret = -ENOENT;
960 
961 	WARN_ON_ONCE(!rcu_read_lock_held());
962 
963 	key_size = map->key_size;
964 
965 	hash = htab_map_hash(key, key_size);
966 	b = __select_bucket(htab, hash);
967 	head = &b->head;
968 
969 	raw_spin_lock_irqsave(&b->lock, flags);
970 
971 	l = lookup_elem_raw(head, hash, key, key_size);
972 
973 	if (l) {
974 		hlist_nulls_del_rcu(&l->hash_node);
975 		free_htab_elem(htab, l);
976 		ret = 0;
977 	}
978 
979 	raw_spin_unlock_irqrestore(&b->lock, flags);
980 	return ret;
981 }
982 
983 static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
984 {
985 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
986 	struct hlist_nulls_head *head;
987 	struct bucket *b;
988 	struct htab_elem *l;
989 	unsigned long flags;
990 	u32 hash, key_size;
991 	int ret = -ENOENT;
992 
993 	WARN_ON_ONCE(!rcu_read_lock_held());
994 
995 	key_size = map->key_size;
996 
997 	hash = htab_map_hash(key, key_size);
998 	b = __select_bucket(htab, hash);
999 	head = &b->head;
1000 
1001 	raw_spin_lock_irqsave(&b->lock, flags);
1002 
1003 	l = lookup_elem_raw(head, hash, key, key_size);
1004 
1005 	if (l) {
1006 		hlist_nulls_del_rcu(&l->hash_node);
1007 		ret = 0;
1008 	}
1009 
1010 	raw_spin_unlock_irqrestore(&b->lock, flags);
1011 	if (l)
1012 		bpf_lru_push_free(&htab->lru, &l->lru_node);
1013 	return ret;
1014 }
1015 
1016 static void delete_all_elements(struct bpf_htab *htab)
1017 {
1018 	int i;
1019 
1020 	for (i = 0; i < htab->n_buckets; i++) {
1021 		struct hlist_nulls_head *head = select_bucket(htab, i);
1022 		struct hlist_nulls_node *n;
1023 		struct htab_elem *l;
1024 
1025 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1026 			hlist_nulls_del_rcu(&l->hash_node);
1027 			if (l->state != HTAB_EXTRA_ELEM_USED)
1028 				htab_elem_free(htab, l);
1029 		}
1030 	}
1031 }
1032 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1033 static void htab_map_free(struct bpf_map *map)
1034 {
1035 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1036 
1037 	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
1038 	 * so the programs (there can be more than one using this map) were
1039 	 * disconnected from events. Wait for outstanding critical sections in
1040 	 * these programs to complete
1041 	 */
1042 	synchronize_rcu();
1043 
1044 	/* some of the free_htab_elem() callbacks for elements of this map may
1045 	 * not have executed. Wait for them.
1046 	 */
1047 	rcu_barrier();
1048 	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
1049 		delete_all_elements(htab);
1050 	else
1051 		prealloc_destroy(htab);
1052 
1053 	free_percpu(htab->extra_elems);
1054 	bpf_map_area_free(htab->buckets);
1055 	kfree(htab);
1056 }
1057 
1058 static const struct bpf_map_ops htab_ops = {
1059 	.map_alloc = htab_map_alloc,
1060 	.map_free = htab_map_free,
1061 	.map_get_next_key = htab_map_get_next_key,
1062 	.map_lookup_elem = htab_map_lookup_elem,
1063 	.map_update_elem = htab_map_update_elem,
1064 	.map_delete_elem = htab_map_delete_elem,
1065 };
1066 
1067 static struct bpf_map_type_list htab_type __ro_after_init = {
1068 	.ops = &htab_ops,
1069 	.type = BPF_MAP_TYPE_HASH,
1070 };
1071 
1072 static const struct bpf_map_ops htab_lru_ops = {
1073 	.map_alloc = htab_map_alloc,
1074 	.map_free = htab_map_free,
1075 	.map_get_next_key = htab_map_get_next_key,
1076 	.map_lookup_elem = htab_lru_map_lookup_elem,
1077 	.map_update_elem = htab_lru_map_update_elem,
1078 	.map_delete_elem = htab_lru_map_delete_elem,
1079 };
1080 
1081 static struct bpf_map_type_list htab_lru_type __ro_after_init = {
1082 	.ops = &htab_lru_ops,
1083 	.type = BPF_MAP_TYPE_LRU_HASH,
1084 };
1085 
1086 /* Called from eBPF program */
1087 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
1088 {
1089 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
1090 
1091 	if (l)
1092 		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
1093 	else
1094 		return NULL;
1095 }
1096 
1097 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
1098 {
1099 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
1100 
1101 	if (l) {
1102 		bpf_lru_node_set_ref(&l->lru_node);
1103 		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
1104 	}
1105 
1106 	return NULL;
1107 }
1108 
1109 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
1110 {
1111 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1112 	struct htab_elem *l;
1113 	void __percpu *pptr;
1114 	int ret = -ENOENT;
1115 	int cpu, off = 0;
1116 	u32 size;
1117 
1118 	/* per_cpu areas are zero-filled and bpf programs can only
1119 	 * access 'value_size' of them, so copying rounded areas
1120 	 * will not leak any kernel data
1121 	 */
1122 	size = round_up(map->value_size, 8);
1123 	rcu_read_lock();
1124 	l = __htab_map_lookup_elem(map, key);
1125 	if (!l)
1126 		goto out;
1127 	if (htab_is_lru(htab))
1128 		bpf_lru_node_set_ref(&l->lru_node);
1129 	pptr = htab_elem_get_ptr(l, map->key_size);
1130 	for_each_possible_cpu(cpu) {
1131 		bpf_long_memcpy(value + off,
1132 				per_cpu_ptr(pptr, cpu), size);
1133 		off += size;
1134 	}
1135 	ret = 0;
1136 out:
1137 	rcu_read_unlock();
1138 	return ret;
1139 }
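
/* Illustrative sketch, not part of this file: reading a per-cpu value from
 * user space.  As bpf_percpu_hash_copy() above shows, the kernel writes one
 * round_up(value_size, 8)-sized slot per possible CPU, so the buffer must be
 * sized for all possible CPUs (e.g. parsed from
 * /sys/devices/system/cpu/possible), not just the online ones.  Assumes
 * <linux/bpf.h> and __NR_bpf; num_possible_cpus_from_sysfs() is a placeholder.
 */
#if 0
#include <linux/bpf.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int lookup_percpu_elem(int map_fd, const void *key, void *values)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)key;
	attr.value = (__u64)(unsigned long)values;

	return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}

/* caller does something like:
 *	size_t slot = (value_size + 7) & ~7UL;
 *	void *buf = calloc(num_possible_cpus_from_sysfs(), slot);
 *	lookup_percpu_elem(map_fd, &key, buf);
 *	// value for possible CPU i starts at buf + i * slot
 */
#endif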
1140 
1141 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
1142 			   u64 map_flags)
1143 {
1144 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1145 	int ret;
1146 
1147 	rcu_read_lock();
1148 	if (htab_is_lru(htab))
1149 		ret = __htab_lru_percpu_map_update_elem(map, key, value,
1150 							map_flags, true);
1151 	else
1152 		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
1153 						    true);
1154 	rcu_read_unlock();
1155 
1156 	return ret;
1157 }
1158 
1159 static const struct bpf_map_ops htab_percpu_ops = {
1160 	.map_alloc = htab_map_alloc,
1161 	.map_free = htab_map_free,
1162 	.map_get_next_key = htab_map_get_next_key,
1163 	.map_lookup_elem = htab_percpu_map_lookup_elem,
1164 	.map_update_elem = htab_percpu_map_update_elem,
1165 	.map_delete_elem = htab_map_delete_elem,
1166 };
1167 
1168 static struct bpf_map_type_list htab_percpu_type __ro_after_init = {
1169 	.ops = &htab_percpu_ops,
1170 	.type = BPF_MAP_TYPE_PERCPU_HASH,
1171 };
1172 
1173 static const struct bpf_map_ops htab_lru_percpu_ops = {
1174 	.map_alloc = htab_map_alloc,
1175 	.map_free = htab_map_free,
1176 	.map_get_next_key = htab_map_get_next_key,
1177 	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
1178 	.map_update_elem = htab_lru_percpu_map_update_elem,
1179 	.map_delete_elem = htab_lru_map_delete_elem,
1180 };
1181 
1182 static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
1183 	.ops = &htab_lru_percpu_ops,
1184 	.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
1185 };
1186 
1187 static int __init register_htab_map(void)
1188 {
1189 	bpf_register_map_type(&htab_type);
1190 	bpf_register_map_type(&htab_percpu_type);
1191 	bpf_register_map_type(&htab_lru_type);
1192 	bpf_register_map_type(&htab_lru_percpu_type);
1193 	return 0;
1194 }
1195 late_initcall(register_htab_map);
1196