xref: /linux/kernel/bpf/hashtab.c (revision 23b0f90ba871f096474e1c27c3d14f455189d2d9)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  * Copyright (c) 2016 Facebook
4  */
5 #include <linux/bpf.h>
6 #include <linux/btf.h>
7 #include <linux/jhash.h>
8 #include <linux/filter.h>
9 #include <linux/rculist_nulls.h>
10 #include <linux/rcupdate_wait.h>
11 #include <linux/random.h>
12 #include <uapi/linux/btf.h>
13 #include <linux/rcupdate_trace.h>
14 #include <linux/btf_ids.h>
15 #include "percpu_freelist.h"
16 #include "bpf_lru_list.h"
17 #include "map_in_map.h"
18 #include <linux/bpf_mem_alloc.h>
19 #include <asm/rqspinlock.h>
20 
21 #define HTAB_CREATE_FLAG_MASK						\
22 	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
23 	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
24 
25 #define BATCH_OPS(_name)			\
26 	.map_lookup_batch =			\
27 	_name##_map_lookup_batch,		\
28 	.map_lookup_and_delete_batch =		\
29 	_name##_map_lookup_and_delete_batch,	\
30 	.map_update_batch =			\
31 	generic_map_update_batch,		\
32 	.map_delete_batch =			\
33 	generic_map_delete_batch
34 
35 /*
36  * The bucket lock has two protection scopes:
37  *
38  * 1) Serializing concurrent operations from BPF programs on different
39  *    CPUs
40  *
41  * 2) Serializing concurrent operations from BPF programs and sys_bpf()
42  *
43  * BPF programs can execute in any context, including perf, kprobes and
44  * tracing. As there are almost no limits on where perf, kprobes and
45  * tracing can be invoked from, the lock operations need to be protected
46  * against deadlocks. Deadlocks can be caused by recursion and by an
47  * invocation in the lock held section when functions which acquire this
48  * lock are invoked from sys_bpf(). BPF recursion is prevented by
49  * incrementing the per-CPU variable bpf_prog_active, which prevents BPF
50  * programs attached to perf events, kprobes and tracing from being invoked
51  * before the prior invocation from one of these contexts has completed.
52  * sys_bpf() uses the same mechanism by pinning the task to the current CPU
53  * and incrementing the recursion protection across the map operation.
54  *
55  * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
56  * operations like memory allocations (even with GFP_ATOMIC) from atomic
57  * contexts. This is required because even with GFP_ATOMIC the memory
58  * allocator calls into code paths which acquire locks with long held lock
59  * sections. To ensure deterministic behaviour, these locks are regular
60  * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
61  * true atomic contexts on an RT kernel are the low level hardware
62  * handling, scheduling, low level interrupt handling, NMIs etc. None of
63  * these contexts should ever do memory allocations.
64  *
65  * As regular device interrupt handlers and soft interrupts are forced into
66  * thread context, the existing code which does
67  *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
68  * just works.
69  *
70  * In theory the BPF locks could be converted to regular spinlocks as well,
71  * but the bucket locks and percpu_freelist locks can be taken from
72  * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
73  * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
74  * it was only safe to use a raw spinlock for a preallocated hash map on an
75  * RT kernel, because there was no memory allocation within the lock held
76  * sections. However, after the hash map was fully converted to use
77  * bpf_mem_alloc, memory allocation for a non-preallocated hash map is never
78  * synchronous, so it is always safe to use a raw spinlock for the bucket lock.
79  */
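
/*
 * A minimal sketch of the recursion protection mentioned above (an
 * illustration only, blending the helpers used by the tracing and syscall
 * paths; the exact calls differ across kernel versions):
 *
 *	migrate_disable();
 *	if (__this_cpu_inc_return(bpf_prog_active) != 1)
 *		goto out;	(another invocation is active on this CPU)
 *	... run the BPF program or the map operation ...
 * out:
 *	__this_cpu_dec(bpf_prog_active);
 *	migrate_enable();
 */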
80 struct bucket {
81 	struct hlist_nulls_head head;
82 	rqspinlock_t raw_lock;
83 };
84 
85 struct bpf_htab {
86 	struct bpf_map map;
87 	struct bpf_mem_alloc ma;
88 	struct bpf_mem_alloc pcpu_ma;
89 	struct bucket *buckets;
90 	void *elems;
91 	union {
92 		struct pcpu_freelist freelist;
93 		struct bpf_lru lru;
94 	};
95 	struct htab_elem *__percpu *extra_elems;
96 	/* The number of elements in a non-preallocated hashtable is kept
97 	 * in either pcount or count.
98 	 */
99 	struct percpu_counter pcount;
100 	atomic_t count;
101 	bool use_percpu_counter;
102 	u32 n_buckets;	/* number of hash buckets */
103 	u32 elem_size;	/* size of each element in bytes */
104 	u32 hashrnd;
105 };
106 
107 /* each htab element is struct htab_elem + key + value */
108 struct htab_elem {
109 	union {
110 		struct hlist_nulls_node hash_node;
111 		struct {
112 			void *padding;
113 			union {
114 				struct pcpu_freelist_node fnode;
115 				struct htab_elem *batch_flink;
116 			};
117 		};
118 	};
119 	union {
120 		/* pointer to per-cpu pointer */
121 		void *ptr_to_pptr;
122 		struct bpf_lru_node lru_node;
123 	};
124 	u32 hash;
125 	char key[] __aligned(8);
126 };
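
/*
 * Rough element layout, derived from the elem_size computation in
 * htab_map_alloc() and from htab_elem_value() below (an illustration, not
 * literal code):
 *
 *	struct htab_elem | key, rounded up to 8 bytes | value area
 *
 * For regular maps the value area holds the value itself (rounded up to 8
 * bytes); for per-CPU maps it holds a __percpu pointer to the real values;
 * for fd maps (hash of maps) it holds the inner map pointer.
 */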
127 
128 static inline bool htab_is_prealloc(const struct bpf_htab *htab)
129 {
130 	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
131 }
132 
133 static void htab_init_buckets(struct bpf_htab *htab)
134 {
135 	unsigned int i;
136 
137 	for (i = 0; i < htab->n_buckets; i++) {
138 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
139 		raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
140 		cond_resched();
141 	}
142 }
143 
144 static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
145 {
146 	unsigned long flags;
147 	int ret;
148 
149 	ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags);
150 	if (ret)
151 		return ret;
152 	*pflags = flags;
153 	return 0;
154 }
155 
156 static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags)
157 {
158 	raw_res_spin_unlock_irqrestore(&b->raw_lock, flags);
159 }
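
/*
 * A minimal sketch of how the update/delete paths below use the bucket
 * lock (error handling trimmed; see htab_map_delete_elem() for the real
 * thing):
 *
 *	struct bucket *b = __select_bucket(htab, hash);
 *	unsigned long flags;
 *
 *	ret = htab_lock_bucket(b, &flags);
 *	if (ret)
 *		return ret;	(rqspinlock acquisition can fail)
 *	l = lookup_elem_raw(&b->head, hash, key, key_size);
 *	... add/remove elements on b->head ...
 *	htab_unlock_bucket(b, flags);
 */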
160 
161 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
162 
163 static bool htab_is_lru(const struct bpf_htab *htab)
164 {
165 	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
166 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
167 }
168 
169 static bool htab_is_percpu(const struct bpf_htab *htab)
170 {
171 	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
172 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
173 }
174 
175 static inline bool is_fd_htab(const struct bpf_htab *htab)
176 {
177 	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
178 }
179 
180 static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
181 {
182 	return l->key + round_up(key_size, 8);
183 }
184 
185 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
186 				     void __percpu *pptr)
187 {
188 	*(void __percpu **)htab_elem_value(l, key_size) = pptr;
189 }
190 
191 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
192 {
193 	return *(void __percpu **)htab_elem_value(l, key_size);
194 }
195 
196 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
197 {
198 	return *(void **)htab_elem_value(l, map->key_size);
199 }
200 
201 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
202 {
203 	return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
204 }
205 
206 /* Both percpu and fd htab support in-place update, so there is no need
207  * for an extra elem. LRU itself can remove the least used element, so
208  * there is no need for an extra elem during map_update either.
209  */
210 static bool htab_has_extra_elems(struct bpf_htab *htab)
211 {
212 	return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
213 }
214 
215 static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
216 {
217 	u32 num_entries = htab->map.max_entries;
218 	int i;
219 
220 	if (htab_has_extra_elems(htab))
221 		num_entries += num_possible_cpus();
222 
223 	for (i = 0; i < num_entries; i++) {
224 		struct htab_elem *elem;
225 
226 		elem = get_htab_elem(htab, i);
227 		bpf_map_free_internal_structs(&htab->map,
228 					      htab_elem_value(elem, htab->map.key_size));
229 		cond_resched();
230 	}
231 }
232 
233 static void htab_free_prealloced_fields(struct bpf_htab *htab)
234 {
235 	u32 num_entries = htab->map.max_entries;
236 	int i;
237 
238 	if (IS_ERR_OR_NULL(htab->map.record))
239 		return;
240 	if (htab_has_extra_elems(htab))
241 		num_entries += num_possible_cpus();
242 	for (i = 0; i < num_entries; i++) {
243 		struct htab_elem *elem;
244 
245 		elem = get_htab_elem(htab, i);
246 		if (htab_is_percpu(htab)) {
247 			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
248 			int cpu;
249 
250 			for_each_possible_cpu(cpu) {
251 				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
252 				cond_resched();
253 			}
254 		} else {
255 			bpf_obj_free_fields(htab->map.record,
256 					    htab_elem_value(elem, htab->map.key_size));
257 			cond_resched();
258 		}
259 		cond_resched();
260 	}
261 }
262 
263 static void htab_free_elems(struct bpf_htab *htab)
264 {
265 	int i;
266 
267 	if (!htab_is_percpu(htab))
268 		goto free_elems;
269 
270 	for (i = 0; i < htab->map.max_entries; i++) {
271 		void __percpu *pptr;
272 
273 		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
274 					 htab->map.key_size);
275 		free_percpu(pptr);
276 		cond_resched();
277 	}
278 free_elems:
279 	bpf_map_area_free(htab->elems);
280 }
281 
282 /* The LRU list has a lock (lru_lock). Each htab bucket has a lock
283  * (bucket_lock). If both locks need to be acquired together, the lock
284  * order is always lru_lock -> bucket_lock, and this only happens in
285  * bpf_lru_list.c logic. For example, certain code paths of
286  * bpf_lru_pop_free(), which is called by prealloc_lru_pop(), acquire
287  * lru_lock first and then bucket_lock.
288  *
289  * In hashtab.c, to avoid deadlock, acquiring bucket_lock followed by
290  * lru_lock is not allowed. In such cases, bucket_lock needs to be
291  * released first before acquiring lru_lock.
292  */
293 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
294 					  u32 hash)
295 {
296 	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
297 	struct htab_elem *l;
298 
299 	if (node) {
300 		bpf_map_inc_elem_count(&htab->map);
301 		l = container_of(node, struct htab_elem, lru_node);
302 		memcpy(l->key, key, htab->map.key_size);
303 		return l;
304 	}
305 
306 	return NULL;
307 }
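
/*
 * The LRU update and delete paths below respect the ordering rule above by
 * keeping the LRU work outside of the bucket lock: prealloc_lru_pop() is
 * called before htab_lock_bucket(), and htab_lru_push_free() is called only
 * after htab_unlock_bucket().
 */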
308 
309 static int prealloc_init(struct bpf_htab *htab)
310 {
311 	u32 num_entries = htab->map.max_entries;
312 	int err = -ENOMEM, i;
313 
314 	if (htab_has_extra_elems(htab))
315 		num_entries += num_possible_cpus();
316 
317 	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
318 					 htab->map.numa_node);
319 	if (!htab->elems)
320 		return -ENOMEM;
321 
322 	if (!htab_is_percpu(htab))
323 		goto skip_percpu_elems;
324 
325 	for (i = 0; i < num_entries; i++) {
326 		u32 size = round_up(htab->map.value_size, 8);
327 		void __percpu *pptr;
328 
329 		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
330 					    GFP_USER | __GFP_NOWARN);
331 		if (!pptr)
332 			goto free_elems;
333 		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
334 				  pptr);
335 		cond_resched();
336 	}
337 
338 skip_percpu_elems:
339 	if (htab_is_lru(htab))
340 		err = bpf_lru_init(&htab->lru,
341 				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
342 				   offsetof(struct htab_elem, hash) -
343 				   offsetof(struct htab_elem, lru_node),
344 				   htab_lru_map_delete_node,
345 				   htab);
346 	else
347 		err = pcpu_freelist_init(&htab->freelist);
348 
349 	if (err)
350 		goto free_elems;
351 
352 	if (htab_is_lru(htab))
353 		bpf_lru_populate(&htab->lru, htab->elems,
354 				 offsetof(struct htab_elem, lru_node),
355 				 htab->elem_size, num_entries);
356 	else
357 		pcpu_freelist_populate(&htab->freelist,
358 				       htab->elems + offsetof(struct htab_elem, fnode),
359 				       htab->elem_size, num_entries);
360 
361 	return 0;
362 
363 free_elems:
364 	htab_free_elems(htab);
365 	return err;
366 }
367 
368 static void prealloc_destroy(struct bpf_htab *htab)
369 {
370 	htab_free_elems(htab);
371 
372 	if (htab_is_lru(htab))
373 		bpf_lru_destroy(&htab->lru);
374 	else
375 		pcpu_freelist_destroy(&htab->freelist);
376 }
377 
378 static int alloc_extra_elems(struct bpf_htab *htab)
379 {
380 	struct htab_elem *__percpu *pptr, *l_new;
381 	struct pcpu_freelist_node *l;
382 	int cpu;
383 
384 	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
385 				    GFP_USER | __GFP_NOWARN);
386 	if (!pptr)
387 		return -ENOMEM;
388 
389 	for_each_possible_cpu(cpu) {
390 		l = pcpu_freelist_pop(&htab->freelist);
391 		/* pop will succeed, since prealloc_init()
392 		 * preallocated num_possible_cpus() extra elements
393 		 */
394 		l_new = container_of(l, struct htab_elem, fnode);
395 		*per_cpu_ptr(pptr, cpu) = l_new;
396 	}
397 	htab->extra_elems = pptr;
398 	return 0;
399 }
400 
401 /* Called from syscall */
402 static int htab_map_alloc_check(union bpf_attr *attr)
403 {
404 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
405 		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
406 	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
407 		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
408 	/* percpu_lru means each cpu has its own LRU list.
409 	 * It is different from BPF_MAP_TYPE_PERCPU_HASH, where
410 	 * the map's value itself is percpu. percpu_lru has
411 	 * nothing to do with the map's value.
412 	 */
413 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
414 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
415 	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
416 	int numa_node = bpf_map_attr_numa_node(attr);
417 
418 	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
419 		     offsetof(struct htab_elem, hash_node.pprev));
420 
421 	if (zero_seed && !capable(CAP_SYS_ADMIN))
422 		/* Guard against local DoS, and discourage production use. */
423 		return -EPERM;
424 
425 	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
426 	    !bpf_map_flags_access_ok(attr->map_flags))
427 		return -EINVAL;
428 
429 	if (!lru && percpu_lru)
430 		return -EINVAL;
431 
432 	if (lru && !prealloc)
433 		return -ENOTSUPP;
434 
435 	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
436 		return -EINVAL;
437 
438 	/* check sanity of attributes.
439 	 * value_size == 0 may be allowed in the future to use map as a set
440 	 */
441 	if (attr->max_entries == 0 || attr->key_size == 0 ||
442 	    attr->value_size == 0)
443 		return -EINVAL;
444 
445 	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
446 	   sizeof(struct htab_elem))
447 		/* if key_size + value_size is bigger, user space won't be
448 		 * able to access the elements via the bpf syscall. This check
449 		 * also makes sure that elem_size doesn't overflow and is
450 		 * kmalloc-able later in htab_map_update_elem().
451 		 */
452 		return -E2BIG;
453 	/* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
454 	if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
455 		return -E2BIG;
456 
457 	return 0;
458 }
459 
460 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
461 {
462 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
463 		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
464 	/* percpu_lru means each cpu has its own LRU list.
465 	 * It is different from BPF_MAP_TYPE_PERCPU_HASH, where
466 	 * the map's value itself is percpu. percpu_lru has
467 	 * nothing to do with the map's value.
468 	 */
469 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
470 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
471 	struct bpf_htab *htab;
472 	int err;
473 
474 	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
475 	if (!htab)
476 		return ERR_PTR(-ENOMEM);
477 
478 	bpf_map_init_from_attr(&htab->map, attr);
479 
480 	if (percpu_lru) {
481 		/* ensure each CPU's LRU list has >= 1 element.
482 		 * While we are at it, make each LRU list have the same
483 		 * number of elements.
484 		 */
485 		htab->map.max_entries = roundup(attr->max_entries,
486 						num_possible_cpus());
487 		if (htab->map.max_entries < attr->max_entries)
488 			htab->map.max_entries = rounddown(attr->max_entries,
489 							  num_possible_cpus());
490 	}
491 
492 	/* hash table size must be a power of 2; roundup_pow_of_two() can overflow
493 	 * into UB on 32-bit arches, so check that first
494 	 */
495 	err = -E2BIG;
496 	if (htab->map.max_entries > 1UL << 31)
497 		goto free_htab;
498 
499 	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
500 
501 	htab->elem_size = sizeof(struct htab_elem) +
502 			  round_up(htab->map.key_size, 8);
503 	if (percpu)
504 		htab->elem_size += sizeof(void *);
505 	else
506 		htab->elem_size += round_up(htab->map.value_size, 8);
507 
508 	/* check for u32 overflow */
509 	if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
510 		goto free_htab;
511 
512 	err = bpf_map_init_elem_count(&htab->map);
513 	if (err)
514 		goto free_htab;
515 
516 	err = -ENOMEM;
517 	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
518 					   sizeof(struct bucket),
519 					   htab->map.numa_node);
520 	if (!htab->buckets)
521 		goto free_elem_count;
522 
523 	if (htab->map.map_flags & BPF_F_ZERO_SEED)
524 		htab->hashrnd = 0;
525 	else
526 		htab->hashrnd = get_random_u32();
527 
528 	htab_init_buckets(htab);
529 
530 /* compute_batch_value() computes the batch value as num_online_cpus() * 2
531  * and __percpu_counter_compare() needs
532  * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
533  * for percpu_counter to be faster than atomic_t. In practice the average bpf
534  * hash map size is 10k, which means that a system with 64 cpus will fill the
535  * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
536  * define our own batch count as 32, so a 10k hash map can be filled up to 80%:
537  * 10k - 8k > 32 _batch_ * 64 _cpus_
538  * and __percpu_counter_compare() will still be fast. At that point hash map
539  * collisions will dominate its performance anyway. Assume that a hash map
540  * filled to 50+% isn't going to be O(1) and use the following formula to
541  * choose between percpu_counter and atomic_t.
542  */
543 #define PERCPU_COUNTER_BATCH 32
544 	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
545 		htab->use_percpu_counter = true;
546 
547 	if (htab->use_percpu_counter) {
548 		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
549 		if (err)
550 			goto free_map_locked;
551 	}
552 
553 	if (prealloc) {
554 		err = prealloc_init(htab);
555 		if (err)
556 			goto free_map_locked;
557 
558 		if (htab_has_extra_elems(htab)) {
559 			err = alloc_extra_elems(htab);
560 			if (err)
561 				goto free_prealloc;
562 		}
563 	} else {
564 		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
565 		if (err)
566 			goto free_map_locked;
567 		if (percpu) {
568 			err = bpf_mem_alloc_init(&htab->pcpu_ma,
569 						 round_up(htab->map.value_size, 8), true);
570 			if (err)
571 				goto free_map_locked;
572 		}
573 	}
574 
575 	return &htab->map;
576 
577 free_prealloc:
578 	prealloc_destroy(htab);
579 free_map_locked:
580 	if (htab->use_percpu_counter)
581 		percpu_counter_destroy(&htab->pcount);
582 	bpf_map_area_free(htab->buckets);
583 	bpf_mem_alloc_destroy(&htab->pcpu_ma);
584 	bpf_mem_alloc_destroy(&htab->ma);
585 free_elem_count:
586 	bpf_map_free_elem_count(&htab->map);
587 free_htab:
588 	bpf_map_area_free(htab);
589 	return ERR_PTR(err);
590 }
591 
592 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
593 {
594 	if (likely(key_len % 4 == 0))
595 		return jhash2(key, key_len / 4, hashrnd);
596 	return jhash(key, key_len, hashrnd);
597 }
598 
599 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
600 {
601 	return &htab->buckets[hash & (htab->n_buckets - 1)];
602 }
603 
604 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
605 {
606 	return &__select_bucket(htab, hash)->head;
607 }
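
/*
 * n_buckets is always a power of two (see htab_map_alloc()), so the mask in
 * __select_bucket() is equivalent to hash % n_buckets without a division.
 */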
608 
609 /* this lookup function can only be called with bucket lock taken */
610 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
611 					 void *key, u32 key_size)
612 {
613 	struct hlist_nulls_node *n;
614 	struct htab_elem *l;
615 
616 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
617 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
618 			return l;
619 
620 	return NULL;
621 }
622 
623 /* Can be called without the bucket lock. It will repeat the loop in
624  * the unlikely event that elements moved from one bucket into another
625  * while the linked list is being walked.
626  */
627 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
628 					       u32 hash, void *key,
629 					       u32 key_size, u32 n_buckets)
630 {
631 	struct hlist_nulls_node *n;
632 	struct htab_elem *l;
633 
634 again:
635 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
636 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
637 			return l;
638 
639 	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
640 		goto again;
641 
642 	return NULL;
643 }
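
/*
 * The retry above works because each bucket's nulls value encodes the bucket
 * index (see htab_init_buckets()): if a lockless walk ends on a nulls marker
 * that does not match the expected bucket, the element being traversed was
 * freed and reused in another bucket, so the walk is restarted.
 */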
644 
645 /* Called from syscall or from eBPF program directly, so
646  * arguments have to match bpf_map_lookup_elem() exactly.
647  * The return value is adjusted by BPF instructions
648  * in htab_map_gen_lookup().
649  */
650 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
651 {
652 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
653 	struct hlist_nulls_head *head;
654 	struct htab_elem *l;
655 	u32 hash, key_size;
656 
657 	WARN_ON_ONCE(!bpf_rcu_lock_held());
658 
659 	key_size = map->key_size;
660 
661 	hash = htab_map_hash(key, key_size, htab->hashrnd);
662 
663 	head = select_bucket(htab, hash);
664 
665 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
666 
667 	return l;
668 }
669 
670 static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
671 {
672 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
673 
674 	if (l)
675 		return htab_elem_value(l, map->key_size);
676 
677 	return NULL;
678 }
679 
680 /* inline bpf_map_lookup_elem() call.
681  * Instead of:
682  * bpf_prog
683  *   bpf_map_lookup_elem
684  *     map->ops->map_lookup_elem
685  *       htab_map_lookup_elem
686  *         __htab_map_lookup_elem
687  * do:
688  * bpf_prog
689  *   __htab_map_lookup_elem
690  */
691 static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
692 {
693 	struct bpf_insn *insn = insn_buf;
694 	const int ret = BPF_REG_0;
695 
696 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
697 		     (void *(*)(struct bpf_map *map, void *key))NULL));
698 	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
699 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
700 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
701 				offsetof(struct htab_elem, key) +
702 				round_up(map->key_size, 8));
703 	return insn - insn_buf;
704 }
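
/*
 * The emitted sequence is roughly (C-like pseudocode, R0 being the BPF
 * return value register):
 *
 *	R0 = __htab_map_lookup_elem(map, key);
 *	if (R0 != NULL)
 *		R0 += offsetof(struct htab_elem, key) + round_up(map->key_size, 8);
 *
 * so on a hit R0 points directly at the element's value area.
 */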
705 
706 static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
707 							void *key, const bool mark)
708 {
709 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
710 
711 	if (l) {
712 		if (mark)
713 			bpf_lru_node_set_ref(&l->lru_node);
714 		return htab_elem_value(l, map->key_size);
715 	}
716 
717 	return NULL;
718 }
719 
720 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
721 {
722 	return __htab_lru_map_lookup_elem(map, key, true);
723 }
724 
725 static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
726 {
727 	return __htab_lru_map_lookup_elem(map, key, false);
728 }
729 
730 static int htab_lru_map_gen_lookup(struct bpf_map *map,
731 				   struct bpf_insn *insn_buf)
732 {
733 	struct bpf_insn *insn = insn_buf;
734 	const int ret = BPF_REG_0;
735 	const int ref_reg = BPF_REG_1;
736 
737 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
738 		     (void *(*)(struct bpf_map *map, void *key))NULL));
739 	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
740 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
741 	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
742 			      offsetof(struct htab_elem, lru_node) +
743 			      offsetof(struct bpf_lru_node, ref));
744 	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
745 	*insn++ = BPF_ST_MEM(BPF_B, ret,
746 			     offsetof(struct htab_elem, lru_node) +
747 			     offsetof(struct bpf_lru_node, ref),
748 			     1);
749 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
750 				offsetof(struct htab_elem, key) +
751 				round_up(map->key_size, 8));
752 	return insn - insn_buf;
753 }
754 
755 static void check_and_free_fields(struct bpf_htab *htab,
756 				  struct htab_elem *elem)
757 {
758 	if (IS_ERR_OR_NULL(htab->map.record))
759 		return;
760 
761 	if (htab_is_percpu(htab)) {
762 		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
763 		int cpu;
764 
765 		for_each_possible_cpu(cpu)
766 			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
767 	} else {
768 		void *map_value = htab_elem_value(elem, htab->map.key_size);
769 
770 		bpf_obj_free_fields(htab->map.record, map_value);
771 	}
772 }
773 
774 /* It is called from the bpf_lru_list when the LRU needs to delete
775  * older elements from the htab.
776  */
777 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
778 {
779 	struct bpf_htab *htab = arg;
780 	struct htab_elem *l = NULL, *tgt_l;
781 	struct hlist_nulls_head *head;
782 	struct hlist_nulls_node *n;
783 	unsigned long flags;
784 	struct bucket *b;
785 	int ret;
786 
787 	tgt_l = container_of(node, struct htab_elem, lru_node);
788 	b = __select_bucket(htab, tgt_l->hash);
789 	head = &b->head;
790 
791 	ret = htab_lock_bucket(b, &flags);
792 	if (ret)
793 		return false;
794 
795 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
796 		if (l == tgt_l) {
797 			hlist_nulls_del_rcu(&l->hash_node);
798 			bpf_map_dec_elem_count(&htab->map);
799 			break;
800 		}
801 
802 	htab_unlock_bucket(b, flags);
803 
804 	if (l == tgt_l)
805 		check_and_free_fields(htab, l);
806 	return l == tgt_l;
807 }
808 
809 /* Called from syscall */
810 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
811 {
812 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
813 	struct hlist_nulls_head *head;
814 	struct htab_elem *l, *next_l;
815 	u32 hash, key_size;
816 	int i = 0;
817 
818 	WARN_ON_ONCE(!rcu_read_lock_held());
819 
820 	key_size = map->key_size;
821 
822 	if (!key)
823 		goto find_first_elem;
824 
825 	hash = htab_map_hash(key, key_size, htab->hashrnd);
826 
827 	head = select_bucket(htab, hash);
828 
829 	/* lookup the key */
830 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
831 
832 	if (!l)
833 		goto find_first_elem;
834 
835 	/* key was found, get next key in the same bucket */
836 	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
837 				  struct htab_elem, hash_node);
838 
839 	if (next_l) {
840 		/* if next elem in this hash list is non-zero, just return it */
841 		memcpy(next_key, next_l->key, key_size);
842 		return 0;
843 	}
844 
845 	/* no more elements in this hash list, go to the next bucket */
846 	i = hash & (htab->n_buckets - 1);
847 	i++;
848 
849 find_first_elem:
850 	/* iterate over buckets */
851 	for (; i < htab->n_buckets; i++) {
852 		head = select_bucket(htab, i);
853 
854 		/* pick first element in the bucket */
855 		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
856 					  struct htab_elem, hash_node);
857 		if (next_l) {
858 			/* if it's not empty, just return it */
859 			memcpy(next_key, next_l->key, key_size);
860 			return 0;
861 		}
862 	}
863 
864 	/* iterated over all buckets and all elements */
865 	return -ENOENT;
866 }
867 
868 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
869 {
870 	check_and_free_fields(htab, l);
871 
872 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
873 		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
874 	bpf_mem_cache_free(&htab->ma, l);
875 }
876 
877 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
878 {
879 	struct bpf_map *map = &htab->map;
880 	void *ptr;
881 
882 	if (map->ops->map_fd_put_ptr) {
883 		ptr = fd_htab_map_get_ptr(map, l);
884 		map->ops->map_fd_put_ptr(map, ptr, true);
885 	}
886 }
887 
888 static bool is_map_full(struct bpf_htab *htab)
889 {
890 	if (htab->use_percpu_counter)
891 		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
892 						PERCPU_COUNTER_BATCH) >= 0;
893 	return atomic_read(&htab->count) >= htab->map.max_entries;
894 }
895 
896 static void inc_elem_count(struct bpf_htab *htab)
897 {
898 	bpf_map_inc_elem_count(&htab->map);
899 
900 	if (htab->use_percpu_counter)
901 		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
902 	else
903 		atomic_inc(&htab->count);
904 }
905 
906 static void dec_elem_count(struct bpf_htab *htab)
907 {
908 	bpf_map_dec_elem_count(&htab->map);
909 
910 	if (htab->use_percpu_counter)
911 		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
912 	else
913 		atomic_dec(&htab->count);
914 }
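
/*
 * is_map_full(), inc_elem_count() and dec_elem_count() implement the
 * percpu_counter vs. atomic_t choice described above PERCPU_COUNTER_BATCH in
 * htab_map_alloc(): large maps use an approximate per-CPU counter with a
 * batch of 32, small maps fall back to a plain atomic counter.
 */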
915 
916 
917 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
918 {
919 	htab_put_fd_value(htab, l);
920 
921 	if (htab_is_prealloc(htab)) {
922 		bpf_map_dec_elem_count(&htab->map);
923 		check_and_free_fields(htab, l);
924 		pcpu_freelist_push(&htab->freelist, &l->fnode);
925 	} else {
926 		dec_elem_count(htab);
927 		htab_elem_free(htab, l);
928 	}
929 }
930 
931 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
932 			    void *value, bool onallcpus, u64 map_flags)
933 {
934 	void *ptr;
935 
936 	if (!onallcpus) {
937 		/* copy true value_size bytes */
938 		ptr = this_cpu_ptr(pptr);
939 		copy_map_value(&htab->map, ptr, value);
940 		bpf_obj_free_fields(htab->map.record, ptr);
941 	} else {
942 		u32 size = round_up(htab->map.value_size, 8);
943 		void *val;
944 		int cpu;
945 
946 		if (map_flags & BPF_F_CPU) {
947 			cpu = map_flags >> 32;
948 			ptr = per_cpu_ptr(pptr, cpu);
949 			copy_map_value(&htab->map, ptr, value);
950 			bpf_obj_free_fields(htab->map.record, ptr);
951 			return;
952 		}
953 
954 		for_each_possible_cpu(cpu) {
955 			ptr = per_cpu_ptr(pptr, cpu);
956 			val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
957 			copy_map_value(&htab->map, ptr, val);
958 			bpf_obj_free_fields(htab->map.record, ptr);
959 		}
960 	}
961 }
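
/*
 * When BPF_F_CPU is set, the target CPU is carried in the upper 32 bits of
 * map_flags (cpu = map_flags >> 32) and only that CPU's copy is written;
 * with BPF_F_ALL_CPUS every possible CPU receives the same single value
 * supplied by user space.
 */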
962 
963 static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
964 			    void *value, bool onallcpus, u64 map_flags)
965 {
966 	/* When not setting the initial value on all cpus, zero-fill element
967 	 * values for the other cpus. Otherwise, the bpf program has no way to
968 	 * ensure known initial values for cpus other than the current one
969 	 * (onallcpus is always false when coming from a bpf prog).
970 	 */
971 	if (!onallcpus) {
972 		int current_cpu = raw_smp_processor_id();
973 		int cpu;
974 
975 		for_each_possible_cpu(cpu) {
976 			if (cpu == current_cpu)
977 				copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
978 			else /* Since elem is preallocated, we cannot touch special fields */
979 				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
980 		}
981 	} else {
982 		pcpu_copy_value(htab, pptr, value, onallcpus, map_flags);
983 	}
984 }
985 
986 static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
987 {
988 	return is_fd_htab(htab) && BITS_PER_LONG == 64;
989 }
990 
991 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
992 					 void *value, u32 key_size, u32 hash,
993 					 bool percpu, bool onallcpus,
994 					 struct htab_elem *old_elem, u64 map_flags)
995 {
996 	u32 size = htab->map.value_size;
997 	bool prealloc = htab_is_prealloc(htab);
998 	struct htab_elem *l_new, **pl_new;
999 	void __percpu *pptr;
1000 
1001 	if (prealloc) {
1002 		if (old_elem) {
1003 			/* if we're updating the existing element,
1004 			 * use per-cpu extra elems to avoid freelist_pop/push
1005 			 */
1006 			pl_new = this_cpu_ptr(htab->extra_elems);
1007 			l_new = *pl_new;
1008 			*pl_new = old_elem;
1009 		} else {
1010 			struct pcpu_freelist_node *l;
1011 
1012 			l = __pcpu_freelist_pop(&htab->freelist);
1013 			if (!l)
1014 				return ERR_PTR(-E2BIG);
1015 			l_new = container_of(l, struct htab_elem, fnode);
1016 			bpf_map_inc_elem_count(&htab->map);
1017 		}
1018 	} else {
1019 		if (is_map_full(htab))
1020 			if (!old_elem)
1021 				/* when map is full and update() is replacing
1022 				 * old element, it's ok to allocate, since
1023 				 * old element will be freed immediately.
1024 				 * Otherwise return an error
1025 				 */
1026 				return ERR_PTR(-E2BIG);
1027 		inc_elem_count(htab);
1028 		l_new = bpf_mem_cache_alloc(&htab->ma);
1029 		if (!l_new) {
1030 			l_new = ERR_PTR(-ENOMEM);
1031 			goto dec_count;
1032 		}
1033 	}
1034 
1035 	memcpy(l_new->key, key, key_size);
1036 	if (percpu) {
1037 		if (prealloc) {
1038 			pptr = htab_elem_get_ptr(l_new, key_size);
1039 		} else {
1040 			/* alloc_percpu zero-fills */
1041 			void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
1042 
1043 			if (!ptr) {
1044 				bpf_mem_cache_free(&htab->ma, l_new);
1045 				l_new = ERR_PTR(-ENOMEM);
1046 				goto dec_count;
1047 			}
1048 			l_new->ptr_to_pptr = ptr;
1049 			pptr = *(void __percpu **)ptr;
1050 		}
1051 
1052 		pcpu_init_value(htab, pptr, value, onallcpus, map_flags);
1053 
1054 		if (!prealloc)
1055 			htab_elem_set_ptr(l_new, key_size, pptr);
1056 	} else if (fd_htab_map_needs_adjust(htab)) {
1057 		size = round_up(size, 8);
1058 		memcpy(htab_elem_value(l_new, key_size), value, size);
1059 	} else {
1060 		copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
1061 	}
1062 
1063 	l_new->hash = hash;
1064 	return l_new;
1065 dec_count:
1066 	dec_elem_count(htab);
1067 	return l_new;
1068 }
1069 
1070 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
1071 		       u64 map_flags)
1072 {
1073 	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
1074 		/* elem already exists */
1075 		return -EEXIST;
1076 
1077 	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
1078 		/* elem doesn't exist, cannot update it */
1079 		return -ENOENT;
1080 
1081 	return 0;
1082 }
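
/*
 * check_flags() implements the user-visible update semantics: BPF_ANY places
 * no constraint, BPF_NOEXIST fails with -EEXIST if the key already exists,
 * and BPF_EXIST fails with -ENOENT if it does not. A user-space sketch using
 * libbpf (map_fd, key and val are made-up names, for illustration only):
 *
 *	__u32 key = 1, val = 42;
 *	int err = bpf_map_update_elem(map_fd, &key, &val, BPF_NOEXIST);
 *
 *	if (err && errno == EEXIST)
 *		err = bpf_map_update_elem(map_fd, &key, &val, BPF_EXIST);
 */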
1083 
1084 /* Called from syscall or from eBPF program */
1085 static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
1086 				 u64 map_flags)
1087 {
1088 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1089 	struct htab_elem *l_new, *l_old;
1090 	struct hlist_nulls_head *head;
1091 	unsigned long flags;
1092 	struct bucket *b;
1093 	u32 key_size, hash;
1094 	int ret;
1095 
1096 	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
1097 		/* unknown flags */
1098 		return -EINVAL;
1099 
1100 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1101 
1102 	key_size = map->key_size;
1103 
1104 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1105 
1106 	b = __select_bucket(htab, hash);
1107 	head = &b->head;
1108 
1109 	if (unlikely(map_flags & BPF_F_LOCK)) {
1110 		if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1111 			return -EINVAL;
1112 		/* find an element without taking the bucket lock */
1113 		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
1114 					      htab->n_buckets);
1115 		ret = check_flags(htab, l_old, map_flags);
1116 		if (ret)
1117 			return ret;
1118 		if (l_old) {
1119 			/* grab the element lock and update value in place */
1120 			copy_map_value_locked(map,
1121 					      htab_elem_value(l_old, key_size),
1122 					      value, false);
1123 			return 0;
1124 		}
1125 		/* fall through, grab the bucket lock and lookup again.
1126 		 * 99.9% chance that the element won't be found,
1127 		 * but second lookup under lock has to be done.
1128 		 */
1129 	}
1130 
1131 	ret = htab_lock_bucket(b, &flags);
1132 	if (ret)
1133 		return ret;
1134 
1135 	l_old = lookup_elem_raw(head, hash, key, key_size);
1136 
1137 	ret = check_flags(htab, l_old, map_flags);
1138 	if (ret)
1139 		goto err;
1140 
1141 	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
1142 		/* first lookup without the bucket lock didn't find the element,
1143 		 * but second lookup with the bucket lock found it.
1144 		 * This case is highly unlikely, but has to be dealt with:
1145 		 * grab the element lock in addition to the bucket lock
1146 		 * and update element in place
1147 		 */
1148 		copy_map_value_locked(map,
1149 				      htab_elem_value(l_old, key_size),
1150 				      value, false);
1151 		ret = 0;
1152 		goto err;
1153 	}
1154 
1155 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
1156 				l_old, map_flags);
1157 	if (IS_ERR(l_new)) {
1158 		/* all pre-allocated elements are in use or memory exhausted */
1159 		ret = PTR_ERR(l_new);
1160 		goto err;
1161 	}
1162 
1163 	/* add new element to the head of the list, so that
1164 	 * concurrent search will find it before old elem
1165 	 */
1166 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1167 	if (l_old) {
1168 		hlist_nulls_del_rcu(&l_old->hash_node);
1169 
1170 		/* l_old has already been stashed in htab->extra_elems, free
1171 		 * its special fields before it is available for reuse.
1172 		 */
1173 		if (htab_is_prealloc(htab))
1174 			check_and_free_fields(htab, l_old);
1175 	}
1176 	htab_unlock_bucket(b, flags);
1177 	if (l_old && !htab_is_prealloc(htab))
1178 		free_htab_elem(htab, l_old);
1179 	return 0;
1180 err:
1181 	htab_unlock_bucket(b, flags);
1182 	return ret;
1183 }
1184 
1185 static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
1186 {
1187 	check_and_free_fields(htab, elem);
1188 	bpf_map_dec_elem_count(&htab->map);
1189 	bpf_lru_push_free(&htab->lru, &elem->lru_node);
1190 }
1191 
1192 static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
1193 				     u64 map_flags)
1194 {
1195 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1196 	struct htab_elem *l_new, *l_old = NULL;
1197 	struct hlist_nulls_head *head;
1198 	unsigned long flags;
1199 	struct bucket *b;
1200 	u32 key_size, hash;
1201 	int ret;
1202 
1203 	if (unlikely(map_flags > BPF_EXIST))
1204 		/* unknown flags */
1205 		return -EINVAL;
1206 
1207 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1208 
1209 	key_size = map->key_size;
1210 
1211 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1212 
1213 	b = __select_bucket(htab, hash);
1214 	head = &b->head;
1215 
1216 	/* For LRU, we need to alloc before taking bucket's
1217 	 * spinlock because getting free nodes from LRU may need
1218 	 * to remove older elements from htab and this removal
1219 	 * operation will need a bucket lock.
1220 	 */
1221 	l_new = prealloc_lru_pop(htab, key, hash);
1222 	if (!l_new)
1223 		return -ENOMEM;
1224 	copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
1225 
1226 	ret = htab_lock_bucket(b, &flags);
1227 	if (ret)
1228 		goto err_lock_bucket;
1229 
1230 	l_old = lookup_elem_raw(head, hash, key, key_size);
1231 
1232 	ret = check_flags(htab, l_old, map_flags);
1233 	if (ret)
1234 		goto err;
1235 
1236 	/* add new element to the head of the list, so that
1237 	 * concurrent search will find it before old elem
1238 	 */
1239 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1240 	if (l_old) {
1241 		bpf_lru_node_set_ref(&l_new->lru_node);
1242 		hlist_nulls_del_rcu(&l_old->hash_node);
1243 	}
1244 	ret = 0;
1245 
1246 err:
1247 	htab_unlock_bucket(b, flags);
1248 
1249 err_lock_bucket:
1250 	if (ret)
1251 		htab_lru_push_free(htab, l_new);
1252 	else if (l_old)
1253 		htab_lru_push_free(htab, l_old);
1254 
1255 	return ret;
1256 }
1257 
1258 static int htab_map_check_update_flags(bool onallcpus, u64 map_flags)
1259 {
1260 	if (unlikely(!onallcpus && map_flags > BPF_EXIST))
1261 		return -EINVAL;
1262 	if (unlikely(onallcpus && ((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS)))
1263 		return -EINVAL;
1264 	return 0;
1265 }
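
/*
 * onallcpus is true only for syscall-side updates of per-CPU maps; only that
 * path may use BPF_F_CPU (target CPU in the upper 32 bits of the flags) or
 * BPF_F_ALL_CPUS. All other callers are limited to BPF_ANY, BPF_NOEXIST and
 * BPF_EXIST.
 */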
1266 
1267 static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
1268 					  void *value, u64 map_flags,
1269 					  bool percpu, bool onallcpus)
1270 {
1271 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1272 	struct htab_elem *l_new, *l_old;
1273 	struct hlist_nulls_head *head;
1274 	void *old_map_ptr = NULL;
1275 	unsigned long flags;
1276 	struct bucket *b;
1277 	u32 key_size, hash;
1278 	int ret;
1279 
1280 	ret = htab_map_check_update_flags(onallcpus, map_flags);
1281 	if (unlikely(ret))
1282 		return ret;
1283 
1284 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1285 
1286 	key_size = map->key_size;
1287 
1288 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1289 
1290 	b = __select_bucket(htab, hash);
1291 	head = &b->head;
1292 
1293 	ret = htab_lock_bucket(b, &flags);
1294 	if (ret)
1295 		return ret;
1296 
1297 	l_old = lookup_elem_raw(head, hash, key, key_size);
1298 
1299 	ret = check_flags(htab, l_old, map_flags);
1300 	if (ret)
1301 		goto err;
1302 
1303 	if (l_old) {
1304 		/* Update value in-place */
1305 		if (percpu) {
1306 			pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1307 					value, onallcpus, map_flags);
1308 		} else {
1309 			void **inner_map_pptr = htab_elem_value(l_old, key_size);
1310 
1311 			old_map_ptr = *inner_map_pptr;
1312 			WRITE_ONCE(*inner_map_pptr, *(void **)value);
1313 		}
1314 	} else {
1315 		l_new = alloc_htab_elem(htab, key, value, key_size,
1316 					hash, percpu, onallcpus, NULL, map_flags);
1317 		if (IS_ERR(l_new)) {
1318 			ret = PTR_ERR(l_new);
1319 			goto err;
1320 		}
1321 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1322 	}
1323 err:
1324 	htab_unlock_bucket(b, flags);
1325 	if (old_map_ptr)
1326 		map->ops->map_fd_put_ptr(map, old_map_ptr, true);
1327 	return ret;
1328 }
1329 
1330 static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1331 					      void *value, u64 map_flags,
1332 					      bool onallcpus)
1333 {
1334 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1335 	struct htab_elem *l_new = NULL, *l_old;
1336 	struct hlist_nulls_head *head;
1337 	unsigned long flags;
1338 	struct bucket *b;
1339 	u32 key_size, hash;
1340 	int ret;
1341 
1342 	ret = htab_map_check_update_flags(onallcpus, map_flags);
1343 	if (unlikely(ret))
1344 		return ret;
1345 
1346 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1347 
1348 	key_size = map->key_size;
1349 
1350 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1351 
1352 	b = __select_bucket(htab, hash);
1353 	head = &b->head;
1354 
1355 	/* For LRU, we need to alloc before taking bucket's
1356 	 * spinlock because LRU's elem alloc may need
1357 	 * to remove older elem from htab and this removal
1358 	 * operation will need a bucket lock.
1359 	 */
1360 	if (map_flags != BPF_EXIST) {
1361 		l_new = prealloc_lru_pop(htab, key, hash);
1362 		if (!l_new)
1363 			return -ENOMEM;
1364 	}
1365 
1366 	ret = htab_lock_bucket(b, &flags);
1367 	if (ret)
1368 		goto err_lock_bucket;
1369 
1370 	l_old = lookup_elem_raw(head, hash, key, key_size);
1371 
1372 	ret = check_flags(htab, l_old, map_flags);
1373 	if (ret)
1374 		goto err;
1375 
1376 	if (l_old) {
1377 		bpf_lru_node_set_ref(&l_old->lru_node);
1378 
1379 		/* per-cpu hash map can update value in-place */
1380 		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1381 				value, onallcpus, map_flags);
1382 	} else {
1383 		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
1384 				value, onallcpus, map_flags);
1385 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1386 		l_new = NULL;
1387 	}
1388 	ret = 0;
1389 err:
1390 	htab_unlock_bucket(b, flags);
1391 err_lock_bucket:
1392 	if (l_new) {
1393 		bpf_map_dec_elem_count(&htab->map);
1394 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1395 	}
1396 	return ret;
1397 }
1398 
1399 static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1400 					void *value, u64 map_flags)
1401 {
1402 	return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
1403 }
1404 
1405 static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1406 					    void *value, u64 map_flags)
1407 {
1408 	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1409 						 false);
1410 }
1411 
1412 /* Called from syscall or from eBPF program */
1413 static long htab_map_delete_elem(struct bpf_map *map, void *key)
1414 {
1415 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1416 	struct hlist_nulls_head *head;
1417 	struct bucket *b;
1418 	struct htab_elem *l;
1419 	unsigned long flags;
1420 	u32 hash, key_size;
1421 	int ret;
1422 
1423 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1424 
1425 	key_size = map->key_size;
1426 
1427 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1428 	b = __select_bucket(htab, hash);
1429 	head = &b->head;
1430 
1431 	ret = htab_lock_bucket(b, &flags);
1432 	if (ret)
1433 		return ret;
1434 
1435 	l = lookup_elem_raw(head, hash, key, key_size);
1436 	if (l)
1437 		hlist_nulls_del_rcu(&l->hash_node);
1438 	else
1439 		ret = -ENOENT;
1440 
1441 	htab_unlock_bucket(b, flags);
1442 
1443 	if (l)
1444 		free_htab_elem(htab, l);
1445 	return ret;
1446 }
1447 
1448 static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1449 {
1450 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1451 	struct hlist_nulls_head *head;
1452 	struct bucket *b;
1453 	struct htab_elem *l;
1454 	unsigned long flags;
1455 	u32 hash, key_size;
1456 	int ret;
1457 
1458 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1459 
1460 	key_size = map->key_size;
1461 
1462 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1463 	b = __select_bucket(htab, hash);
1464 	head = &b->head;
1465 
1466 	ret = htab_lock_bucket(b, &flags);
1467 	if (ret)
1468 		return ret;
1469 
1470 	l = lookup_elem_raw(head, hash, key, key_size);
1471 
1472 	if (l)
1473 		hlist_nulls_del_rcu(&l->hash_node);
1474 	else
1475 		ret = -ENOENT;
1476 
1477 	htab_unlock_bucket(b, flags);
1478 	if (l)
1479 		htab_lru_push_free(htab, l);
1480 	return ret;
1481 }
1482 
1483 static void delete_all_elements(struct bpf_htab *htab)
1484 {
1485 	int i;
1486 
1487 	/* It's called from a worker thread and migration has been disabled,
1488 	 * therefore, it is OK to invoke bpf_mem_cache_free() directly.
1489 	 */
1490 	for (i = 0; i < htab->n_buckets; i++) {
1491 		struct hlist_nulls_head *head = select_bucket(htab, i);
1492 		struct hlist_nulls_node *n;
1493 		struct htab_elem *l;
1494 
1495 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1496 			hlist_nulls_del_rcu(&l->hash_node);
1497 			htab_elem_free(htab, l);
1498 		}
1499 		cond_resched();
1500 	}
1501 }
1502 
1503 static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
1504 {
1505 	int i;
1506 
1507 	rcu_read_lock();
1508 	for (i = 0; i < htab->n_buckets; i++) {
1509 		struct hlist_nulls_head *head = select_bucket(htab, i);
1510 		struct hlist_nulls_node *n;
1511 		struct htab_elem *l;
1512 
1513 		hlist_nulls_for_each_entry(l, n, head, hash_node) {
1514 			/* We only free internal structs on uref dropping to zero */
1515 			bpf_map_free_internal_structs(&htab->map,
1516 						      htab_elem_value(l, htab->map.key_size));
1517 		}
1518 		cond_resched_rcu();
1519 	}
1520 	rcu_read_unlock();
1521 }
1522 
1523 static void htab_map_free_internal_structs(struct bpf_map *map)
1524 {
1525 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1526 
1527 	/* We only free internal structs on uref dropping to zero */
1528 	if (!bpf_map_has_internal_structs(map))
1529 		return;
1530 
1531 	if (htab_is_prealloc(htab))
1532 		htab_free_prealloced_internal_structs(htab);
1533 	else
1534 		htab_free_malloced_internal_structs(htab);
1535 }
1536 
1537 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1538 static void htab_map_free(struct bpf_map *map)
1539 {
1540 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1541 
1542 	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
1543 	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
1544 	 * There is no need to synchronize_rcu() here to protect map elements.
1545 	 */
1546 
1547 	/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
1548 	 * underneath and is responsible for waiting for callbacks to finish
1549 	 * during bpf_mem_alloc_destroy().
1550 	 */
1551 	if (!htab_is_prealloc(htab)) {
1552 		delete_all_elements(htab);
1553 	} else {
1554 		htab_free_prealloced_fields(htab);
1555 		prealloc_destroy(htab);
1556 	}
1557 
1558 	bpf_map_free_elem_count(map);
1559 	free_percpu(htab->extra_elems);
1560 	bpf_map_area_free(htab->buckets);
1561 	bpf_mem_alloc_destroy(&htab->pcpu_ma);
1562 	bpf_mem_alloc_destroy(&htab->ma);
1563 	if (htab->use_percpu_counter)
1564 		percpu_counter_destroy(&htab->pcount);
1565 	bpf_map_area_free(htab);
1566 }
1567 
1568 static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
1569 				   struct seq_file *m)
1570 {
1571 	void *value;
1572 
1573 	rcu_read_lock();
1574 
1575 	value = htab_map_lookup_elem(map, key);
1576 	if (!value) {
1577 		rcu_read_unlock();
1578 		return;
1579 	}
1580 
1581 	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1582 	seq_puts(m, ": ");
1583 	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
1584 	seq_putc(m, '\n');
1585 
1586 	rcu_read_unlock();
1587 }
1588 
1589 static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1590 					     void *value, bool is_lru_map,
1591 					     bool is_percpu, u64 flags)
1592 {
1593 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1594 	struct hlist_nulls_head *head;
1595 	unsigned long bflags;
1596 	struct htab_elem *l;
1597 	u32 hash, key_size;
1598 	struct bucket *b;
1599 	int ret;
1600 
1601 	key_size = map->key_size;
1602 
1603 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1604 	b = __select_bucket(htab, hash);
1605 	head = &b->head;
1606 
1607 	ret = htab_lock_bucket(b, &bflags);
1608 	if (ret)
1609 		return ret;
1610 
1611 	l = lookup_elem_raw(head, hash, key, key_size);
1612 	if (!l) {
1613 		ret = -ENOENT;
1614 		goto out_unlock;
1615 	}
1616 
1617 	if (is_percpu) {
1618 		u32 roundup_value_size = round_up(map->value_size, 8);
1619 		void __percpu *pptr;
1620 		int off = 0, cpu;
1621 
1622 		pptr = htab_elem_get_ptr(l, key_size);
1623 		for_each_possible_cpu(cpu) {
1624 			copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
1625 			check_and_init_map_value(&htab->map, value + off);
1626 			off += roundup_value_size;
1627 		}
1628 	} else {
1629 		void *src = htab_elem_value(l, map->key_size);
1630 
1631 		if (flags & BPF_F_LOCK)
1632 			copy_map_value_locked(map, value, src, true);
1633 		else
1634 			copy_map_value(map, value, src);
1635 		/* Zeroing special fields in the temp buffer */
1636 		check_and_init_map_value(map, value);
1637 	}
1638 	hlist_nulls_del_rcu(&l->hash_node);
1639 
1640 out_unlock:
1641 	htab_unlock_bucket(b, bflags);
1642 
1643 	if (l) {
1644 		if (is_lru_map)
1645 			htab_lru_push_free(htab, l);
1646 		else
1647 			free_htab_elem(htab, l);
1648 	}
1649 
1650 	return ret;
1651 }
1652 
1653 static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1654 					   void *value, u64 flags)
1655 {
1656 	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
1657 						 flags);
1658 }
1659 
1660 static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1661 						  void *key, void *value,
1662 						  u64 flags)
1663 {
1664 	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
1665 						 flags);
1666 }
1667 
1668 static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1669 					       void *value, u64 flags)
1670 {
1671 	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
1672 						 flags);
1673 }
1674 
1675 static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1676 						      void *key, void *value,
1677 						      u64 flags)
1678 {
1679 	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
1680 						 flags);
1681 }
1682 
1683 static int
1684 __htab_map_lookup_and_delete_batch(struct bpf_map *map,
1685 				   const union bpf_attr *attr,
1686 				   union bpf_attr __user *uattr,
1687 				   bool do_delete, bool is_lru_map,
1688 				   bool is_percpu)
1689 {
1690 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1691 	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
1692 	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
1693 	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
1694 	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1695 	u32 batch, max_count, size, bucket_size, map_id;
1696 	u64 elem_map_flags, map_flags, allowed_flags;
1697 	u32 bucket_cnt, total, key_size, value_size;
1698 	struct htab_elem *node_to_free = NULL;
1699 	struct hlist_nulls_head *head;
1700 	struct hlist_nulls_node *n;
1701 	unsigned long flags = 0;
1702 	bool locked = false;
1703 	struct htab_elem *l;
1704 	struct bucket *b;
1705 	int ret = 0;
1706 
1707 	elem_map_flags = attr->batch.elem_flags;
1708 	allowed_flags = BPF_F_LOCK;
1709 	if (!do_delete && is_percpu)
1710 		allowed_flags |= BPF_F_CPU;
1711 	ret = bpf_map_check_op_flags(map, elem_map_flags, allowed_flags);
1712 	if (ret)
1713 		return ret;
1714 
1715 	map_flags = attr->batch.flags;
1716 	if (map_flags)
1717 		return -EINVAL;
1718 
1719 	max_count = attr->batch.count;
1720 	if (!max_count)
1721 		return 0;
1722 
1723 	if (put_user(0, &uattr->batch.count))
1724 		return -EFAULT;
1725 
1726 	batch = 0;
1727 	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1728 		return -EFAULT;
1729 
1730 	if (batch >= htab->n_buckets)
1731 		return -ENOENT;
1732 
1733 	key_size = htab->map.key_size;
1734 	value_size = htab->map.value_size;
1735 	size = round_up(value_size, 8);
1736 	if (is_percpu && !(elem_map_flags & BPF_F_CPU))
1737 		value_size = size * num_possible_cpus();
1738 	total = 0;
1739 	/* while experimenting with hash tables with sizes ranging from 10 to
1740 	 * 1000, it was observed that a bucket can have up to 5 entries.
1741 	 */
1742 	bucket_size = 5;
1743 
1744 alloc:
1745 	/* We cannot do copy_from_user or copy_to_user inside
1746 	 * the rcu_read_lock. Allocate enough space here.
1747 	 */
1748 	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
1749 	values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1750 	if (!keys || !values) {
1751 		ret = -ENOMEM;
1752 		goto after_loop;
1753 	}
1754 
1755 again:
1756 	bpf_disable_instrumentation();
1757 	rcu_read_lock();
1758 again_nocopy:
1759 	dst_key = keys;
1760 	dst_val = values;
1761 	b = &htab->buckets[batch];
1762 	head = &b->head;
1763 	/* do not grab the lock unless we need it (bucket_cnt > 0). */
1764 	if (locked) {
1765 		ret = htab_lock_bucket(b, &flags);
1766 		if (ret) {
1767 			rcu_read_unlock();
1768 			bpf_enable_instrumentation();
1769 			goto after_loop;
1770 		}
1771 	}
1772 
1773 	bucket_cnt = 0;
1774 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
1775 		bucket_cnt++;
1776 
1777 	if (bucket_cnt && !locked) {
1778 		locked = true;
1779 		goto again_nocopy;
1780 	}
1781 
1782 	if (bucket_cnt > (max_count - total)) {
1783 		if (total == 0)
1784 			ret = -ENOSPC;
1785 		/* Note that since bucket_cnt > 0 here, it is implicit
1786 		 * that the lock was grabbed, so release it.
1787 		 */
1788 		htab_unlock_bucket(b, flags);
1789 		rcu_read_unlock();
1790 		bpf_enable_instrumentation();
1791 		goto after_loop;
1792 	}
1793 
1794 	if (bucket_cnt > bucket_size) {
1795 		bucket_size = bucket_cnt;
1796 		/* Note that since bucket_cnt > 0 here, it is implicit
1797 		 * that the lock was grabbed, so release it.
1798 		 */
1799 		htab_unlock_bucket(b, flags);
1800 		rcu_read_unlock();
1801 		bpf_enable_instrumentation();
1802 		kvfree(keys);
1803 		kvfree(values);
1804 		goto alloc;
1805 	}
1806 
1807 	/* Next block is only safe to run if you have grabbed the lock */
1808 	if (!locked)
1809 		goto next_batch;
1810 
1811 	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1812 		memcpy(dst_key, l->key, key_size);
1813 
1814 		if (is_percpu) {
1815 			int off = 0, cpu;
1816 			void __percpu *pptr;
1817 
1818 			pptr = htab_elem_get_ptr(l, map->key_size);
1819 			if (elem_map_flags & BPF_F_CPU) {
1820 				cpu = elem_map_flags >> 32;
1821 				copy_map_value(&htab->map, dst_val, per_cpu_ptr(pptr, cpu));
1822 				check_and_init_map_value(&htab->map, dst_val);
1823 			} else {
1824 				for_each_possible_cpu(cpu) {
1825 					copy_map_value_long(&htab->map, dst_val + off,
1826 							    per_cpu_ptr(pptr, cpu));
1827 					check_and_init_map_value(&htab->map, dst_val + off);
1828 					off += size;
1829 				}
1830 			}
1831 		} else {
1832 			value = htab_elem_value(l, key_size);
1833 			if (is_fd_htab(htab)) {
1834 				struct bpf_map **inner_map = value;
1835 
1836 				 /* Actual value is the id of the inner map */
1837 				map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
1838 				value = &map_id;
1839 			}
1840 
1841 			if (elem_map_flags & BPF_F_LOCK)
1842 				copy_map_value_locked(map, dst_val, value,
1843 						      true);
1844 			else
1845 				copy_map_value(map, dst_val, value);
1846 			/* Zeroing special fields in the temp buffer */
1847 			check_and_init_map_value(map, dst_val);
1848 		}
1849 		if (do_delete) {
1850 			hlist_nulls_del_rcu(&l->hash_node);
1851 
1852 			/* bpf_lru_push_free() will acquire lru_lock, which
1853 			 * may cause deadlock. See comments in function
1854 			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
1855 			 * after releasing the bucket lock.
1856 			 *
1857 			 * For htab of maps, htab_put_fd_value() in
1858 			 * free_htab_elem() may acquire a spinlock with bucket
1859 			 * lock being held and it violates the lock rule, so
1860 			 * invoke free_htab_elem() after unlock as well.
1861 			 */
1862 			l->batch_flink = node_to_free;
1863 			node_to_free = l;
1864 		}
1865 		dst_key += key_size;
1866 		dst_val += value_size;
1867 	}
1868 
1869 	htab_unlock_bucket(b, flags);
1870 	locked = false;
1871 
1872 	while (node_to_free) {
1873 		l = node_to_free;
1874 		node_to_free = node_to_free->batch_flink;
1875 		if (is_lru_map)
1876 			htab_lru_push_free(htab, l);
1877 		else
1878 			free_htab_elem(htab, l);
1879 	}
1880 
1881 next_batch:
1882 	/* If we are not copying data, we can go to the next bucket and
1883 	 * avoid leaving the RCU read-side critical section.
1884 	 */
1885 	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
1886 		batch++;
1887 		goto again_nocopy;
1888 	}
1889 
1890 	rcu_read_unlock();
1891 	bpf_enable_instrumentation();
1892 	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
1893 	    key_size * bucket_cnt) ||
1894 	    copy_to_user(uvalues + total * value_size, values,
1895 	    value_size * bucket_cnt))) {
1896 		ret = -EFAULT;
1897 		goto after_loop;
1898 	}
1899 
1900 	total += bucket_cnt;
1901 	batch++;
1902 	if (batch >= htab->n_buckets) {
1903 		ret = -ENOENT;
1904 		goto after_loop;
1905 	}
1906 	goto again;
1907 
1908 after_loop:
1909 	if (ret == -EFAULT)
1910 		goto out;
1911 
1912 	/* copy the number of copied entries and the next batch position back to userspace */
1913 	ubatch = u64_to_user_ptr(attr->batch.out_batch);
1914 	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1915 	    put_user(total, &uattr->batch.count))
1916 		ret = -EFAULT;
1917 
1918 out:
1919 	kvfree(keys);
1920 	kvfree(values);
1921 	return ret;
1922 }
1923 
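/*
 * Thin wrappers around __htab_map_lookup_and_delete_batch(). The three
 * trailing boolean arguments select do_delete (lookup-and-delete vs.
 * plain lookup), is_lru_map and is_percpu, respectively.
 */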
1924 static int
1925 htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1926 			     union bpf_attr __user *uattr)
1927 {
1928 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1929 						  false, true);
1930 }
1931 
1932 static int
1933 htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1934 					const union bpf_attr *attr,
1935 					union bpf_attr __user *uattr)
1936 {
1937 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1938 						  false, true);
1939 }
1940 
1941 static int
1942 htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1943 		      union bpf_attr __user *uattr)
1944 {
1945 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1946 						  false, false);
1947 }
1948 
1949 static int
1950 htab_map_lookup_and_delete_batch(struct bpf_map *map,
1951 				 const union bpf_attr *attr,
1952 				 union bpf_attr __user *uattr)
1953 {
1954 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1955 						  false, false);
1956 }
1957 
1958 static int
1959 htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
1960 				 const union bpf_attr *attr,
1961 				 union bpf_attr __user *uattr)
1962 {
1963 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1964 						  true, true);
1965 }
1966 
1967 static int
1968 htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1969 					    const union bpf_attr *attr,
1970 					    union bpf_attr __user *uattr)
1971 {
1972 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1973 						  true, true);
1974 }
1975 
1976 static int
1977 htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1978 			  union bpf_attr __user *uattr)
1979 {
1980 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1981 						  true, false);
1982 }
1983 
1984 static int
1985 htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
1986 				     const union bpf_attr *attr,
1987 				     union bpf_attr __user *uattr)
1988 {
1989 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1990 						  true, false);
1991 }
1992 
1993 struct bpf_iter_seq_hash_map_info {
1994 	struct bpf_map *map;
1995 	struct bpf_htab *htab;
1996 	void *percpu_value_buf; /* non-NULL means percpu hash */
1997 	u32 bucket_id;
1998 	u32 skip_elems;
1999 };
2000 
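/*
 * Find the next element for the bpf_iter seq_file walk. Resume after
 * prev_elem within the same bucket when possible; otherwise scan forward
 * through the buckets, skipping the elements of the current bucket that
 * were already emitted (info->skip_elems). Returns with rcu_read_lock()
 * held when an element is found.
 */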
2001 static struct htab_elem *
2002 bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
2003 			   struct htab_elem *prev_elem)
2004 {
2005 	const struct bpf_htab *htab = info->htab;
2006 	u32 skip_elems = info->skip_elems;
2007 	u32 bucket_id = info->bucket_id;
2008 	struct hlist_nulls_head *head;
2009 	struct hlist_nulls_node *n;
2010 	struct htab_elem *elem;
2011 	struct bucket *b;
2012 	u32 i, count;
2013 
2014 	if (bucket_id >= htab->n_buckets)
2015 		return NULL;
2016 
2017 	/* try to find next elem in the same bucket */
2018 	if (prev_elem) {
2019 		/* no update/deletion on this bucket, prev_elem should still be valid
2020 		 * and we won't skip elements.
2021 		 */
2022 		n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
2023 		elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
2024 		if (elem)
2025 			return elem;
2026 
2027 		/* not found, unlock and go to the next bucket */
2028 		b = &htab->buckets[bucket_id++];
2029 		rcu_read_unlock();
2030 		skip_elems = 0;
2031 	}
2032 
2033 	for (i = bucket_id; i < htab->n_buckets; i++) {
2034 		b = &htab->buckets[i];
2035 		rcu_read_lock();
2036 
2037 		count = 0;
2038 		head = &b->head;
2039 		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
2040 			if (count >= skip_elems) {
2041 				info->bucket_id = i;
2042 				info->skip_elems = count;
2043 				return elem;
2044 			}
2045 			count++;
2046 		}
2047 
2048 		rcu_read_unlock();
2049 		skip_elems = 0;
2050 	}
2051 
2052 	info->bucket_id = i;
2053 	info->skip_elems = 0;
2054 	return NULL;
2055 }
2056 
2057 static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
2058 {
2059 	struct bpf_iter_seq_hash_map_info *info = seq->private;
2060 	struct htab_elem *elem;
2061 
2062 	elem = bpf_hash_map_seq_find_next(info, NULL);
2063 	if (!elem)
2064 		return NULL;
2065 
2066 	if (*pos == 0)
2067 		++*pos;
2068 	return elem;
2069 }
2070 
2071 static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2072 {
2073 	struct bpf_iter_seq_hash_map_info *info = seq->private;
2074 
2075 	++*pos;
2076 	++info->skip_elems;
2077 	return bpf_hash_map_seq_find_next(info, v);
2078 }
2079 
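/*
 * Run the iterator program for one element (or for the final NULL
 * element). For per-CPU maps, flatten the per-CPU values into the
 * pre-allocated buffer first so the program sees a single flat value.
 */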
2080 static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
2081 {
2082 	struct bpf_iter_seq_hash_map_info *info = seq->private;
2083 	struct bpf_iter__bpf_map_elem ctx = {};
2084 	struct bpf_map *map = info->map;
2085 	struct bpf_iter_meta meta;
2086 	int ret = 0, off = 0, cpu;
2087 	u32 roundup_value_size;
2088 	struct bpf_prog *prog;
2089 	void __percpu *pptr;
2090 
2091 	meta.seq = seq;
2092 	prog = bpf_iter_get_info(&meta, elem == NULL);
2093 	if (prog) {
2094 		ctx.meta = &meta;
2095 		ctx.map = info->map;
2096 		if (elem) {
2097 			ctx.key = elem->key;
2098 			if (!info->percpu_value_buf) {
2099 				ctx.value = htab_elem_value(elem, map->key_size);
2100 			} else {
2101 				roundup_value_size = round_up(map->value_size, 8);
2102 				pptr = htab_elem_get_ptr(elem, map->key_size);
2103 				for_each_possible_cpu(cpu) {
2104 					copy_map_value_long(map, info->percpu_value_buf + off,
2105 							    per_cpu_ptr(pptr, cpu));
2106 					check_and_init_map_value(map, info->percpu_value_buf + off);
2107 					off += roundup_value_size;
2108 				}
2109 				ctx.value = info->percpu_value_buf;
2110 			}
2111 		}
2112 		ret = bpf_iter_run_prog(prog, &ctx);
2113 	}
2114 
2115 	return ret;
2116 }
2117 
2118 static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
2119 {
2120 	return __bpf_hash_map_seq_show(seq, v);
2121 }
2122 
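/*
 * seq_file ->stop(): when the walk has ended (v == NULL), give the
 * iterator program one final invocation with a NULL element; otherwise
 * just drop the RCU read lock taken while finding the element.
 */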
2123 static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
2124 {
2125 	if (!v)
2126 		(void)__bpf_hash_map_seq_show(seq, NULL);
2127 	else
2128 		rcu_read_unlock();
2129 }
2130 
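/*
 * Set up per-iterator state. For per-CPU hash maps, pre-allocate a buffer
 * large enough to hold one value copy for every possible CPU.
 */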
2131 static int bpf_iter_init_hash_map(void *priv_data,
2132 				  struct bpf_iter_aux_info *aux)
2133 {
2134 	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2135 	struct bpf_map *map = aux->map;
2136 	void *value_buf;
2137 	u32 buf_size;
2138 
2139 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
2140 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2141 		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
2142 		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2143 		if (!value_buf)
2144 			return -ENOMEM;
2145 
2146 		seq_info->percpu_value_buf = value_buf;
2147 	}
2148 
2149 	bpf_map_inc_with_uref(map);
2150 	seq_info->map = map;
2151 	seq_info->htab = container_of(map, struct bpf_htab, map);
2152 	return 0;
2153 }
2154 
2155 static void bpf_iter_fini_hash_map(void *priv_data)
2156 {
2157 	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2158 
2159 	bpf_map_put_with_uref(seq_info->map);
2160 	kfree(seq_info->percpu_value_buf);
2161 }
2162 
2163 static const struct seq_operations bpf_hash_map_seq_ops = {
2164 	.start	= bpf_hash_map_seq_start,
2165 	.next	= bpf_hash_map_seq_next,
2166 	.stop	= bpf_hash_map_seq_stop,
2167 	.show	= bpf_hash_map_seq_show,
2168 };
2169 
2170 static const struct bpf_iter_seq_info iter_seq_info = {
2171 	.seq_ops		= &bpf_hash_map_seq_ops,
2172 	.init_seq_private	= bpf_iter_init_hash_map,
2173 	.fini_seq_private	= bpf_iter_fini_hash_map,
2174 	.seq_priv_size		= sizeof(struct bpf_iter_seq_hash_map_info),
2175 };
2176 
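/*
 * bpf_for_each_map_elem() callback for hash maps: walk every bucket under
 * RCU and invoke callback_fn for each element until it returns a non-zero
 * value. Returns the number of elements visited.
 */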
2177 static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
2178 				   void *callback_ctx, u64 flags)
2179 {
2180 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2181 	struct hlist_nulls_head *head;
2182 	struct hlist_nulls_node *n;
2183 	struct htab_elem *elem;
2184 	int i, num_elems = 0;
2185 	void __percpu *pptr;
2186 	struct bucket *b;
2187 	void *key, *val;
2188 	bool is_percpu;
2189 	u64 ret = 0;
2190 
2191 	cant_migrate();
2192 
2193 	if (flags != 0)
2194 		return -EINVAL;
2195 
2196 	is_percpu = htab_is_percpu(htab);
2197 
2198 	/* Migration has been disabled, so the percpu value prepared here
2199 	 * will be the same as the one seen by the bpf program with
2200 	 * bpf_map_lookup_elem().
2201 	 */
2202 	for (i = 0; i < htab->n_buckets; i++) {
2203 		b = &htab->buckets[i];
2204 		rcu_read_lock();
2205 		head = &b->head;
2206 		hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
2207 			key = elem->key;
2208 			if (is_percpu) {
2209 				/* current cpu value for percpu map */
2210 				pptr = htab_elem_get_ptr(elem, map->key_size);
2211 				val = this_cpu_ptr(pptr);
2212 			} else {
2213 				val = htab_elem_value(elem, map->key_size);
2214 			}
2215 			num_elems++;
2216 			ret = callback_fn((u64)(long)map, (u64)(long)key,
2217 					  (u64)(long)val, (u64)(long)callback_ctx, 0);
2218 			/* return value: 0 - continue, 1 - stop and return */
2219 			if (ret) {
2220 				rcu_read_unlock();
2221 				goto out;
2222 			}
2223 		}
2224 		rcu_read_unlock();
2225 	}
2226 out:
2227 	return num_elems;
2228 }
2229 
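/*
 * Estimate the memory charged to this map: the struct bpf_htab and bucket
 * array plus either the preallocated elements (prealloc) or the elements
 * currently allocated, including per-CPU value storage where applicable.
 */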
2230 static u64 htab_map_mem_usage(const struct bpf_map *map)
2231 {
2232 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2233 	u32 value_size = round_up(htab->map.value_size, 8);
2234 	bool prealloc = htab_is_prealloc(htab);
2235 	bool percpu = htab_is_percpu(htab);
2236 	bool lru = htab_is_lru(htab);
2237 	u64 num_entries, usage;
2238 
2239 	usage = sizeof(struct bpf_htab) +
2240 		sizeof(struct bucket) * htab->n_buckets;
2241 
2242 	if (prealloc) {
2243 		num_entries = map->max_entries;
2244 		if (htab_has_extra_elems(htab))
2245 			num_entries += num_possible_cpus();
2246 
2247 		usage += htab->elem_size * num_entries;
2248 
2249 		if (percpu)
2250 			usage += value_size * num_possible_cpus() * num_entries;
2251 		else if (!lru)
2252 			usage += sizeof(struct htab_elem *) * num_possible_cpus();
2253 	} else {
2254 #define LLIST_NODE_SZ sizeof(struct llist_node)
2255 
2256 		num_entries = htab->use_percpu_counter ?
2257 					  percpu_counter_sum(&htab->pcount) :
2258 					  atomic_read(&htab->count);
2259 		usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
2260 		if (percpu) {
2261 			usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
2262 			usage += value_size * num_possible_cpus() * num_entries;
2263 		}
2264 	}
2265 	return usage;
2266 }
2267 
2268 BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
2269 const struct bpf_map_ops htab_map_ops = {
2270 	.map_meta_equal = bpf_map_meta_equal,
2271 	.map_alloc_check = htab_map_alloc_check,
2272 	.map_alloc = htab_map_alloc,
2273 	.map_free = htab_map_free,
2274 	.map_get_next_key = htab_map_get_next_key,
2275 	.map_release_uref = htab_map_free_internal_structs,
2276 	.map_lookup_elem = htab_map_lookup_elem,
2277 	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
2278 	.map_update_elem = htab_map_update_elem,
2279 	.map_delete_elem = htab_map_delete_elem,
2280 	.map_gen_lookup = htab_map_gen_lookup,
2281 	.map_seq_show_elem = htab_map_seq_show_elem,
2282 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
2283 	.map_for_each_callback = bpf_for_each_hash_elem,
2284 	.map_mem_usage = htab_map_mem_usage,
2285 	BATCH_OPS(htab),
2286 	.map_btf_id = &htab_map_btf_ids[0],
2287 	.iter_seq_info = &iter_seq_info,
2288 };
2289 
2290 const struct bpf_map_ops htab_lru_map_ops = {
2291 	.map_meta_equal = bpf_map_meta_equal,
2292 	.map_alloc_check = htab_map_alloc_check,
2293 	.map_alloc = htab_map_alloc,
2294 	.map_free = htab_map_free,
2295 	.map_get_next_key = htab_map_get_next_key,
2296 	.map_release_uref = htab_map_free_internal_structs,
2297 	.map_lookup_elem = htab_lru_map_lookup_elem,
2298 	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
2299 	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
2300 	.map_update_elem = htab_lru_map_update_elem,
2301 	.map_delete_elem = htab_lru_map_delete_elem,
2302 	.map_gen_lookup = htab_lru_map_gen_lookup,
2303 	.map_seq_show_elem = htab_map_seq_show_elem,
2304 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
2305 	.map_for_each_callback = bpf_for_each_hash_elem,
2306 	.map_mem_usage = htab_map_mem_usage,
2307 	BATCH_OPS(htab_lru),
2308 	.map_btf_id = &htab_map_btf_ids[0],
2309 	.iter_seq_info = &iter_seq_info,
2310 };
2311 
2312 /* Called from eBPF program */
2313 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2314 {
2315 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
2316 
2317 	if (l)
2318 		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2319 	else
2320 		return NULL;
2321 }
2322 
2323 /* inline bpf_map_lookup_elem() call for per-CPU hashmap */
2324 static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
2325 {
2326 	struct bpf_insn *insn = insn_buf;
2327 
2328 	if (!bpf_jit_supports_percpu_insn())
2329 		return -EOPNOTSUPP;
2330 
2331 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2332 		     (void *(*)(struct bpf_map *map, void *key))NULL));
2333 	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
2334 	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
2335 	*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
2336 				offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
2337 	*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
2338 	*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
2339 
2340 	return insn - insn_buf;
2341 }
2342 
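/* Called from eBPF programs via the bpf_map_lookup_percpu_elem() helper */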
2343 static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2344 {
2345 	struct htab_elem *l;
2346 
2347 	if (cpu >= nr_cpu_ids)
2348 		return NULL;
2349 
2350 	l = __htab_map_lookup_elem(map, key);
2351 	if (l)
2352 		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2353 	else
2354 		return NULL;
2355 }
2356 
2357 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2358 {
2359 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
2360 
2361 	if (l) {
2362 		bpf_lru_node_set_ref(&l->lru_node);
2363 		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2364 	}
2365 
2366 	return NULL;
2367 }
2368 
2369 static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2370 {
2371 	struct htab_elem *l;
2372 
2373 	if (cpu >= nr_cpu_ids)
2374 		return NULL;
2375 
2376 	l = __htab_map_lookup_elem(map, key);
2377 	if (l) {
2378 		bpf_lru_node_set_ref(&l->lru_node);
2379 		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2380 	}
2381 
2382 	return NULL;
2383 }
2384 
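/*
 * Called from the syscall path: copy the values of all possible CPUs (or
 * of a single CPU when BPF_F_CPU is set) into a flat output buffer.
 */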
2385 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
2386 {
2387 	struct htab_elem *l;
2388 	void __percpu *pptr;
2389 	int ret = -ENOENT;
2390 	int cpu, off = 0;
2391 	u32 size;
2392 
2393 	/* per_cpu areas are zero-filled and bpf programs can only
2394 	 * access 'value_size' of them, so copying rounded areas
2395 	 * will not leak any kernel data
2396 	 */
2397 	size = round_up(map->value_size, 8);
2398 	rcu_read_lock();
2399 	l = __htab_map_lookup_elem(map, key);
2400 	if (!l)
2401 		goto out;
2402 	ret = 0;
2403 	/* We do not mark the LRU map element here, to avoid messing up
2404 	 * eviction heuristics when user space does a map walk.
2405 	 */
2406 	pptr = htab_elem_get_ptr(l, map->key_size);
2407 	if (map_flags & BPF_F_CPU) {
2408 		cpu = map_flags >> 32;
2409 		copy_map_value(map, value, per_cpu_ptr(pptr, cpu));
2410 		check_and_init_map_value(map, value);
2411 		goto out;
2412 	}
2413 	for_each_possible_cpu(cpu) {
2414 		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
2415 		check_and_init_map_value(map, value + off);
2416 		off += size;
2417 	}
2418 out:
2419 	rcu_read_unlock();
2420 	return ret;
2421 }
2422 
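/*
 * Called from the syscall path: update the element with per-CPU values
 * supplied for all possible CPUs, via the LRU or in-place update path.
 */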
2423 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
2424 			   u64 map_flags)
2425 {
2426 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2427 	int ret;
2428 
2429 	rcu_read_lock();
2430 	if (htab_is_lru(htab))
2431 		ret = __htab_lru_percpu_map_update_elem(map, key, value,
2432 							map_flags, true);
2433 	else
2434 		ret = htab_map_update_elem_in_place(map, key, value, map_flags,
2435 						    true, true);
2436 	rcu_read_unlock();
2437 
2438 	return ret;
2439 }
2440 
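/* Show one element of a per-CPU hash map via BTF, one line per possible CPU */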
2441 static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
2442 					  struct seq_file *m)
2443 {
2444 	struct htab_elem *l;
2445 	void __percpu *pptr;
2446 	int cpu;
2447 
2448 	rcu_read_lock();
2449 
2450 	l = __htab_map_lookup_elem(map, key);
2451 	if (!l) {
2452 		rcu_read_unlock();
2453 		return;
2454 	}
2455 
2456 	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
2457 	seq_puts(m, ": {\n");
2458 	pptr = htab_elem_get_ptr(l, map->key_size);
2459 	for_each_possible_cpu(cpu) {
2460 		seq_printf(m, "\tcpu%d: ", cpu);
2461 		btf_type_seq_show(map->btf, map->btf_value_type_id,
2462 				  per_cpu_ptr(pptr, cpu), m);
2463 		seq_putc(m, '\n');
2464 	}
2465 	seq_puts(m, "}\n");
2466 
2467 	rcu_read_unlock();
2468 }
2469 
2470 const struct bpf_map_ops htab_percpu_map_ops = {
2471 	.map_meta_equal = bpf_map_meta_equal,
2472 	.map_alloc_check = htab_map_alloc_check,
2473 	.map_alloc = htab_map_alloc,
2474 	.map_free = htab_map_free,
2475 	.map_get_next_key = htab_map_get_next_key,
2476 	.map_lookup_elem = htab_percpu_map_lookup_elem,
2477 	.map_gen_lookup = htab_percpu_map_gen_lookup,
2478 	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
2479 	.map_update_elem = htab_percpu_map_update_elem,
2480 	.map_delete_elem = htab_map_delete_elem,
2481 	.map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
2482 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
2483 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
2484 	.map_for_each_callback = bpf_for_each_hash_elem,
2485 	.map_mem_usage = htab_map_mem_usage,
2486 	BATCH_OPS(htab_percpu),
2487 	.map_btf_id = &htab_map_btf_ids[0],
2488 	.iter_seq_info = &iter_seq_info,
2489 };
2490 
2491 const struct bpf_map_ops htab_lru_percpu_map_ops = {
2492 	.map_meta_equal = bpf_map_meta_equal,
2493 	.map_alloc_check = htab_map_alloc_check,
2494 	.map_alloc = htab_map_alloc,
2495 	.map_free = htab_map_free,
2496 	.map_get_next_key = htab_map_get_next_key,
2497 	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
2498 	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
2499 	.map_update_elem = htab_lru_percpu_map_update_elem,
2500 	.map_delete_elem = htab_lru_map_delete_elem,
2501 	.map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
2502 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
2503 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
2504 	.map_for_each_callback = bpf_for_each_hash_elem,
2505 	.map_mem_usage = htab_map_mem_usage,
2506 	BATCH_OPS(htab_lru_percpu),
2507 	.map_btf_id = &htab_map_btf_ids[0],
2508 	.iter_seq_info = &iter_seq_info,
2509 };
2510 
2511 static int fd_htab_map_alloc_check(union bpf_attr *attr)
2512 {
2513 	if (attr->value_size != sizeof(u32))
2514 		return -EINVAL;
2515 	return htab_map_alloc_check(attr);
2516 }
2517 
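/*
 * fd-based htabs (e.g. hash of maps) store pointers taken via
 * map_fd_get_ptr(); drop the reference on every stored pointer before
 * freeing the table itself.
 */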
2518 static void fd_htab_map_free(struct bpf_map *map)
2519 {
2520 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2521 	struct hlist_nulls_node *n;
2522 	struct hlist_nulls_head *head;
2523 	struct htab_elem *l;
2524 	int i;
2525 
2526 	for (i = 0; i < htab->n_buckets; i++) {
2527 		head = select_bucket(htab, i);
2528 
2529 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
2530 			void *ptr = fd_htab_map_get_ptr(map, l);
2531 
2532 			map->ops->map_fd_put_ptr(map, ptr, false);
2533 		}
2534 	}
2535 
2536 	htab_map_free(map);
2537 }
2538 
2539 /* Only called from syscall */
2540 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
2541 {
2542 	void **ptr;
2543 	int ret = 0;
2544 
2545 	if (!map->ops->map_fd_sys_lookup_elem)
2546 		return -ENOTSUPP;
2547 
2548 	rcu_read_lock();
2549 	ptr = htab_map_lookup_elem(map, key);
2550 	if (ptr)
2551 		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
2552 	else
2553 		ret = -ENOENT;
2554 	rcu_read_unlock();
2555 
2556 	return ret;
2557 }
2558 
2559 /* Only called from syscall */
2560 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
2561 				void *key, void *value, u64 map_flags)
2562 {
2563 	void *ptr;
2564 	int ret;
2565 
2566 	ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
2567 	if (IS_ERR(ptr))
2568 		return PTR_ERR(ptr);
2569 
2570 	/* The htab bucket lock is always held during update operations in the
2571 	 * fd htab map, and the following rcu_read_lock() is only used to avoid
2572 	 * the WARN_ON_ONCE in htab_map_update_elem_in_place().
2573 	 */
2574 	rcu_read_lock();
2575 	ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
2576 	rcu_read_unlock();
2577 	if (ret)
2578 		map->ops->map_fd_put_ptr(map, ptr, false);
2579 
2580 	return ret;
2581 }
2582 
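/*
 * hash-of-maps: allocate the map together with a template (meta) of the
 * inner map, which is later used to reject incompatible inner maps.
 */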
2583 static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
2584 {
2585 	struct bpf_map *map, *inner_map_meta;
2586 
2587 	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
2588 	if (IS_ERR(inner_map_meta))
2589 		return inner_map_meta;
2590 
2591 	map = htab_map_alloc(attr);
2592 	if (IS_ERR(map)) {
2593 		bpf_map_meta_free(inner_map_meta);
2594 		return map;
2595 	}
2596 
2597 	map->inner_map_meta = inner_map_meta;
2598 
2599 	return map;
2600 }
2601 
2602 static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
2603 {
2604 	struct bpf_map **inner_map = htab_map_lookup_elem(map, key);
2605 
2606 	if (!inner_map)
2607 		return NULL;
2608 
2609 	return READ_ONCE(*inner_map);
2610 }
2611 
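/*
 * Inline the hash-of-maps lookup: emit instructions that call
 * __htab_map_lookup_elem() and, on a hit, load the inner map pointer
 * stored in the element's value.
 */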
2612 static int htab_of_map_gen_lookup(struct bpf_map *map,
2613 				  struct bpf_insn *insn_buf)
2614 {
2615 	struct bpf_insn *insn = insn_buf;
2616 	const int ret = BPF_REG_0;
2617 
2618 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2619 		     (void *(*)(struct bpf_map *map, void *key))NULL));
2620 	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
2621 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
2622 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
2623 				offsetof(struct htab_elem, key) +
2624 				round_up(map->key_size, 8));
2625 	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
2626 
2627 	return insn - insn_buf;
2628 }
2629 
2630 static void htab_of_map_free(struct bpf_map *map)
2631 {
2632 	bpf_map_meta_free(map->inner_map_meta);
2633 	fd_htab_map_free(map);
2634 }
2635 
2636 const struct bpf_map_ops htab_of_maps_map_ops = {
2637 	.map_alloc_check = fd_htab_map_alloc_check,
2638 	.map_alloc = htab_of_map_alloc,
2639 	.map_free = htab_of_map_free,
2640 	.map_get_next_key = htab_map_get_next_key,
2641 	.map_lookup_elem = htab_of_map_lookup_elem,
2642 	.map_delete_elem = htab_map_delete_elem,
2643 	.map_fd_get_ptr = bpf_map_fd_get_ptr,
2644 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
2645 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
2646 	.map_gen_lookup = htab_of_map_gen_lookup,
2647 	.map_check_btf = map_check_no_btf,
2648 	.map_mem_usage = htab_map_mem_usage,
2649 	BATCH_OPS(htab),
2650 	.map_btf_id = &htab_map_btf_ids[0],
2651 };
2652