xref: /linux/kernel/bpf/hashtab.c (revision 9779193e871b144e34ec4a3e50109b3778a51a69)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  * Copyright (c) 2016 Facebook
4  */
5 #include <linux/bpf.h>
6 #include <linux/btf.h>
7 #include <linux/jhash.h>
8 #include <linux/filter.h>
9 #include <linux/rculist_nulls.h>
10 #include <linux/rcupdate_wait.h>
11 #include <linux/random.h>
12 #include <linux/rhashtable.h>
13 #include <uapi/linux/btf.h>
14 #include <linux/rcupdate_trace.h>
15 #include <linux/btf_ids.h>
16 #include "percpu_freelist.h"
17 #include "bpf_lru_list.h"
18 #include "map_in_map.h"
19 #include <linux/bpf_mem_alloc.h>
20 #include <asm/rqspinlock.h>
21 
22 #define HTAB_CREATE_FLAG_MASK						\
23 	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
24 	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
25 
26 #define BATCH_OPS(_name)			\
27 	.map_lookup_batch =			\
28 	_name##_map_lookup_batch,		\
29 	.map_lookup_and_delete_batch =		\
30 	_name##_map_lookup_and_delete_batch,	\
31 	.map_update_batch =			\
32 	generic_map_update_batch,		\
33 	.map_delete_batch =			\
34 	generic_map_delete_batch
35 
36 /*
37  * The bucket lock has two protection scopes:
38  *
39  * 1) Serializing concurrent operations from BPF programs on different
40  *    CPUs
41  *
42  * 2) Serializing concurrent operations from BPF programs and sys_bpf()
43  *
44  * BPF programs can execute in any context including perf, kprobes and
45  * tracing. As there are almost no limits where perf, kprobes and tracing
46  * can be invoked from the lock operations need to be protected against
47  * deadlocks. Deadlocks can be caused by recursion and by an invocation in
48  * the lock held section when functions which acquire this lock are invoked
49  * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
50  * variable bpf_prog_active, which prevents BPF programs attached to perf
51  * events, kprobes and tracing to be invoked before the prior invocation
52  * from one of these contexts completed. sys_bpf() uses the same mechanism
53  * by pinning the task to the current CPU and incrementing the recursion
54  * protection across the map operation.
55  *
56  * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
57  * operations like memory allocations (even with GFP_ATOMIC) from atomic
58  * contexts. This is required because even with GFP_ATOMIC the memory
59  * allocator calls into code paths which acquire locks with long held lock
60  * sections. To ensure the deterministic behaviour these locks are regular
61  * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
62  * true atomic contexts on an RT kernel are the low level hardware
63  * handling, scheduling, low level interrupt handling, NMIs etc. None of
64  * these contexts should ever do memory allocations.
65  *
66  * As regular device interrupt handlers and soft interrupts are forced into
67  * thread context, the existing code which does
68  *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
69  * just works.
70  *
71  * In theory the BPF locks could be converted to regular spinlocks as well,
72  * but the bucket locks and percpu_freelist locks can be taken from
73  * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
74  * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
75  * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
76  * because there is no memory allocation within the lock held sections. However
77  * after hash map was fully converted to use bpf_mem_alloc, there will be
78  * non-synchronous memory allocation for non-preallocated hash map, so it is
79  * safe to always use raw spinlock for bucket lock.
80  */
81 struct bucket {
82 	struct hlist_nulls_head head;
83 	rqspinlock_t raw_lock;
84 };
85 
86 struct bpf_htab {
87 	struct bpf_map map;
88 	struct bpf_mem_alloc ma;
89 	struct bpf_mem_alloc pcpu_ma;
90 	struct bucket *buckets;
91 	void *elems;
92 	union {
93 		struct pcpu_freelist freelist;
94 		struct bpf_lru lru;
95 	};
96 	struct htab_elem *__percpu *extra_elems;
97 	/* number of elements in non-preallocated hashtable are kept
98 	 * in either pcount or count
99 	 */
100 	struct percpu_counter pcount;
101 	atomic_t count;
102 	bool use_percpu_counter;
103 	u32 n_buckets;	/* number of hash buckets */
104 	u32 elem_size;	/* size of each element in bytes */
105 	u32 hashrnd;
106 };
107 
108 /* each htab element is struct htab_elem + key + value */
109 struct htab_elem {
110 	union {
111 		struct hlist_nulls_node hash_node;
112 		struct {
113 			void *padding;
114 			union {
115 				struct pcpu_freelist_node fnode;
116 				struct htab_elem *batch_flink;
117 			};
118 		};
119 	};
120 	union {
121 		/* pointer to per-cpu pointer */
122 		void *ptr_to_pptr;
123 		struct bpf_lru_node lru_node;
124 	};
125 	u32 hash;
126 	char key[] __aligned(8);
127 };
128 
129 struct htab_btf_record {
130 	struct btf_record *record;
131 	u32 key_size;
132 };
133 
134 static inline bool htab_is_prealloc(const struct bpf_htab *htab)
135 {
136 	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
137 }
138 
139 static void htab_init_buckets(struct bpf_htab *htab)
140 {
141 	unsigned int i;
142 
143 	for (i = 0; i < htab->n_buckets; i++) {
144 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
145 		raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
146 		cond_resched();
147 	}
148 }
149 
150 static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
151 {
152 	unsigned long flags;
153 	int ret;
154 
155 	ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags);
156 	if (ret)
157 		return ret;
158 	*pflags = flags;
159 	return 0;
160 }
161 
162 static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags)
163 {
164 	raw_res_spin_unlock_irqrestore(&b->raw_lock, flags);
165 }
166 
167 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
168 
169 static bool htab_is_lru(const struct bpf_htab *htab)
170 {
171 	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
172 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
173 }
174 
175 static bool htab_is_percpu(const struct bpf_htab *htab)
176 {
177 	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
178 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
179 }
180 
181 static inline bool is_fd_htab(const struct bpf_htab *htab)
182 {
183 	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
184 }
185 
186 static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
187 {
188 	return l->key + round_up(key_size, 8);
189 }
190 
191 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
192 				     void __percpu *pptr)
193 {
194 	*(void __percpu **)htab_elem_value(l, key_size) = pptr;
195 }
196 
197 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
198 {
199 	return *(void __percpu **)htab_elem_value(l, key_size);
200 }
201 
202 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
203 {
204 	return *(void **)htab_elem_value(l, map->key_size);
205 }
206 
207 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
208 {
209 	return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
210 }
211 
212 /* Both percpu and fd htab support in-place update, so no need for
213  * extra elem. LRU itself can remove the least used element, so
214  * there is no need for an extra elem during map_update.
215  */
216 static bool htab_has_extra_elems(struct bpf_htab *htab)
217 {
218 	return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
219 }
220 
221 static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
222 {
223 	u32 num_entries = htab->map.max_entries;
224 	int i;
225 
226 	if (htab_has_extra_elems(htab))
227 		num_entries += num_possible_cpus();
228 
229 	for (i = 0; i < num_entries; i++) {
230 		struct htab_elem *elem;
231 
232 		elem = get_htab_elem(htab, i);
233 		bpf_map_free_internal_structs(&htab->map,
234 					      htab_elem_value(elem, htab->map.key_size));
235 		cond_resched();
236 	}
237 }
238 
239 static void htab_free_prealloced_fields(struct bpf_htab *htab)
240 {
241 	u32 num_entries = htab->map.max_entries;
242 	int i;
243 
244 	if (IS_ERR_OR_NULL(htab->map.record))
245 		return;
246 	if (htab_has_extra_elems(htab))
247 		num_entries += num_possible_cpus();
248 	for (i = 0; i < num_entries; i++) {
249 		struct htab_elem *elem;
250 
251 		elem = get_htab_elem(htab, i);
252 		if (htab_is_percpu(htab)) {
253 			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
254 			int cpu;
255 
256 			for_each_possible_cpu(cpu) {
257 				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
258 				cond_resched();
259 			}
260 		} else {
261 			bpf_obj_free_fields(htab->map.record,
262 					    htab_elem_value(elem, htab->map.key_size));
263 			cond_resched();
264 		}
265 		cond_resched();
266 	}
267 }
268 
269 static void htab_free_elems(struct bpf_htab *htab)
270 {
271 	int i;
272 
273 	if (!htab_is_percpu(htab))
274 		goto free_elems;
275 
276 	for (i = 0; i < htab->map.max_entries; i++) {
277 		void __percpu *pptr;
278 
279 		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
280 					 htab->map.key_size);
281 		free_percpu(pptr);
282 		cond_resched();
283 	}
284 free_elems:
285 	bpf_map_area_free(htab->elems);
286 }
287 
288 /* The LRU list has a lock (lru_lock). Each htab bucket has a lock
289  * (bucket_lock). If both locks need to be acquired together, the lock
290  * order is always lru_lock -> bucket_lock and this only happens in
291  * bpf_lru_list.c logic. For example, certain code path of
292  * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
293  * will acquire lru_lock first followed by acquiring bucket_lock.
294  *
295  * In hashtab.c, to avoid deadlock, lock acquisition of
296  * bucket_lock followed by lru_lock is not allowed. In such cases,
297  * bucket_lock needs to be released first before acquiring lru_lock.
298  */
299 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
300 					  u32 hash)
301 {
302 	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
303 	struct htab_elem *l;
304 
305 	if (node) {
306 		bpf_map_inc_elem_count(&htab->map);
307 		l = container_of(node, struct htab_elem, lru_node);
308 		memcpy(l->key, key, htab->map.key_size);
309 		return l;
310 	}
311 
312 	return NULL;
313 }
314 
315 static int prealloc_init(struct bpf_htab *htab)
316 {
317 	u32 num_entries = htab->map.max_entries;
318 	int err = -ENOMEM, i;
319 
320 	if (htab_has_extra_elems(htab))
321 		num_entries += num_possible_cpus();
322 
323 	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
324 					 htab->map.numa_node);
325 	if (!htab->elems)
326 		return -ENOMEM;
327 
328 	if (!htab_is_percpu(htab))
329 		goto skip_percpu_elems;
330 
331 	for (i = 0; i < num_entries; i++) {
332 		u32 size = round_up(htab->map.value_size, 8);
333 		void __percpu *pptr;
334 
335 		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
336 					    GFP_USER | __GFP_NOWARN);
337 		if (!pptr)
338 			goto free_elems;
339 		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
340 				  pptr);
341 		cond_resched();
342 	}
343 
344 skip_percpu_elems:
345 	if (htab_is_lru(htab))
346 		err = bpf_lru_init(&htab->lru,
347 				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
348 				   offsetof(struct htab_elem, hash) -
349 				   offsetof(struct htab_elem, lru_node),
350 				   htab_lru_map_delete_node,
351 				   htab);
352 	else
353 		err = pcpu_freelist_init(&htab->freelist);
354 
355 	if (err)
356 		goto free_elems;
357 
358 	if (htab_is_lru(htab))
359 		bpf_lru_populate(&htab->lru, htab->elems,
360 				 offsetof(struct htab_elem, lru_node),
361 				 htab->elem_size, num_entries);
362 	else
363 		pcpu_freelist_populate(&htab->freelist,
364 				       htab->elems + offsetof(struct htab_elem, fnode),
365 				       htab->elem_size, num_entries);
366 
367 	return 0;
368 
369 free_elems:
370 	htab_free_elems(htab);
371 	return err;
372 }
373 
374 static void prealloc_destroy(struct bpf_htab *htab)
375 {
376 	htab_free_elems(htab);
377 
378 	if (htab_is_lru(htab))
379 		bpf_lru_destroy(&htab->lru);
380 	else
381 		pcpu_freelist_destroy(&htab->freelist);
382 }
383 
384 static int alloc_extra_elems(struct bpf_htab *htab)
385 {
386 	struct htab_elem *__percpu *pptr, *l_new;
387 	struct pcpu_freelist_node *l;
388 	int cpu;
389 
390 	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
391 				    GFP_USER | __GFP_NOWARN);
392 	if (!pptr)
393 		return -ENOMEM;
394 
395 	for_each_possible_cpu(cpu) {
396 		l = pcpu_freelist_pop(&htab->freelist);
397 		/* pop will succeed, since prealloc_init()
398 		 * preallocated extra num_possible_cpus elements
399 		 */
400 		l_new = container_of(l, struct htab_elem, fnode);
401 		*per_cpu_ptr(pptr, cpu) = l_new;
402 	}
403 	htab->extra_elems = pptr;
404 	return 0;
405 }
406 
407 /* Called from syscall */
408 static int htab_map_alloc_check(union bpf_attr *attr)
409 {
410 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
411 		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
412 	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
413 		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
414 	/* percpu_lru means each cpu has its own LRU list.
415 	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
416 	 * the map's value itself is percpu.  percpu_lru has
417 	 * nothing to do with the map's value.
418 	 */
419 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
420 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
421 	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
422 	int numa_node = bpf_map_attr_numa_node(attr);
423 
424 	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
425 		     offsetof(struct htab_elem, hash_node.pprev));
426 
427 	if (zero_seed && !capable(CAP_SYS_ADMIN))
428 		/* Guard against local DoS, and discourage production use. */
429 		return -EPERM;
430 
431 	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
432 	    !bpf_map_flags_access_ok(attr->map_flags))
433 		return -EINVAL;
434 
435 	if (!lru && percpu_lru)
436 		return -EINVAL;
437 
438 	if (lru && !prealloc)
439 		return -ENOTSUPP;
440 
441 	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
442 		return -EINVAL;
443 
444 	/* check sanity of attributes.
445 	 * value_size == 0 may be allowed in the future to use map as a set
446 	 */
447 	if (attr->max_entries == 0 || attr->key_size == 0 ||
448 	    attr->value_size == 0)
449 		return -EINVAL;
450 
451 	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
452 	   sizeof(struct htab_elem))
453 		/* if key_size + value_size is bigger, the user space won't be
454 		 * able to access the elements via bpf syscall. This check
455 		 * also makes sure that the elem_size doesn't overflow and it's
456 		 * kmalloc-able later in htab_map_update_elem()
457 		 */
458 		return -E2BIG;
459 	/* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
460 	if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
461 		return -E2BIG;
462 
463 	return 0;
464 }
465 
466 static void htab_mem_dtor(void *obj, void *ctx)
467 {
468 	struct htab_btf_record *hrec = ctx;
469 	struct htab_elem *elem = obj;
470 	void *map_value;
471 
472 	if (IS_ERR_OR_NULL(hrec->record))
473 		return;
474 
475 	map_value = htab_elem_value(elem, hrec->key_size);
476 	bpf_obj_free_fields(hrec->record, map_value);
477 }
478 
479 static void htab_pcpu_mem_dtor(void *obj, void *ctx)
480 {
481 	void __percpu *pptr = *(void __percpu **)obj;
482 	struct htab_btf_record *hrec = ctx;
483 	int cpu;
484 
485 	if (IS_ERR_OR_NULL(hrec->record))
486 		return;
487 
488 	for_each_possible_cpu(cpu)
489 		bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu));
490 }
491 
492 static void htab_dtor_ctx_free(void *ctx)
493 {
494 	struct htab_btf_record *hrec = ctx;
495 
496 	btf_record_free(hrec->record);
497 	kfree(ctx);
498 }
499 
500 static int bpf_ma_set_dtor(struct bpf_map *map, struct bpf_mem_alloc *ma,
501 			   void (*dtor)(void *, void *))
502 {
503 	struct htab_btf_record *hrec;
504 	int err;
505 
506 	/* No need for dtors. */
507 	if (IS_ERR_OR_NULL(map->record))
508 		return 0;
509 
510 	hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
511 	if (!hrec)
512 		return -ENOMEM;
513 	hrec->key_size = map->key_size;
514 	hrec->record = btf_record_dup(map->record);
515 	if (IS_ERR(hrec->record)) {
516 		err = PTR_ERR(hrec->record);
517 		kfree(hrec);
518 		return err;
519 	}
520 	bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
521 	return 0;
522 }
523 
524 static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
525 			      const struct btf_type *key_type, const struct btf_type *value_type)
526 {
527 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
528 
529 	if (htab_is_prealloc(htab))
530 		return 0;
531 	/*
532 	 * We must set the dtor using this callback, as map's BTF record is not
533 	 * populated in htab_map_alloc(), so it will always appear as NULL.
534 	 */
535 	if (htab_is_percpu(htab))
536 		return bpf_ma_set_dtor(map, &htab->pcpu_ma, htab_pcpu_mem_dtor);
537 	else
538 		return bpf_ma_set_dtor(map, &htab->ma, htab_mem_dtor);
539 }
540 
541 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
542 {
543 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
544 		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
545 	/* percpu_lru means each cpu has its own LRU list.
546 	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
547 	 * the map's value itself is percpu.  percpu_lru has
548 	 * nothing to do with the map's value.
549 	 */
550 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
551 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
552 	struct bpf_htab *htab;
553 	int err;
554 
555 	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
556 	if (!htab)
557 		return ERR_PTR(-ENOMEM);
558 
559 	bpf_map_init_from_attr(&htab->map, attr);
560 
561 	if (percpu_lru) {
562 		/* ensure each CPU's lru list has >=1 elements.
563 		 * since we are at it, make each lru list has the same
564 		 * number of elements.
565 		 */
566 		htab->map.max_entries = roundup(attr->max_entries,
567 						num_possible_cpus());
568 		if (htab->map.max_entries < attr->max_entries)
569 			htab->map.max_entries = rounddown(attr->max_entries,
570 							  num_possible_cpus());
571 	}
572 
573 	/* hash table size must be power of 2; roundup_pow_of_two() can overflow
574 	 * into UB on 32-bit arches, so check that first
575 	 */
576 	err = -E2BIG;
577 	if (htab->map.max_entries > 1UL << 31)
578 		goto free_htab;
579 
580 	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
581 
582 	htab->elem_size = sizeof(struct htab_elem) +
583 			  round_up(htab->map.key_size, 8);
584 	if (percpu)
585 		htab->elem_size += sizeof(void *);
586 	else
587 		htab->elem_size += round_up(htab->map.value_size, 8);
588 
589 	/* check for u32 overflow */
590 	if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
591 		goto free_htab;
592 
593 	err = bpf_map_init_elem_count(&htab->map);
594 	if (err)
595 		goto free_htab;
596 
597 	err = -ENOMEM;
598 	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
599 					   sizeof(struct bucket),
600 					   htab->map.numa_node);
601 	if (!htab->buckets)
602 		goto free_elem_count;
603 
604 	if (htab->map.map_flags & BPF_F_ZERO_SEED)
605 		htab->hashrnd = 0;
606 	else
607 		htab->hashrnd = get_random_u32();
608 
609 	htab_init_buckets(htab);
610 
611 /* compute_batch_value() computes batch value as num_online_cpus() * 2
612  * and __percpu_counter_compare() needs
613  * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
614  * for percpu_counter to be faster than atomic_t. In practice the average bpf
615  * hash map size is 10k, which means that a system with 64 cpus will fill
616  * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
617  * define our own batch count as 32 then 10k hash map can be filled up to 80%:
618  * 10k - 8k > 32 _batch_ * 64 _cpus_
619  * and __percpu_counter_compare() will still be fast. At that point hash map
620  * collisions will dominate its performance anyway. Assume that hash map filled
621  * to 50+% isn't going to be O(1) and use the following formula to choose
622  * between percpu_counter and atomic_t.
623  */
624 #define PERCPU_COUNTER_BATCH 32
625 	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
626 		htab->use_percpu_counter = true;
627 
628 	if (htab->use_percpu_counter) {
629 		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
630 		if (err)
631 			goto free_map_locked;
632 	}
633 
634 	if (prealloc) {
635 		err = prealloc_init(htab);
636 		if (err)
637 			goto free_map_locked;
638 
639 		if (htab_has_extra_elems(htab)) {
640 			err = alloc_extra_elems(htab);
641 			if (err)
642 				goto free_prealloc;
643 		}
644 	} else {
645 		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
646 		if (err)
647 			goto free_map_locked;
648 		if (percpu) {
649 			err = bpf_mem_alloc_init(&htab->pcpu_ma,
650 						 round_up(htab->map.value_size, 8), true);
651 			if (err)
652 				goto free_map_locked;
653 		}
654 	}
655 
656 	return &htab->map;
657 
658 free_prealloc:
659 	prealloc_destroy(htab);
660 free_map_locked:
661 	if (htab->use_percpu_counter)
662 		percpu_counter_destroy(&htab->pcount);
663 	bpf_map_area_free(htab->buckets);
664 	bpf_mem_alloc_destroy(&htab->pcpu_ma);
665 	bpf_mem_alloc_destroy(&htab->ma);
666 free_elem_count:
667 	bpf_map_free_elem_count(&htab->map);
668 free_htab:
669 	bpf_map_area_free(htab);
670 	return ERR_PTR(err);
671 }
672 
673 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
674 {
675 	if (likely(key_len % 4 == 0))
676 		return jhash2(key, key_len / 4, hashrnd);
677 	return jhash(key, key_len, hashrnd);
678 }
679 
680 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
681 {
682 	return &htab->buckets[hash & (htab->n_buckets - 1)];
683 }
684 
685 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
686 {
687 	return &__select_bucket(htab, hash)->head;
688 }
689 
690 /* this lookup function can only be called with bucket lock taken */
691 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
692 					 void *key, u32 key_size)
693 {
694 	struct hlist_nulls_node *n;
695 	struct htab_elem *l;
696 
697 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
698 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
699 			return l;
700 
701 	return NULL;
702 }
703 
704 /* can be called without bucket lock. it will repeat the loop in
705  * the unlikely event when elements moved from one bucket into another
706  * while link list is being walked
707  */
708 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
709 					       u32 hash, void *key,
710 					       u32 key_size, u32 n_buckets)
711 {
712 	struct hlist_nulls_node *n;
713 	struct htab_elem *l;
714 
715 again:
716 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
717 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
718 			return l;
719 
720 	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
721 		goto again;
722 
723 	return NULL;
724 }
725 
726 /* Called from syscall or from eBPF program directly, so
727  * arguments have to match bpf_map_lookup_elem() exactly.
728  * The return value is adjusted by BPF instructions
729  * in htab_map_gen_lookup().
730  */
731 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
732 {
733 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
734 	struct hlist_nulls_head *head;
735 	struct htab_elem *l;
736 	u32 hash, key_size;
737 
738 	WARN_ON_ONCE(!bpf_rcu_lock_held());
739 
740 	key_size = map->key_size;
741 
742 	hash = htab_map_hash(key, key_size, htab->hashrnd);
743 
744 	head = select_bucket(htab, hash);
745 
746 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
747 
748 	return l;
749 }
750 
751 static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
752 {
753 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
754 
755 	if (l)
756 		return htab_elem_value(l, map->key_size);
757 
758 	return NULL;
759 }
760 
761 /* inline bpf_map_lookup_elem() call.
762  * Instead of:
763  * bpf_prog
764  *   bpf_map_lookup_elem
765  *     map->ops->map_lookup_elem
766  *       htab_map_lookup_elem
767  *         __htab_map_lookup_elem
768  * do:
769  * bpf_prog
770  *   __htab_map_lookup_elem
771  */
772 static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
773 {
774 	struct bpf_insn *insn = insn_buf;
775 	const int ret = BPF_REG_0;
776 
777 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
778 		     (void *(*)(struct bpf_map *map, void *key))NULL));
779 	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
780 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
781 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
782 				offsetof(struct htab_elem, key) +
783 				round_up(map->key_size, 8));
784 	return insn - insn_buf;
785 }
786 
787 static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
788 							void *key, const bool mark)
789 {
790 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
791 
792 	if (l) {
793 		if (mark)
794 			bpf_lru_node_set_ref(&l->lru_node);
795 		return htab_elem_value(l, map->key_size);
796 	}
797 
798 	return NULL;
799 }
800 
801 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
802 {
803 	return __htab_lru_map_lookup_elem(map, key, true);
804 }
805 
806 static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
807 {
808 	return __htab_lru_map_lookup_elem(map, key, false);
809 }
810 
811 static int htab_lru_map_gen_lookup(struct bpf_map *map,
812 				   struct bpf_insn *insn_buf)
813 {
814 	struct bpf_insn *insn = insn_buf;
815 	const int ret = BPF_REG_0;
816 	const int ref_reg = BPF_REG_1;
817 
818 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
819 		     (void *(*)(struct bpf_map *map, void *key))NULL));
820 	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
821 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
822 	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
823 			      offsetof(struct htab_elem, lru_node) +
824 			      offsetof(struct bpf_lru_node, ref));
825 	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
826 	*insn++ = BPF_ST_MEM(BPF_B, ret,
827 			     offsetof(struct htab_elem, lru_node) +
828 			     offsetof(struct bpf_lru_node, ref),
829 			     1);
830 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
831 				offsetof(struct htab_elem, key) +
832 				round_up(map->key_size, 8));
833 	return insn - insn_buf;
834 }
835 
836 static void check_and_free_fields(struct bpf_htab *htab,
837 				  struct htab_elem *elem)
838 {
839 	if (IS_ERR_OR_NULL(htab->map.record))
840 		return;
841 
842 	if (htab_is_percpu(htab)) {
843 		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
844 		int cpu;
845 
846 		for_each_possible_cpu(cpu)
847 			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
848 	} else {
849 		void *map_value = htab_elem_value(elem, htab->map.key_size);
850 
851 		bpf_obj_free_fields(htab->map.record, map_value);
852 	}
853 }
854 
855 /* It is called from the bpf_lru_list when the LRU needs to delete
856  * older elements from the htab.
857  */
858 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
859 {
860 	struct bpf_htab *htab = arg;
861 	struct htab_elem *l = NULL, *tgt_l;
862 	struct hlist_nulls_head *head;
863 	struct hlist_nulls_node *n;
864 	unsigned long flags;
865 	struct bucket *b;
866 	int ret;
867 
868 	tgt_l = container_of(node, struct htab_elem, lru_node);
869 	b = __select_bucket(htab, tgt_l->hash);
870 	head = &b->head;
871 
872 	ret = htab_lock_bucket(b, &flags);
873 	if (ret)
874 		return false;
875 
876 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
877 		if (l == tgt_l) {
878 			hlist_nulls_del_rcu(&l->hash_node);
879 			bpf_map_dec_elem_count(&htab->map);
880 			break;
881 		}
882 
883 	htab_unlock_bucket(b, flags);
884 
885 	if (l == tgt_l)
886 		check_and_free_fields(htab, l);
887 	return l == tgt_l;
888 }
889 
890 /* Called from syscall */
891 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
892 {
893 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
894 	struct hlist_nulls_head *head;
895 	struct htab_elem *l, *next_l;
896 	u32 hash, key_size;
897 	int i = 0;
898 
899 	WARN_ON_ONCE(!rcu_read_lock_held());
900 
901 	key_size = map->key_size;
902 
903 	if (!key)
904 		goto find_first_elem;
905 
906 	hash = htab_map_hash(key, key_size, htab->hashrnd);
907 
908 	head = select_bucket(htab, hash);
909 
910 	/* lookup the key */
911 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
912 
913 	if (!l)
914 		goto find_first_elem;
915 
916 	/* key was found, get next key in the same bucket */
917 	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
918 				  struct htab_elem, hash_node);
919 
920 	if (next_l) {
921 		/* if next elem in this hash list is non-zero, just return it */
922 		memcpy(next_key, next_l->key, key_size);
923 		return 0;
924 	}
925 
926 	/* no more elements in this hash list, go to the next bucket */
927 	i = hash & (htab->n_buckets - 1);
928 	i++;
929 
930 find_first_elem:
931 	/* iterate over buckets */
932 	for (; i < htab->n_buckets; i++) {
933 		head = select_bucket(htab, i);
934 
935 		/* pick first element in the bucket */
936 		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
937 					  struct htab_elem, hash_node);
938 		if (next_l) {
939 			/* if it's not empty, just return it */
940 			memcpy(next_key, next_l->key, key_size);
941 			return 0;
942 		}
943 	}
944 
945 	/* iterated over all buckets and all elements */
946 	return -ENOENT;
947 }
948 
949 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
950 {
951 	check_and_free_fields(htab, l);
952 
953 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
954 		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
955 	bpf_mem_cache_free(&htab->ma, l);
956 }
957 
958 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
959 {
960 	struct bpf_map *map = &htab->map;
961 	void *ptr;
962 
963 	if (map->ops->map_fd_put_ptr) {
964 		ptr = fd_htab_map_get_ptr(map, l);
965 		map->ops->map_fd_put_ptr(map, ptr, true);
966 	}
967 }
968 
969 static bool is_map_full(struct bpf_htab *htab)
970 {
971 	if (htab->use_percpu_counter)
972 		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
973 						PERCPU_COUNTER_BATCH) >= 0;
974 	return atomic_read(&htab->count) >= htab->map.max_entries;
975 }
976 
977 static void inc_elem_count(struct bpf_htab *htab)
978 {
979 	bpf_map_inc_elem_count(&htab->map);
980 
981 	if (htab->use_percpu_counter)
982 		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
983 	else
984 		atomic_inc(&htab->count);
985 }
986 
987 static void dec_elem_count(struct bpf_htab *htab)
988 {
989 	bpf_map_dec_elem_count(&htab->map);
990 
991 	if (htab->use_percpu_counter)
992 		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
993 	else
994 		atomic_dec(&htab->count);
995 }
996 
997 
998 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
999 {
1000 	htab_put_fd_value(htab, l);
1001 
1002 	if (htab_is_prealloc(htab)) {
1003 		bpf_map_dec_elem_count(&htab->map);
1004 		check_and_free_fields(htab, l);
1005 		pcpu_freelist_push(&htab->freelist, &l->fnode);
1006 	} else {
1007 		dec_elem_count(htab);
1008 		htab_elem_free(htab, l);
1009 	}
1010 }
1011 
1012 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
1013 			    void *value, bool onallcpus, u64 map_flags)
1014 {
1015 	void *ptr;
1016 
1017 	if (!onallcpus) {
1018 		/* copy true value_size bytes */
1019 		ptr = this_cpu_ptr(pptr);
1020 		copy_map_value(&htab->map, ptr, value);
1021 		bpf_obj_free_fields(htab->map.record, ptr);
1022 	} else {
1023 		u32 size = round_up(htab->map.value_size, 8);
1024 		void *val;
1025 		int cpu;
1026 
1027 		if (map_flags & BPF_F_CPU) {
1028 			cpu = map_flags >> 32;
1029 			ptr = per_cpu_ptr(pptr, cpu);
1030 			copy_map_value(&htab->map, ptr, value);
1031 			bpf_obj_free_fields(htab->map.record, ptr);
1032 			return;
1033 		}
1034 
1035 		for_each_possible_cpu(cpu) {
1036 			ptr = per_cpu_ptr(pptr, cpu);
1037 			val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
1038 			copy_map_value(&htab->map, ptr, val);
1039 			bpf_obj_free_fields(htab->map.record, ptr);
1040 		}
1041 	}
1042 }
1043 
1044 static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
1045 			    void *value, bool onallcpus, u64 map_flags)
1046 {
1047 	/* When not setting the initial value on all cpus, zero-fill element
1048 	 * values for other cpus. Otherwise, bpf program has no way to ensure
1049 	 * known initial values for cpus other than current one
1050 	 * (onallcpus=false always when coming from bpf prog).
1051 	 */
1052 	if (!onallcpus) {
1053 		int current_cpu = raw_smp_processor_id();
1054 		int cpu;
1055 
1056 		for_each_possible_cpu(cpu) {
1057 			if (cpu == current_cpu)
1058 				copy_map_value(&htab->map, per_cpu_ptr(pptr, cpu), value);
1059 			else /* Since elem is preallocated, we cannot touch special fields */
1060 				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
1061 		}
1062 	} else {
1063 		pcpu_copy_value(htab, pptr, value, onallcpus, map_flags);
1064 	}
1065 }
1066 
1067 static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
1068 {
1069 	return is_fd_htab(htab) && BITS_PER_LONG == 64;
1070 }
1071 
1072 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
1073 					 void *value, u32 key_size, u32 hash,
1074 					 bool percpu, bool onallcpus,
1075 					 struct htab_elem *old_elem, u64 map_flags)
1076 {
1077 	u32 size = htab->map.value_size;
1078 	bool prealloc = htab_is_prealloc(htab);
1079 	struct htab_elem *l_new, **pl_new;
1080 	void __percpu *pptr;
1081 
1082 	if (prealloc) {
1083 		if (old_elem) {
1084 			/* if we're updating the existing element,
1085 			 * use per-cpu extra elems to avoid freelist_pop/push
1086 			 */
1087 			pl_new = this_cpu_ptr(htab->extra_elems);
1088 			l_new = *pl_new;
1089 			*pl_new = old_elem;
1090 		} else {
1091 			struct pcpu_freelist_node *l;
1092 
1093 			l = __pcpu_freelist_pop(&htab->freelist);
1094 			if (!l)
1095 				return ERR_PTR(-E2BIG);
1096 			l_new = container_of(l, struct htab_elem, fnode);
1097 			bpf_map_inc_elem_count(&htab->map);
1098 		}
1099 	} else {
1100 		if (is_map_full(htab))
1101 			if (!old_elem)
1102 				/* when map is full and update() is replacing
1103 				 * old element, it's ok to allocate, since
1104 				 * old element will be freed immediately.
1105 				 * Otherwise return an error
1106 				 */
1107 				return ERR_PTR(-E2BIG);
1108 		inc_elem_count(htab);
1109 		l_new = bpf_mem_cache_alloc(&htab->ma);
1110 		if (!l_new) {
1111 			l_new = ERR_PTR(-ENOMEM);
1112 			goto dec_count;
1113 		}
1114 	}
1115 
1116 	memcpy(l_new->key, key, key_size);
1117 	if (percpu) {
1118 		if (prealloc) {
1119 			pptr = htab_elem_get_ptr(l_new, key_size);
1120 		} else {
1121 			/* alloc_percpu zero-fills */
1122 			void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
1123 
1124 			if (!ptr) {
1125 				bpf_mem_cache_free(&htab->ma, l_new);
1126 				l_new = ERR_PTR(-ENOMEM);
1127 				goto dec_count;
1128 			}
1129 			l_new->ptr_to_pptr = ptr;
1130 			pptr = *(void __percpu **)ptr;
1131 		}
1132 
1133 		pcpu_init_value(htab, pptr, value, onallcpus, map_flags);
1134 
1135 		if (!prealloc)
1136 			htab_elem_set_ptr(l_new, key_size, pptr);
1137 	} else if (fd_htab_map_needs_adjust(htab)) {
1138 		size = round_up(size, 8);
1139 		memcpy(htab_elem_value(l_new, key_size), value, size);
1140 	} else if (map_flags & BPF_F_LOCK) {
1141 		copy_map_value_locked(&htab->map,
1142 				      htab_elem_value(l_new, key_size),
1143 				      value, false);
1144 	} else {
1145 		copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
1146 	}
1147 
1148 	l_new->hash = hash;
1149 	return l_new;
1150 dec_count:
1151 	dec_elem_count(htab);
1152 	return l_new;
1153 }
1154 
1155 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
1156 		       u64 map_flags)
1157 {
1158 	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
1159 		/* elem already exists */
1160 		return -EEXIST;
1161 
1162 	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
1163 		/* elem doesn't exist, cannot update it */
1164 		return -ENOENT;
1165 
1166 	return 0;
1167 }
1168 
1169 /* Called from syscall or from eBPF program */
1170 static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
1171 				 u64 map_flags)
1172 {
1173 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1174 	struct htab_elem *l_new, *l_old;
1175 	struct hlist_nulls_head *head;
1176 	unsigned long flags;
1177 	struct bucket *b;
1178 	u32 key_size, hash;
1179 	int ret;
1180 
1181 	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
1182 		/* unknown flags */
1183 		return -EINVAL;
1184 
1185 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1186 
1187 	key_size = map->key_size;
1188 
1189 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1190 
1191 	b = __select_bucket(htab, hash);
1192 	head = &b->head;
1193 
1194 	if (unlikely(map_flags & BPF_F_LOCK)) {
1195 		if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1196 			return -EINVAL;
1197 		/* find an element without taking the bucket lock */
1198 		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
1199 					      htab->n_buckets);
1200 		ret = check_flags(htab, l_old, map_flags);
1201 		if (ret)
1202 			return ret;
1203 		if (l_old) {
1204 			/* grab the element lock and update value in place */
1205 			copy_map_value_locked(map,
1206 					      htab_elem_value(l_old, key_size),
1207 					      value, false);
1208 			return 0;
1209 		}
1210 		/* fall through, grab the bucket lock and lookup again.
1211 		 * 99.9% chance that the element won't be found,
1212 		 * but second lookup under lock has to be done.
1213 		 */
1214 	}
1215 
1216 	ret = htab_lock_bucket(b, &flags);
1217 	if (ret)
1218 		return ret;
1219 
1220 	l_old = lookup_elem_raw(head, hash, key, key_size);
1221 
1222 	ret = check_flags(htab, l_old, map_flags);
1223 	if (ret)
1224 		goto err;
1225 
1226 	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
1227 		/* first lookup without the bucket lock didn't find the element,
1228 		 * but second lookup with the bucket lock found it.
1229 		 * This case is highly unlikely, but has to be dealt with:
1230 		 * grab the element lock in addition to the bucket lock
1231 		 * and update element in place
1232 		 */
1233 		copy_map_value_locked(map,
1234 				      htab_elem_value(l_old, key_size),
1235 				      value, false);
1236 		ret = 0;
1237 		goto err;
1238 	}
1239 
1240 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
1241 				l_old, map_flags);
1242 	if (IS_ERR(l_new)) {
1243 		/* all pre-allocated elements are in use or memory exhausted */
1244 		ret = PTR_ERR(l_new);
1245 		goto err;
1246 	}
1247 
1248 	/* add new element to the head of the list, so that
1249 	 * concurrent search will find it before old elem
1250 	 */
1251 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1252 	if (l_old) {
1253 		hlist_nulls_del_rcu(&l_old->hash_node);
1254 
1255 		/* l_old has already been stashed in htab->extra_elems, free
1256 		 * its special fields before it is available for reuse.
1257 		 */
1258 		if (htab_is_prealloc(htab))
1259 			check_and_free_fields(htab, l_old);
1260 	}
1261 	htab_unlock_bucket(b, flags);
1262 	if (l_old && !htab_is_prealloc(htab))
1263 		free_htab_elem(htab, l_old);
1264 	return 0;
1265 err:
1266 	htab_unlock_bucket(b, flags);
1267 	return ret;
1268 }
1269 
1270 static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
1271 {
1272 	check_and_free_fields(htab, elem);
1273 	bpf_map_dec_elem_count(&htab->map);
1274 	bpf_lru_push_free(&htab->lru, &elem->lru_node);
1275 }
1276 
1277 static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
1278 				     u64 map_flags)
1279 {
1280 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1281 	struct htab_elem *l_new, *l_old = NULL;
1282 	struct hlist_nulls_head *head;
1283 	unsigned long flags;
1284 	struct bucket *b;
1285 	u32 key_size, hash;
1286 	int ret;
1287 
1288 	if (unlikely(map_flags > BPF_EXIST))
1289 		/* unknown flags */
1290 		return -EINVAL;
1291 
1292 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1293 
1294 	key_size = map->key_size;
1295 
1296 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1297 
1298 	b = __select_bucket(htab, hash);
1299 	head = &b->head;
1300 
1301 	/* For LRU, we need to alloc before taking bucket's
1302 	 * spinlock because getting free nodes from LRU may need
1303 	 * to remove older elements from htab and this removal
1304 	 * operation will need a bucket lock.
1305 	 */
1306 	l_new = prealloc_lru_pop(htab, key, hash);
1307 	if (!l_new)
1308 		return -ENOMEM;
1309 	copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
1310 
1311 	ret = htab_lock_bucket(b, &flags);
1312 	if (ret)
1313 		goto err_lock_bucket;
1314 
1315 	l_old = lookup_elem_raw(head, hash, key, key_size);
1316 
1317 	ret = check_flags(htab, l_old, map_flags);
1318 	if (ret)
1319 		goto err;
1320 
1321 	/* add new element to the head of the list, so that
1322 	 * concurrent search will find it before old elem
1323 	 */
1324 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1325 	if (l_old) {
1326 		bpf_lru_node_set_ref(&l_new->lru_node);
1327 		hlist_nulls_del_rcu(&l_old->hash_node);
1328 	}
1329 	ret = 0;
1330 
1331 err:
1332 	htab_unlock_bucket(b, flags);
1333 
1334 err_lock_bucket:
1335 	if (ret)
1336 		htab_lru_push_free(htab, l_new);
1337 	else if (l_old)
1338 		htab_lru_push_free(htab, l_old);
1339 
1340 	return ret;
1341 }
1342 
1343 static int htab_map_check_update_flags(bool onallcpus, u64 map_flags)
1344 {
1345 	if (unlikely(!onallcpus && map_flags > BPF_EXIST))
1346 		return -EINVAL;
1347 	if (unlikely(onallcpus && ((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS)))
1348 		return -EINVAL;
1349 	return 0;
1350 }
1351 
1352 static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
1353 					  void *value, u64 map_flags,
1354 					  bool percpu, bool onallcpus)
1355 {
1356 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1357 	struct htab_elem *l_new, *l_old;
1358 	struct hlist_nulls_head *head;
1359 	void *old_map_ptr = NULL;
1360 	unsigned long flags;
1361 	struct bucket *b;
1362 	u32 key_size, hash;
1363 	int ret;
1364 
1365 	ret = htab_map_check_update_flags(onallcpus, map_flags);
1366 	if (unlikely(ret))
1367 		return ret;
1368 
1369 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1370 
1371 	key_size = map->key_size;
1372 
1373 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1374 
1375 	b = __select_bucket(htab, hash);
1376 	head = &b->head;
1377 
1378 	ret = htab_lock_bucket(b, &flags);
1379 	if (ret)
1380 		return ret;
1381 
1382 	l_old = lookup_elem_raw(head, hash, key, key_size);
1383 
1384 	ret = check_flags(htab, l_old, map_flags);
1385 	if (ret)
1386 		goto err;
1387 
1388 	if (l_old) {
1389 		/* Update value in-place */
1390 		if (percpu) {
1391 			pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1392 					value, onallcpus, map_flags);
1393 		} else {
1394 			void **inner_map_pptr = htab_elem_value(l_old, key_size);
1395 
1396 			old_map_ptr = *inner_map_pptr;
1397 			WRITE_ONCE(*inner_map_pptr, *(void **)value);
1398 		}
1399 	} else {
1400 		l_new = alloc_htab_elem(htab, key, value, key_size,
1401 					hash, percpu, onallcpus, NULL, map_flags);
1402 		if (IS_ERR(l_new)) {
1403 			ret = PTR_ERR(l_new);
1404 			goto err;
1405 		}
1406 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1407 	}
1408 err:
1409 	htab_unlock_bucket(b, flags);
1410 	if (old_map_ptr)
1411 		map->ops->map_fd_put_ptr(map, old_map_ptr, true);
1412 	return ret;
1413 }
1414 
1415 static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1416 					      void *value, u64 map_flags,
1417 					      bool onallcpus)
1418 {
1419 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1420 	struct htab_elem *l_new = NULL, *l_old;
1421 	struct hlist_nulls_head *head;
1422 	unsigned long flags;
1423 	struct bucket *b;
1424 	u32 key_size, hash;
1425 	int ret;
1426 
1427 	ret = htab_map_check_update_flags(onallcpus, map_flags);
1428 	if (unlikely(ret))
1429 		return ret;
1430 
1431 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1432 
1433 	key_size = map->key_size;
1434 
1435 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1436 
1437 	b = __select_bucket(htab, hash);
1438 	head = &b->head;
1439 
1440 	/* For LRU, we need to alloc before taking bucket's
1441 	 * spinlock because LRU's elem alloc may need
1442 	 * to remove older elem from htab and this removal
1443 	 * operation will need a bucket lock.
1444 	 */
1445 	if (map_flags != BPF_EXIST) {
1446 		l_new = prealloc_lru_pop(htab, key, hash);
1447 		if (!l_new)
1448 			return -ENOMEM;
1449 	}
1450 
1451 	ret = htab_lock_bucket(b, &flags);
1452 	if (ret)
1453 		goto err_lock_bucket;
1454 
1455 	l_old = lookup_elem_raw(head, hash, key, key_size);
1456 
1457 	ret = check_flags(htab, l_old, map_flags);
1458 	if (ret)
1459 		goto err;
1460 
1461 	if (l_old) {
1462 		bpf_lru_node_set_ref(&l_old->lru_node);
1463 
1464 		/* per-cpu hash map can update value in-place */
1465 		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1466 				value, onallcpus, map_flags);
1467 	} else {
1468 		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
1469 				value, onallcpus, map_flags);
1470 		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1471 		l_new = NULL;
1472 	}
1473 	ret = 0;
1474 err:
1475 	htab_unlock_bucket(b, flags);
1476 err_lock_bucket:
1477 	if (l_new) {
1478 		bpf_map_dec_elem_count(&htab->map);
1479 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1480 	}
1481 	return ret;
1482 }
1483 
1484 static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1485 					void *value, u64 map_flags)
1486 {
1487 	return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
1488 }
1489 
1490 static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1491 					    void *value, u64 map_flags)
1492 {
1493 	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1494 						 false);
1495 }
1496 
1497 /* Called from syscall or from eBPF program */
1498 static long htab_map_delete_elem(struct bpf_map *map, void *key)
1499 {
1500 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1501 	struct hlist_nulls_head *head;
1502 	struct bucket *b;
1503 	struct htab_elem *l;
1504 	unsigned long flags;
1505 	u32 hash, key_size;
1506 	int ret;
1507 
1508 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1509 
1510 	key_size = map->key_size;
1511 
1512 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1513 	b = __select_bucket(htab, hash);
1514 	head = &b->head;
1515 
1516 	ret = htab_lock_bucket(b, &flags);
1517 	if (ret)
1518 		return ret;
1519 
1520 	l = lookup_elem_raw(head, hash, key, key_size);
1521 	if (l)
1522 		hlist_nulls_del_rcu(&l->hash_node);
1523 	else
1524 		ret = -ENOENT;
1525 
1526 	htab_unlock_bucket(b, flags);
1527 
1528 	if (l)
1529 		free_htab_elem(htab, l);
1530 	return ret;
1531 }
1532 
1533 static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1534 {
1535 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1536 	struct hlist_nulls_head *head;
1537 	struct bucket *b;
1538 	struct htab_elem *l;
1539 	unsigned long flags;
1540 	u32 hash, key_size;
1541 	int ret;
1542 
1543 	WARN_ON_ONCE(!bpf_rcu_lock_held());
1544 
1545 	key_size = map->key_size;
1546 
1547 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1548 	b = __select_bucket(htab, hash);
1549 	head = &b->head;
1550 
1551 	ret = htab_lock_bucket(b, &flags);
1552 	if (ret)
1553 		return ret;
1554 
1555 	l = lookup_elem_raw(head, hash, key, key_size);
1556 
1557 	if (l)
1558 		hlist_nulls_del_rcu(&l->hash_node);
1559 	else
1560 		ret = -ENOENT;
1561 
1562 	htab_unlock_bucket(b, flags);
1563 	if (l)
1564 		htab_lru_push_free(htab, l);
1565 	return ret;
1566 }
1567 
1568 static void delete_all_elements(struct bpf_htab *htab)
1569 {
1570 	int i;
1571 
1572 	/* It's called from a worker thread and migration has been disabled,
1573 	 * therefore, it is OK to invoke bpf_mem_cache_free() directly.
1574 	 */
1575 	for (i = 0; i < htab->n_buckets; i++) {
1576 		struct hlist_nulls_head *head = select_bucket(htab, i);
1577 		struct hlist_nulls_node *n;
1578 		struct htab_elem *l;
1579 
1580 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1581 			hlist_nulls_del_rcu(&l->hash_node);
1582 			htab_elem_free(htab, l);
1583 		}
1584 		cond_resched();
1585 	}
1586 }
1587 
1588 static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
1589 {
1590 	int i;
1591 
1592 	rcu_read_lock();
1593 	for (i = 0; i < htab->n_buckets; i++) {
1594 		struct hlist_nulls_head *head = select_bucket(htab, i);
1595 		struct hlist_nulls_node *n;
1596 		struct htab_elem *l;
1597 
1598 		hlist_nulls_for_each_entry(l, n, head, hash_node) {
1599 			/* We only free internal structs on uref dropping to zero */
1600 			bpf_map_free_internal_structs(&htab->map,
1601 						      htab_elem_value(l, htab->map.key_size));
1602 		}
1603 		cond_resched_rcu();
1604 	}
1605 	rcu_read_unlock();
1606 }
1607 
1608 static void htab_map_free_internal_structs(struct bpf_map *map)
1609 {
1610 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1611 
1612 	/* We only free internal structs on uref dropping to zero */
1613 	if (!bpf_map_has_internal_structs(map))
1614 		return;
1615 
1616 	if (htab_is_prealloc(htab))
1617 		htab_free_prealloced_internal_structs(htab);
1618 	else
1619 		htab_free_malloced_internal_structs(htab);
1620 }
1621 
1622 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1623 static void htab_map_free(struct bpf_map *map)
1624 {
1625 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1626 
1627 	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
1628 	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
1629 	 * There is no need to synchronize_rcu() here to protect map elements.
1630 	 */
1631 
1632 	/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
1633 	 * underneath and is responsible for waiting for callbacks to finish
1634 	 * during bpf_mem_alloc_destroy().
1635 	 */
1636 	if (!htab_is_prealloc(htab)) {
1637 		delete_all_elements(htab);
1638 	} else {
1639 		htab_free_prealloced_fields(htab);
1640 		prealloc_destroy(htab);
1641 	}
1642 
1643 	bpf_map_free_elem_count(map);
1644 	free_percpu(htab->extra_elems);
1645 	bpf_map_area_free(htab->buckets);
1646 	bpf_mem_alloc_destroy(&htab->pcpu_ma);
1647 	bpf_mem_alloc_destroy(&htab->ma);
1648 	if (htab->use_percpu_counter)
1649 		percpu_counter_destroy(&htab->pcount);
1650 	bpf_map_area_free(htab);
1651 }
1652 
1653 static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
1654 				   struct seq_file *m)
1655 {
1656 	void *value;
1657 
1658 	rcu_read_lock();
1659 
1660 	value = htab_map_lookup_elem(map, key);
1661 	if (!value) {
1662 		rcu_read_unlock();
1663 		return;
1664 	}
1665 
1666 	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1667 	seq_puts(m, ": ");
1668 	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
1669 	seq_putc(m, '\n');
1670 
1671 	rcu_read_unlock();
1672 }
1673 
1674 static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1675 					     void *value, bool is_lru_map,
1676 					     bool is_percpu, u64 flags)
1677 {
1678 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1679 	struct hlist_nulls_head *head;
1680 	unsigned long bflags;
1681 	struct htab_elem *l;
1682 	u32 hash, key_size;
1683 	struct bucket *b;
1684 	int ret;
1685 
1686 	key_size = map->key_size;
1687 
1688 	hash = htab_map_hash(key, key_size, htab->hashrnd);
1689 	b = __select_bucket(htab, hash);
1690 	head = &b->head;
1691 
1692 	ret = htab_lock_bucket(b, &bflags);
1693 	if (ret)
1694 		return ret;
1695 
1696 	l = lookup_elem_raw(head, hash, key, key_size);
1697 	if (!l) {
1698 		ret = -ENOENT;
1699 		goto out_unlock;
1700 	}
1701 
1702 	if (is_percpu) {
1703 		u32 roundup_value_size = round_up(map->value_size, 8);
1704 		void __percpu *pptr;
1705 		int off = 0, cpu;
1706 
1707 		pptr = htab_elem_get_ptr(l, key_size);
1708 		for_each_possible_cpu(cpu) {
1709 			copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
1710 			check_and_init_map_value(&htab->map, value + off);
1711 			off += roundup_value_size;
1712 		}
1713 	} else {
1714 		void *src = htab_elem_value(l, map->key_size);
1715 
1716 		if (flags & BPF_F_LOCK)
1717 			copy_map_value_locked(map, value, src, true);
1718 		else
1719 			copy_map_value(map, value, src);
1720 		/* Zeroing special fields in the temp buffer */
1721 		check_and_init_map_value(map, value);
1722 	}
1723 	hlist_nulls_del_rcu(&l->hash_node);
1724 
1725 out_unlock:
1726 	htab_unlock_bucket(b, bflags);
1727 
1728 	if (l) {
1729 		if (is_lru_map)
1730 			htab_lru_push_free(htab, l);
1731 		else
1732 			free_htab_elem(htab, l);
1733 	}
1734 
1735 	return ret;
1736 }
1737 
1738 static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1739 					   void *value, u64 flags)
1740 {
1741 	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
1742 						 flags);
1743 }
1744 
1745 static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1746 						  void *key, void *value,
1747 						  u64 flags)
1748 {
1749 	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
1750 						 flags);
1751 }
1752 
1753 static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1754 					       void *value, u64 flags)
1755 {
1756 	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
1757 						 flags);
1758 }
1759 
1760 static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1761 						      void *key, void *value,
1762 						      u64 flags)
1763 {
1764 	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
1765 						 flags);
1766 }
1767 
1768 static int
1769 __htab_map_lookup_and_delete_batch(struct bpf_map *map,
1770 				   const union bpf_attr *attr,
1771 				   union bpf_attr __user *uattr,
1772 				   bool do_delete, bool is_lru_map,
1773 				   bool is_percpu)
1774 {
1775 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1776 	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
1777 	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
1778 	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
1779 	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1780 	u32 batch, max_count, size, bucket_size, map_id;
1781 	u64 elem_map_flags, map_flags, allowed_flags;
1782 	u32 bucket_cnt, total, key_size, value_size;
1783 	struct htab_elem *node_to_free = NULL;
1784 	struct hlist_nulls_head *head;
1785 	struct hlist_nulls_node *n;
1786 	unsigned long flags = 0;
1787 	bool locked = false;
1788 	struct htab_elem *l;
1789 	struct bucket *b;
1790 	int ret = 0;
1791 
1792 	elem_map_flags = attr->batch.elem_flags;
1793 	allowed_flags = BPF_F_LOCK;
1794 	if (!do_delete && is_percpu)
1795 		allowed_flags |= BPF_F_CPU;
1796 	ret = bpf_map_check_op_flags(map, elem_map_flags, allowed_flags);
1797 	if (ret)
1798 		return ret;
1799 
1800 	map_flags = attr->batch.flags;
1801 	if (map_flags)
1802 		return -EINVAL;
1803 
1804 	max_count = attr->batch.count;
1805 	if (!max_count)
1806 		return 0;
1807 
1808 	if (put_user(0, &uattr->batch.count))
1809 		return -EFAULT;
1810 
1811 	batch = 0;
1812 	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1813 		return -EFAULT;
1814 
1815 	if (batch >= htab->n_buckets)
1816 		return -ENOENT;
1817 
1818 	key_size = htab->map.key_size;
1819 	value_size = htab->map.value_size;
1820 	size = round_up(value_size, 8);
1821 	if (is_percpu && !(elem_map_flags & BPF_F_CPU))
1822 		value_size = size * num_possible_cpus();
1823 	total = 0;
1824 	/* while experimenting with hash tables with sizes ranging from 10 to
1825 	 * 1000, it was observed that a bucket can have up to 5 entries.
1826 	 */
1827 	bucket_size = 5;
1828 
1829 alloc:
1830 	/* We cannot do copy_from_user or copy_to_user inside
1831 	 * the rcu_read_lock. Allocate enough space here.
1832 	 */
1833 	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
1834 	values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1835 	if (!keys || !values) {
1836 		ret = -ENOMEM;
1837 		goto after_loop;
1838 	}
1839 
1840 again:
1841 	bpf_disable_instrumentation();
1842 	rcu_read_lock();
1843 again_nocopy:
1844 	dst_key = keys;
1845 	dst_val = values;
1846 	b = &htab->buckets[batch];
1847 	head = &b->head;
1848 	/* do not grab the lock unless need it (bucket_cnt > 0). */
1849 	if (locked) {
1850 		ret = htab_lock_bucket(b, &flags);
1851 		if (ret) {
1852 			rcu_read_unlock();
1853 			bpf_enable_instrumentation();
1854 			goto after_loop;
1855 		}
1856 	}
1857 
1858 	bucket_cnt = 0;
1859 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
1860 		bucket_cnt++;
1861 
1862 	if (bucket_cnt && !locked) {
1863 		locked = true;
1864 		goto again_nocopy;
1865 	}
1866 
1867 	if (bucket_cnt > (max_count - total)) {
1868 		if (total == 0)
1869 			ret = -ENOSPC;
1870 		/* Note that since bucket_cnt > 0 here, it is implicit
1871 		 * that the locked was grabbed, so release it.
1872 		 */
1873 		htab_unlock_bucket(b, flags);
1874 		rcu_read_unlock();
1875 		bpf_enable_instrumentation();
1876 		goto after_loop;
1877 	}
1878 
1879 	if (bucket_cnt > bucket_size) {
1880 		bucket_size = bucket_cnt;
1881 		/* Note that since bucket_cnt > 0 here, it is implicit
1882 		 * that the locked was grabbed, so release it.
1883 		 */
1884 		htab_unlock_bucket(b, flags);
1885 		rcu_read_unlock();
1886 		bpf_enable_instrumentation();
1887 		kvfree(keys);
1888 		kvfree(values);
1889 		goto alloc;
1890 	}
1891 
1892 	/* Next block is only safe to run if you have grabbed the lock */
1893 	if (!locked)
1894 		goto next_batch;
1895 
1896 	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1897 		memcpy(dst_key, l->key, key_size);
1898 
1899 		if (is_percpu) {
1900 			int off = 0, cpu;
1901 			void __percpu *pptr;
1902 
1903 			pptr = htab_elem_get_ptr(l, map->key_size);
1904 			if (elem_map_flags & BPF_F_CPU) {
1905 				cpu = elem_map_flags >> 32;
1906 				copy_map_value(&htab->map, dst_val, per_cpu_ptr(pptr, cpu));
1907 				check_and_init_map_value(&htab->map, dst_val);
1908 			} else {
1909 				for_each_possible_cpu(cpu) {
1910 					copy_map_value_long(&htab->map, dst_val + off,
1911 							    per_cpu_ptr(pptr, cpu));
1912 					check_and_init_map_value(&htab->map, dst_val + off);
1913 					off += size;
1914 				}
1915 			}
1916 		} else {
1917 			value = htab_elem_value(l, key_size);
1918 			if (is_fd_htab(htab)) {
1919 				struct bpf_map **inner_map = value;
1920 
1921 				 /* Actual value is the id of the inner map */
1922 				map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
1923 				value = &map_id;
1924 			}
1925 
1926 			if (elem_map_flags & BPF_F_LOCK)
1927 				copy_map_value_locked(map, dst_val, value,
1928 						      true);
1929 			else
1930 				copy_map_value(map, dst_val, value);
1931 			/* Zeroing special fields in the temp buffer */
1932 			check_and_init_map_value(map, dst_val);
1933 		}
1934 		if (do_delete) {
1935 			hlist_nulls_del_rcu(&l->hash_node);
1936 
1937 			/* bpf_lru_push_free() will acquire lru_lock, which
1938 			 * may cause deadlock. See comments in function
1939 			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
1940 			 * after releasing the bucket lock.
1941 			 *
1942 			 * For htab of maps, htab_put_fd_value() in
1943 			 * free_htab_elem() may acquire a spinlock with bucket
1944 			 * lock being held and it violates the lock rule, so
1945 			 * invoke free_htab_elem() after unlock as well.
1946 			 */
1947 			l->batch_flink = node_to_free;
1948 			node_to_free = l;
1949 		}
1950 		dst_key += key_size;
1951 		dst_val += value_size;
1952 	}
1953 
1954 	htab_unlock_bucket(b, flags);
1955 	locked = false;
1956 
1957 	while (node_to_free) {
1958 		l = node_to_free;
1959 		node_to_free = node_to_free->batch_flink;
1960 		if (is_lru_map)
1961 			htab_lru_push_free(htab, l);
1962 		else
1963 			free_htab_elem(htab, l);
1964 	}
1965 
1966 next_batch:
1967 	/* If we are not copying data, we can go to next bucket and avoid
1968 	 * unlocking the rcu.
1969 	 */
1970 	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
1971 		batch++;
1972 		goto again_nocopy;
1973 	}
1974 
1975 	rcu_read_unlock();
1976 	bpf_enable_instrumentation();
1977 	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
1978 	    key_size * bucket_cnt) ||
1979 	    copy_to_user(uvalues + total * value_size, values,
1980 	    value_size * bucket_cnt))) {
1981 		ret = -EFAULT;
1982 		goto after_loop;
1983 	}
1984 
1985 	total += bucket_cnt;
1986 	batch++;
1987 	if (batch >= htab->n_buckets) {
1988 		ret = -ENOENT;
1989 		goto after_loop;
1990 	}
1991 	goto again;
1992 
1993 after_loop:
1994 	if (ret == -EFAULT)
1995 		goto out;
1996 
1997 	/* copy # of entries and next batch */
1998 	ubatch = u64_to_user_ptr(attr->batch.out_batch);
1999 	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
2000 	    put_user(total, &uattr->batch.count))
2001 		ret = -EFAULT;
2002 
2003 out:
2004 	kvfree(keys);
2005 	kvfree(values);
2006 	return ret;
2007 }
2008 
2009 static int
2010 htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
2011 			     union bpf_attr __user *uattr)
2012 {
2013 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
2014 						  false, true);
2015 }
2016 
2017 static int
2018 htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
2019 					const union bpf_attr *attr,
2020 					union bpf_attr __user *uattr)
2021 {
2022 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
2023 						  false, true);
2024 }
2025 
2026 static int
2027 htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
2028 		      union bpf_attr __user *uattr)
2029 {
2030 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
2031 						  false, false);
2032 }
2033 
2034 static int
2035 htab_map_lookup_and_delete_batch(struct bpf_map *map,
2036 				 const union bpf_attr *attr,
2037 				 union bpf_attr __user *uattr)
2038 {
2039 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
2040 						  false, false);
2041 }
2042 
2043 static int
2044 htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
2045 				 const union bpf_attr *attr,
2046 				 union bpf_attr __user *uattr)
2047 {
2048 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
2049 						  true, true);
2050 }
2051 
2052 static int
2053 htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
2054 					    const union bpf_attr *attr,
2055 					    union bpf_attr __user *uattr)
2056 {
2057 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
2058 						  true, true);
2059 }
2060 
2061 static int
2062 htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
2063 			  union bpf_attr __user *uattr)
2064 {
2065 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
2066 						  true, false);
2067 }
2068 
2069 static int
2070 htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
2071 				     const union bpf_attr *attr,
2072 				     union bpf_attr __user *uattr)
2073 {
2074 	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
2075 						  true, false);
2076 }
2077 
2078 struct bpf_iter_seq_hash_map_info {
2079 	struct bpf_map *map;
2080 	struct bpf_htab *htab;
2081 	void *percpu_value_buf; // non-zero means percpu hash
2082 	u32 bucket_id;
2083 	u32 skip_elems;
2084 };
2085 
2086 static struct htab_elem *
2087 bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
2088 			   struct htab_elem *prev_elem)
2089 {
2090 	const struct bpf_htab *htab = info->htab;
2091 	u32 skip_elems = info->skip_elems;
2092 	u32 bucket_id = info->bucket_id;
2093 	struct hlist_nulls_head *head;
2094 	struct hlist_nulls_node *n;
2095 	struct htab_elem *elem;
2096 	struct bucket *b;
2097 	u32 i, count;
2098 
2099 	if (bucket_id >= htab->n_buckets)
2100 		return NULL;
2101 
2102 	/* try to find next elem in the same bucket */
2103 	if (prev_elem) {
2104 		/* no update/deletion on this bucket, prev_elem should be still valid
2105 		 * and we won't skip elements.
2106 		 */
2107 		n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
2108 		elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
2109 		if (elem)
2110 			return elem;
2111 
2112 		/* not found, unlock and go to the next bucket */
2113 		b = &htab->buckets[bucket_id++];
2114 		rcu_read_unlock();
2115 		skip_elems = 0;
2116 	}
2117 
2118 	for (i = bucket_id; i < htab->n_buckets; i++) {
2119 		b = &htab->buckets[i];
2120 		rcu_read_lock();
2121 
2122 		count = 0;
2123 		head = &b->head;
2124 		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
2125 			if (count >= skip_elems) {
2126 				info->bucket_id = i;
2127 				info->skip_elems = count;
2128 				return elem;
2129 			}
2130 			count++;
2131 		}
2132 
2133 		rcu_read_unlock();
2134 		skip_elems = 0;
2135 	}
2136 
2137 	info->bucket_id = i;
2138 	info->skip_elems = 0;
2139 	return NULL;
2140 }
2141 
2142 static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
2143 {
2144 	struct bpf_iter_seq_hash_map_info *info = seq->private;
2145 	struct htab_elem *elem;
2146 
2147 	elem = bpf_hash_map_seq_find_next(info, NULL);
2148 	if (!elem)
2149 		return NULL;
2150 
2151 	if (*pos == 0)
2152 		++*pos;
2153 	return elem;
2154 }
2155 
2156 static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2157 {
2158 	struct bpf_iter_seq_hash_map_info *info = seq->private;
2159 
2160 	++*pos;
2161 	++info->skip_elems;
2162 	return bpf_hash_map_seq_find_next(info, v);
2163 }
2164 
2165 static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
2166 {
2167 	struct bpf_iter_seq_hash_map_info *info = seq->private;
2168 	struct bpf_iter__bpf_map_elem ctx = {};
2169 	struct bpf_map *map = info->map;
2170 	struct bpf_iter_meta meta;
2171 	int ret = 0, off = 0, cpu;
2172 	u32 roundup_value_size;
2173 	struct bpf_prog *prog;
2174 	void __percpu *pptr;
2175 
2176 	meta.seq = seq;
2177 	prog = bpf_iter_get_info(&meta, elem == NULL);
2178 	if (prog) {
2179 		ctx.meta = &meta;
2180 		ctx.map = info->map;
2181 		if (elem) {
2182 			ctx.key = elem->key;
2183 			if (!info->percpu_value_buf) {
2184 				ctx.value = htab_elem_value(elem, map->key_size);
2185 			} else {
2186 				roundup_value_size = round_up(map->value_size, 8);
2187 				pptr = htab_elem_get_ptr(elem, map->key_size);
2188 				for_each_possible_cpu(cpu) {
2189 					copy_map_value_long(map, info->percpu_value_buf + off,
2190 							    per_cpu_ptr(pptr, cpu));
2191 					check_and_init_map_value(map, info->percpu_value_buf + off);
2192 					off += roundup_value_size;
2193 				}
2194 				ctx.value = info->percpu_value_buf;
2195 			}
2196 		}
2197 		ret = bpf_iter_run_prog(prog, &ctx);
2198 	}
2199 
2200 	return ret;
2201 }
2202 
2203 static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
2204 {
2205 	return __bpf_hash_map_seq_show(seq, v);
2206 }
2207 
2208 static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
2209 {
2210 	if (!v)
2211 		(void)__bpf_hash_map_seq_show(seq, NULL);
2212 	else
2213 		rcu_read_unlock();
2214 }
2215 
2216 static int bpf_iter_init_hash_map(void *priv_data,
2217 				  struct bpf_iter_aux_info *aux)
2218 {
2219 	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2220 	struct bpf_map *map = aux->map;
2221 	void *value_buf;
2222 	u32 buf_size;
2223 
2224 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
2225 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2226 		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
2227 		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2228 		if (!value_buf)
2229 			return -ENOMEM;
2230 
2231 		seq_info->percpu_value_buf = value_buf;
2232 	}
2233 
2234 	bpf_map_inc_with_uref(map);
2235 	seq_info->map = map;
2236 	seq_info->htab = container_of(map, struct bpf_htab, map);
2237 	return 0;
2238 }
2239 
2240 static void bpf_iter_fini_hash_map(void *priv_data)
2241 {
2242 	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2243 
2244 	bpf_map_put_with_uref(seq_info->map);
2245 	kfree(seq_info->percpu_value_buf);
2246 }
2247 
2248 static const struct seq_operations bpf_hash_map_seq_ops = {
2249 	.start	= bpf_hash_map_seq_start,
2250 	.next	= bpf_hash_map_seq_next,
2251 	.stop	= bpf_hash_map_seq_stop,
2252 	.show	= bpf_hash_map_seq_show,
2253 };
2254 
2255 static const struct bpf_iter_seq_info iter_seq_info = {
2256 	.seq_ops		= &bpf_hash_map_seq_ops,
2257 	.init_seq_private	= bpf_iter_init_hash_map,
2258 	.fini_seq_private	= bpf_iter_fini_hash_map,
2259 	.seq_priv_size		= sizeof(struct bpf_iter_seq_hash_map_info),
2260 };
2261 
2262 static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
2263 				   void *callback_ctx, u64 flags)
2264 {
2265 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2266 	struct hlist_nulls_head *head;
2267 	struct hlist_nulls_node *n;
2268 	struct htab_elem *elem;
2269 	int i, num_elems = 0;
2270 	void __percpu *pptr;
2271 	struct bucket *b;
2272 	void *key, *val;
2273 	bool is_percpu;
2274 	u64 ret = 0;
2275 
2276 	cant_migrate();
2277 
2278 	if (flags != 0)
2279 		return -EINVAL;
2280 
2281 	is_percpu = htab_is_percpu(htab);
2282 
2283 	/* migration has been disabled, so percpu value prepared here will be
2284 	 * the same as the one seen by the bpf program with
2285 	 * bpf_map_lookup_elem().
2286 	 */
2287 	for (i = 0; i < htab->n_buckets; i++) {
2288 		b = &htab->buckets[i];
2289 		rcu_read_lock();
2290 		head = &b->head;
2291 		hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
2292 			key = elem->key;
2293 			if (is_percpu) {
2294 				/* current cpu value for percpu map */
2295 				pptr = htab_elem_get_ptr(elem, map->key_size);
2296 				val = this_cpu_ptr(pptr);
2297 			} else {
2298 				val = htab_elem_value(elem, map->key_size);
2299 			}
2300 			num_elems++;
2301 			ret = callback_fn((u64)(long)map, (u64)(long)key,
2302 					  (u64)(long)val, (u64)(long)callback_ctx, 0);
2303 			/* return value: 0 - continue, 1 - stop and return */
2304 			if (ret) {
2305 				rcu_read_unlock();
2306 				goto out;
2307 			}
2308 		}
2309 		rcu_read_unlock();
2310 	}
2311 out:
2312 	return num_elems;
2313 }
2314 
2315 static u64 htab_map_mem_usage(const struct bpf_map *map)
2316 {
2317 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2318 	u32 value_size = round_up(htab->map.value_size, 8);
2319 	bool prealloc = htab_is_prealloc(htab);
2320 	bool percpu = htab_is_percpu(htab);
2321 	bool lru = htab_is_lru(htab);
2322 	u64 num_entries, usage;
2323 
2324 	usage = sizeof(struct bpf_htab) +
2325 		sizeof(struct bucket) * htab->n_buckets;
2326 
2327 	if (prealloc) {
2328 		num_entries = map->max_entries;
2329 		if (htab_has_extra_elems(htab))
2330 			num_entries += num_possible_cpus();
2331 
2332 		usage += htab->elem_size * num_entries;
2333 
2334 		if (percpu)
2335 			usage += value_size * num_possible_cpus() * num_entries;
2336 		else if (!lru)
2337 			usage += sizeof(struct htab_elem *) * num_possible_cpus();
2338 	} else {
2339 #define LLIST_NODE_SZ sizeof(struct llist_node)
2340 
2341 		num_entries = htab->use_percpu_counter ?
2342 					  percpu_counter_sum(&htab->pcount) :
2343 					  atomic_read(&htab->count);
2344 		usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
2345 		if (percpu) {
2346 			usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
2347 			usage += value_size * num_possible_cpus() * num_entries;
2348 		}
2349 	}
2350 	return usage;
2351 }
2352 
2353 BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
2354 const struct bpf_map_ops htab_map_ops = {
2355 	.map_meta_equal = bpf_map_meta_equal,
2356 	.map_alloc_check = htab_map_alloc_check,
2357 	.map_alloc = htab_map_alloc,
2358 	.map_free = htab_map_free,
2359 	.map_get_next_key = htab_map_get_next_key,
2360 	.map_release_uref = htab_map_free_internal_structs,
2361 	.map_lookup_elem = htab_map_lookup_elem,
2362 	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
2363 	.map_update_elem = htab_map_update_elem,
2364 	.map_delete_elem = htab_map_delete_elem,
2365 	.map_gen_lookup = htab_map_gen_lookup,
2366 	.map_seq_show_elem = htab_map_seq_show_elem,
2367 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
2368 	.map_for_each_callback = bpf_for_each_hash_elem,
2369 	.map_check_btf = htab_map_check_btf,
2370 	.map_mem_usage = htab_map_mem_usage,
2371 	BATCH_OPS(htab),
2372 	.map_btf_id = &htab_map_btf_ids[0],
2373 	.iter_seq_info = &iter_seq_info,
2374 };
2375 
2376 const struct bpf_map_ops htab_lru_map_ops = {
2377 	.map_meta_equal = bpf_map_meta_equal,
2378 	.map_alloc_check = htab_map_alloc_check,
2379 	.map_alloc = htab_map_alloc,
2380 	.map_free = htab_map_free,
2381 	.map_get_next_key = htab_map_get_next_key,
2382 	.map_release_uref = htab_map_free_internal_structs,
2383 	.map_lookup_elem = htab_lru_map_lookup_elem,
2384 	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
2385 	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
2386 	.map_update_elem = htab_lru_map_update_elem,
2387 	.map_delete_elem = htab_lru_map_delete_elem,
2388 	.map_gen_lookup = htab_lru_map_gen_lookup,
2389 	.map_seq_show_elem = htab_map_seq_show_elem,
2390 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
2391 	.map_for_each_callback = bpf_for_each_hash_elem,
2392 	.map_check_btf = htab_map_check_btf,
2393 	.map_mem_usage = htab_map_mem_usage,
2394 	BATCH_OPS(htab_lru),
2395 	.map_btf_id = &htab_map_btf_ids[0],
2396 	.iter_seq_info = &iter_seq_info,
2397 };
2398 
2399 /* Called from eBPF program */
2400 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2401 {
2402 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
2403 
2404 	if (l)
2405 		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2406 	else
2407 		return NULL;
2408 }
2409 
2410 /* inline bpf_map_lookup_elem() call for per-CPU hashmap */
2411 static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
2412 {
2413 	struct bpf_insn *insn = insn_buf;
2414 
2415 	if (!bpf_jit_supports_percpu_insn())
2416 		return -EOPNOTSUPP;
2417 
2418 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2419 		     (void *(*)(struct bpf_map *map, void *key))NULL));
2420 	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
2421 	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
2422 	*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
2423 				offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
2424 	*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
2425 	*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
2426 
2427 	return insn - insn_buf;
2428 }
2429 
2430 static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2431 {
2432 	struct htab_elem *l;
2433 
2434 	if (cpu >= nr_cpu_ids)
2435 		return NULL;
2436 
2437 	l = __htab_map_lookup_elem(map, key);
2438 	if (l)
2439 		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2440 	else
2441 		return NULL;
2442 }
2443 
2444 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2445 {
2446 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
2447 
2448 	if (l) {
2449 		bpf_lru_node_set_ref(&l->lru_node);
2450 		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2451 	}
2452 
2453 	return NULL;
2454 }
2455 
2456 static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2457 {
2458 	struct htab_elem *l;
2459 
2460 	if (cpu >= nr_cpu_ids)
2461 		return NULL;
2462 
2463 	l = __htab_map_lookup_elem(map, key);
2464 	if (l) {
2465 		bpf_lru_node_set_ref(&l->lru_node);
2466 		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2467 	}
2468 
2469 	return NULL;
2470 }
2471 
2472 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
2473 {
2474 	struct htab_elem *l;
2475 	void __percpu *pptr;
2476 	int ret = -ENOENT;
2477 	int cpu, off = 0;
2478 	u32 size;
2479 
2480 	/* per_cpu areas are zero-filled and bpf programs can only
2481 	 * access 'value_size' of them, so copying rounded areas
2482 	 * will not leak any kernel data
2483 	 */
2484 	size = round_up(map->value_size, 8);
2485 	rcu_read_lock();
2486 	l = __htab_map_lookup_elem(map, key);
2487 	if (!l)
2488 		goto out;
2489 	ret = 0;
2490 	/* We do not mark LRU map element here in order to not mess up
2491 	 * eviction heuristics when user space does a map walk.
2492 	 */
2493 	pptr = htab_elem_get_ptr(l, map->key_size);
2494 	if (map_flags & BPF_F_CPU) {
2495 		cpu = map_flags >> 32;
2496 		copy_map_value(map, value, per_cpu_ptr(pptr, cpu));
2497 		check_and_init_map_value(map, value);
2498 		goto out;
2499 	}
2500 	for_each_possible_cpu(cpu) {
2501 		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
2502 		check_and_init_map_value(map, value + off);
2503 		off += size;
2504 	}
2505 out:
2506 	rcu_read_unlock();
2507 	return ret;
2508 }
2509 
2510 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
2511 			   u64 map_flags)
2512 {
2513 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2514 	int ret;
2515 
2516 	rcu_read_lock();
2517 	if (htab_is_lru(htab))
2518 		ret = __htab_lru_percpu_map_update_elem(map, key, value,
2519 							map_flags, true);
2520 	else
2521 		ret = htab_map_update_elem_in_place(map, key, value, map_flags,
2522 						    true, true);
2523 	rcu_read_unlock();
2524 
2525 	return ret;
2526 }
2527 
2528 static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
2529 					  struct seq_file *m)
2530 {
2531 	struct htab_elem *l;
2532 	void __percpu *pptr;
2533 	int cpu;
2534 
2535 	rcu_read_lock();
2536 
2537 	l = __htab_map_lookup_elem(map, key);
2538 	if (!l) {
2539 		rcu_read_unlock();
2540 		return;
2541 	}
2542 
2543 	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
2544 	seq_puts(m, ": {\n");
2545 	pptr = htab_elem_get_ptr(l, map->key_size);
2546 	for_each_possible_cpu(cpu) {
2547 		seq_printf(m, "\tcpu%d: ", cpu);
2548 		btf_type_seq_show(map->btf, map->btf_value_type_id,
2549 				  per_cpu_ptr(pptr, cpu), m);
2550 		seq_putc(m, '\n');
2551 	}
2552 	seq_puts(m, "}\n");
2553 
2554 	rcu_read_unlock();
2555 }
2556 
2557 const struct bpf_map_ops htab_percpu_map_ops = {
2558 	.map_meta_equal = bpf_map_meta_equal,
2559 	.map_alloc_check = htab_map_alloc_check,
2560 	.map_alloc = htab_map_alloc,
2561 	.map_free = htab_map_free,
2562 	.map_get_next_key = htab_map_get_next_key,
2563 	.map_lookup_elem = htab_percpu_map_lookup_elem,
2564 	.map_gen_lookup = htab_percpu_map_gen_lookup,
2565 	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
2566 	.map_update_elem = htab_percpu_map_update_elem,
2567 	.map_delete_elem = htab_map_delete_elem,
2568 	.map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
2569 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
2570 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
2571 	.map_for_each_callback = bpf_for_each_hash_elem,
2572 	.map_check_btf = htab_map_check_btf,
2573 	.map_mem_usage = htab_map_mem_usage,
2574 	BATCH_OPS(htab_percpu),
2575 	.map_btf_id = &htab_map_btf_ids[0],
2576 	.iter_seq_info = &iter_seq_info,
2577 };
2578 
2579 const struct bpf_map_ops htab_lru_percpu_map_ops = {
2580 	.map_meta_equal = bpf_map_meta_equal,
2581 	.map_alloc_check = htab_map_alloc_check,
2582 	.map_alloc = htab_map_alloc,
2583 	.map_free = htab_map_free,
2584 	.map_get_next_key = htab_map_get_next_key,
2585 	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
2586 	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
2587 	.map_update_elem = htab_lru_percpu_map_update_elem,
2588 	.map_delete_elem = htab_lru_map_delete_elem,
2589 	.map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
2590 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
2591 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
2592 	.map_for_each_callback = bpf_for_each_hash_elem,
2593 	.map_check_btf = htab_map_check_btf,
2594 	.map_mem_usage = htab_map_mem_usage,
2595 	BATCH_OPS(htab_lru_percpu),
2596 	.map_btf_id = &htab_map_btf_ids[0],
2597 	.iter_seq_info = &iter_seq_info,
2598 };
2599 
2600 static int fd_htab_map_alloc_check(union bpf_attr *attr)
2601 {
2602 	if (attr->value_size != sizeof(u32))
2603 		return -EINVAL;
2604 	return htab_map_alloc_check(attr);
2605 }
2606 
2607 static void fd_htab_map_free(struct bpf_map *map)
2608 {
2609 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2610 	struct hlist_nulls_node *n;
2611 	struct hlist_nulls_head *head;
2612 	struct htab_elem *l;
2613 	int i;
2614 
2615 	for (i = 0; i < htab->n_buckets; i++) {
2616 		head = select_bucket(htab, i);
2617 
2618 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
2619 			void *ptr = fd_htab_map_get_ptr(map, l);
2620 
2621 			map->ops->map_fd_put_ptr(map, ptr, false);
2622 		}
2623 	}
2624 
2625 	htab_map_free(map);
2626 }
2627 
2628 /* only called from syscall */
2629 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
2630 {
2631 	void **ptr;
2632 	int ret = 0;
2633 
2634 	if (!map->ops->map_fd_sys_lookup_elem)
2635 		return -ENOTSUPP;
2636 
2637 	rcu_read_lock();
2638 	ptr = htab_map_lookup_elem(map, key);
2639 	if (ptr)
2640 		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
2641 	else
2642 		ret = -ENOENT;
2643 	rcu_read_unlock();
2644 
2645 	return ret;
2646 }
2647 
2648 /* Only called from syscall */
2649 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
2650 				void *key, void *value, u64 map_flags)
2651 {
2652 	void *ptr;
2653 	int ret;
2654 
2655 	ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
2656 	if (IS_ERR(ptr))
2657 		return PTR_ERR(ptr);
2658 
2659 	/* The htab bucket lock is always held during update operations in fd
2660 	 * htab map, and the following rcu_read_lock() is only used to avoid
2661 	 * the WARN_ON_ONCE in htab_map_update_elem_in_place().
2662 	 */
2663 	rcu_read_lock();
2664 	ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
2665 	rcu_read_unlock();
2666 	if (ret)
2667 		map->ops->map_fd_put_ptr(map, ptr, false);
2668 
2669 	return ret;
2670 }
2671 
2672 static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
2673 {
2674 	struct bpf_map *map, *inner_map_meta;
2675 
2676 	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
2677 	if (IS_ERR(inner_map_meta))
2678 		return inner_map_meta;
2679 
2680 	map = htab_map_alloc(attr);
2681 	if (IS_ERR(map)) {
2682 		bpf_map_meta_free(inner_map_meta);
2683 		return map;
2684 	}
2685 
2686 	map->inner_map_meta = inner_map_meta;
2687 
2688 	return map;
2689 }
2690 
2691 static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
2692 {
2693 	struct bpf_map **inner_map  = htab_map_lookup_elem(map, key);
2694 
2695 	if (!inner_map)
2696 		return NULL;
2697 
2698 	return READ_ONCE(*inner_map);
2699 }
2700 
2701 static int htab_of_map_gen_lookup(struct bpf_map *map,
2702 				  struct bpf_insn *insn_buf)
2703 {
2704 	struct bpf_insn *insn = insn_buf;
2705 	const int ret = BPF_REG_0;
2706 
2707 	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2708 		     (void *(*)(struct bpf_map *map, void *key))NULL));
2709 	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
2710 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
2711 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
2712 				offsetof(struct htab_elem, key) +
2713 				round_up(map->key_size, 8));
2714 	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
2715 
2716 	return insn - insn_buf;
2717 }
2718 
2719 static void htab_of_map_free(struct bpf_map *map)
2720 {
2721 	bpf_map_meta_free(map->inner_map_meta);
2722 	fd_htab_map_free(map);
2723 }
2724 
2725 const struct bpf_map_ops htab_of_maps_map_ops = {
2726 	.map_alloc_check = fd_htab_map_alloc_check,
2727 	.map_alloc = htab_of_map_alloc,
2728 	.map_free = htab_of_map_free,
2729 	.map_get_next_key = htab_map_get_next_key,
2730 	.map_lookup_elem = htab_of_map_lookup_elem,
2731 	.map_delete_elem = htab_map_delete_elem,
2732 	.map_fd_get_ptr = bpf_map_fd_get_ptr,
2733 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
2734 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
2735 	.map_gen_lookup = htab_of_map_gen_lookup,
2736 	.map_check_btf = map_check_no_btf,
2737 	.map_mem_usage = htab_map_mem_usage,
2738 	BATCH_OPS(htab),
2739 	.map_btf_id = &htab_map_btf_ids[0],
2740 };
2741 
2742 struct rhtab_elem {
2743 	struct rhash_head node;
2744 	/* key bytes, then value bytes follow */
2745 	u8 data[] __aligned(8);
2746 };
2747 
2748 struct bpf_rhtab {
2749 	struct bpf_map map;
2750 	struct rhashtable ht;
2751 	struct bpf_mem_alloc ma;
2752 	u32 elem_size;
2753 	bool freeing_internal;
2754 };
2755 
2756 static const struct rhashtable_params rhtab_params = {
2757 	.head_offset = offsetof(struct rhtab_elem, node),
2758 	.key_offset  = offsetof(struct rhtab_elem, data),
2759 };
2760 
2761 static inline void *rhtab_elem_value(struct rhtab_elem *l, u32 key_size)
2762 {
2763 	return l->data + round_up(key_size, 8);
2764 }
2765 
2766 /* Specialize hash function and objcmp for long sized key */
2767 static __always_inline int rhtab_key_cmp_long(struct rhashtable_compare_arg *arg,
2768 					      const void *ptr)
2769 {
2770 	const unsigned long key1 = *(const unsigned long *)arg->key;
2771 	const struct rhtab_elem *key2 = ptr;
2772 
2773 	return key1 != *(const unsigned long *)key2->data;
2774 }
2775 
2776 static __always_inline u32 rhtab_hashfn_long(const void *data, u32 len, u32 seed)
2777 {
2778 	u64 k = *(const unsigned long *)data;
2779 
2780 	return (u32)(k ^ (k >> 32)) ^ seed;
2781 }
2782 
2783 static const struct rhashtable_params rhtab_params_long = {
2784 	.head_offset = offsetof(struct rhtab_elem, node),
2785 	.key_offset  = offsetof(struct rhtab_elem, data),
2786 	.key_len     = sizeof(long),
2787 	.hashfn      = rhtab_hashfn_long,
2788 	.obj_cmpfn   = rhtab_key_cmp_long,
2789 };
2790 
2791 static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr)
2792 {
2793 	struct rhashtable_params params;
2794 	struct bpf_rhtab *rhtab;
2795 	int err = 0;
2796 
2797 	rhtab = bpf_map_area_alloc(sizeof(*rhtab), NUMA_NO_NODE);
2798 	if (!rhtab)
2799 		return ERR_PTR(-ENOMEM);
2800 
2801 	bpf_map_init_from_attr(&rhtab->map, attr);
2802 
2803 	if (rhtab->map.max_entries > 1UL << 31) {
2804 		err = -E2BIG;
2805 		goto free_rhtab;
2806 	}
2807 
2808 	rhtab->elem_size = sizeof(struct rhtab_elem) + round_up(rhtab->map.key_size, 8) +
2809 			   round_up(rhtab->map.value_size, 8);
2810 
2811 	params = rhtab_params;
2812 	params.key_len = rhtab->map.key_size;
2813 	params.nelem_hint = (u32)attr->map_extra;
2814 	params.automatic_shrinking = true;
2815 
2816 	if (rhtab->map.key_size == sizeof(long)) {
2817 		params.hashfn = rhtab_hashfn_long;
2818 		params.obj_cmpfn = rhtab_key_cmp_long;
2819 	}
2820 
2821 	err = rhashtable_init(&rhtab->ht, &params);
2822 	if (err)
2823 		goto free_rhtab;
2824 
2825 	/* Set max_elems after rhashtable_init() since init zeroes the struct */
2826 	rhtab->ht.max_elems = rhtab->map.max_entries;
2827 
2828 	err = bpf_mem_alloc_init(&rhtab->ma, rhtab->elem_size, false);
2829 	if (err)
2830 		goto destroy_rhtab;
2831 
2832 	return &rhtab->map;
2833 
2834 destroy_rhtab:
2835 	rhashtable_destroy(&rhtab->ht);
2836 free_rhtab:
2837 	bpf_map_area_free(rhtab);
2838 	return ERR_PTR(err);
2839 }
2840 
2841 static int rhtab_map_alloc_check(union bpf_attr *attr)
2842 {
2843 	if (!(attr->map_flags & BPF_F_NO_PREALLOC))
2844 		return -EINVAL;
2845 
2846 	if (attr->map_flags & BPF_F_ZERO_SEED)
2847 		return -EINVAL;
2848 
2849 	if (attr->key_size > U16_MAX)
2850 		return -E2BIG;
2851 
2852 	if (attr->map_extra >> 32)
2853 		return -EINVAL;
2854 
2855 	if ((u32)attr->map_extra > U16_MAX)
2856 		return -E2BIG;
2857 
2858 	if ((u32)attr->map_extra > attr->max_entries)
2859 		return -EINVAL;
2860 
2861 	return htab_map_alloc_check(attr);
2862 }
2863 
2864 static void rhtab_check_and_free_fields(struct bpf_rhtab *rhtab,
2865 					struct rhtab_elem *elem)
2866 {
2867 	if (IS_ERR_OR_NULL(rhtab->map.record))
2868 		return;
2869 
2870 	bpf_obj_free_fields(rhtab->map.record,
2871 			    rhtab_elem_value(elem, rhtab->map.key_size));
2872 }
2873 
2874 static void rhtab_mem_dtor(void *obj, void *ctx)
2875 {
2876 	struct htab_btf_record *hrec = ctx;
2877 	struct rhtab_elem *elem = obj;
2878 
2879 	if (IS_ERR_OR_NULL(hrec->record))
2880 		return;
2881 
2882 	bpf_obj_free_fields(hrec->record,
2883 			    rhtab_elem_value(elem, hrec->key_size));
2884 }
2885 
2886 static void rhtab_free_elem(void *ptr, void *arg)
2887 {
2888 	struct bpf_rhtab *rhtab = arg;
2889 	struct rhtab_elem *elem = ptr;
2890 
2891 	bpf_map_free_internal_structs(&rhtab->map, rhtab_elem_value(elem, rhtab->map.key_size));
2892 	bpf_mem_cache_free_rcu(&rhtab->ma, elem);
2893 }
2894 
2895 static void rhtab_map_free(struct bpf_map *map)
2896 {
2897 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
2898 
2899 	rhashtable_free_and_destroy(&rhtab->ht, rhtab_free_elem, rhtab);
2900 	bpf_mem_alloc_destroy(&rhtab->ma);
2901 	bpf_map_area_free(rhtab);
2902 }
2903 
2904 static void *rhtab_lookup_elem(struct bpf_map *map, void *key)
2905 {
2906 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
2907 
2908 	/* Hold RCU lock in case sleepable program calls via gen_lookup */
2909 	guard(rcu)();
2910 
2911 	if (map->key_size == sizeof(long))
2912 		return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params_long);
2913 
2914 	return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params);
2915 }
2916 
2917 static void *rhtab_map_lookup_elem(struct bpf_map *map, void *key) __must_hold(RCU)
2918 {
2919 	struct rhtab_elem *l;
2920 
2921 	l = rhtab_lookup_elem(map, key);
2922 	return l ? rhtab_elem_value(l, map->key_size) : NULL;
2923 }
2924 
2925 static void rhtab_read_elem_value(struct bpf_map *map, void *dst, struct rhtab_elem *elem,
2926 				  u64 flags)
2927 {
2928 	void *src = rhtab_elem_value(elem, map->key_size);
2929 
2930 	if (flags & BPF_F_LOCK)
2931 		copy_map_value_locked(map, dst, src, true);
2932 	else
2933 		copy_map_value(map, dst, src);
2934 }
2935 
2936 static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, void *copy,
2937 			     u64 flags)
2938 {
2939 	int err;
2940 
2941 	/*
2942 	 * disable_instrumentation() mitigates the deadlock for programs running in NMI context.
2943 	 * rhashtable locks bucket with local_irq_save(). Only NMI programs may reenter
2944 	 * rhashtable code, bpf_disable_instrumentation() disables programs running in NMI, except
2945 	 * raw tracepoints, which we don't have in rhashtable.
2946 	 */
2947 	bpf_disable_instrumentation();
2948 
2949 	if (rhtab->map.key_size == sizeof(long))
2950 		err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params_long);
2951 	else
2952 		err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params);
2953 
2954 	bpf_enable_instrumentation();
2955 
2956 	if (err)
2957 		return err;
2958 
2959 	if (copy) {
2960 		rhtab_read_elem_value(&rhtab->map, copy, elem, flags);
2961 		check_and_init_map_value(&rhtab->map, copy);
2962 	}
2963 	/* Release internal structs: kptr, bpf_timer, task_work, wq */
2964 	rhtab_check_and_free_fields(rhtab, elem);
2965 	bpf_mem_cache_free_rcu(&rhtab->ma, elem);
2966 	return 0;
2967 }
2968 
2969 
2970 static long rhtab_map_delete_elem(struct bpf_map *map, void *key)
2971 {
2972 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
2973 	struct rhtab_elem *elem;
2974 
2975 	guard(rcu)();
2976 
2977 	elem = rhtab_lookup_elem(map, key);
2978 	if (!elem)
2979 		return -ENOENT;
2980 
2981 	return rhtab_delete_elem(rhtab, elem, NULL, 0);
2982 }
2983 
2984 static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags)
2985 {
2986 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
2987 	struct rhtab_elem *elem;
2988 	int err;
2989 
2990 	err = bpf_map_check_op_flags(map, flags, BPF_F_LOCK);
2991 	if (err)
2992 		return err;
2993 
2994 	guard(rcu)();
2995 
2996 	elem = rhtab_lookup_elem(map, key);
2997 	if (!elem)
2998 		return -ENOENT;
2999 
3000 	return rhtab_delete_elem(rhtab, elem, value, flags);
3001 }
3002 
3003 static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value,
3004 				      u64 map_flags)
3005 {
3006 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
3007 	void *old_val = rhtab_elem_value(elem, map->key_size);
3008 
3009 	if (map_flags & BPF_NOEXIST)
3010 		return -EEXIST;
3011 
3012 	if (map_flags & BPF_F_LOCK)
3013 		copy_map_value_locked(map, old_val, value, false);
3014 	else
3015 		copy_map_value(map, old_val, value);
3016 
3017 	/*
3018 	 * Torn reads: a concurrent reader without BPF_F_LOCK may observe
3019 	 * the value mid-copy. Callers requiring consistent reads must use
3020 	 * BPF_F_LOCK, matching arraymap semantics.
3021 	 *
3022 	 * copy_map_value() skips special-field offsets, so old timers/
3023 	 * kptrs/etc. still sit in the slot. Cancel them after the copy
3024 	 * to match arraymap's update semantics.
3025 	 */
3026 	rhtab_check_and_free_fields(rhtab, elem);
3027 	return 0;
3028 }
3029 
3030 static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)
3031 {
3032 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
3033 	struct rhtab_elem *elem, *tmp;
3034 
3035 	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
3036 		return -EINVAL;
3037 
3038 	if ((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))
3039 		return -EINVAL;
3040 
3041 	guard(rcu)();
3042 	elem = rhtab_lookup_elem(map, key);
3043 	if (elem)
3044 		return rhtab_map_update_existing(map, elem, value, map_flags);
3045 
3046 	if (map_flags & BPF_EXIST)
3047 		return -ENOENT;
3048 
3049 	/*
3050 	 * Reject new insertions while map_release_uref cleanup walks the
3051 	 * table. Without this, new elements could keep triggering rehash
3052 	 * and prevent the walk from terminating.
3053 	 */
3054 	if (READ_ONCE(rhtab->freeing_internal))
3055 		return -EBUSY;
3056 
3057 	/* Check max_entries limit before inserting new element */
3058 	if (atomic_read(&rhtab->ht.nelems) >= map->max_entries)
3059 		return -E2BIG;
3060 
3061 	elem = bpf_mem_cache_alloc(&rhtab->ma);
3062 	if (!elem)
3063 		return -ENOMEM;
3064 
3065 	memcpy(elem->data, key, map->key_size);
3066 	copy_map_value(map, rhtab_elem_value(elem, map->key_size), value);
3067 	check_and_init_map_value(map, rhtab_elem_value(elem, map->key_size));
3068 
3069 	/* Prevent deadlock for NMI programs attempting to take bucket lock */
3070 	bpf_disable_instrumentation();
3071 
3072 	if (map->key_size == sizeof(long))
3073 		tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params_long);
3074 	else
3075 		tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params);
3076 
3077 	bpf_enable_instrumentation();
3078 
3079 	if (tmp) {
3080 		bpf_mem_cache_free(&rhtab->ma, elem);
3081 		if (IS_ERR(tmp))
3082 			return PTR_ERR(tmp);
3083 
3084 		return rhtab_map_update_existing(map, tmp, value, map_flags);
3085 	}
3086 
3087 	return 0;
3088 }
3089 
3090 static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
3091 {
3092 	struct bpf_insn *insn = insn_buf;
3093 	const int ret = BPF_REG_0;
3094 
3095 	BUILD_BUG_ON(!__same_type(&rhtab_lookup_elem,
3096 				  (void *(*)(struct bpf_map *map, void *key)) NULL));
3097 	*insn++ = BPF_EMIT_CALL(rhtab_lookup_elem);
3098 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
3099 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
3100 				offsetof(struct rhtab_elem, data) + round_up(map->key_size, 8));
3101 
3102 	return insn - insn_buf;
3103 }
3104 
3105 static int rhtab_map_check_btf(struct bpf_map *map, const struct btf *btf,
3106 			       const struct btf_type *key_type,
3107 			       const struct btf_type *value_type)
3108 {
3109 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
3110 
3111 	return bpf_ma_set_dtor(map, &rhtab->ma, rhtab_mem_dtor);
3112 }
3113 
3114 static void rhtab_map_free_internal_structs(struct bpf_map *map)
3115 {
3116 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
3117 	struct rhashtable_iter iter;
3118 	struct rhtab_elem *elem;
3119 
3120 	if (!bpf_map_has_internal_structs(map))
3121 		return;
3122 
3123 	/*
3124 	 * Block new insertions. Once observed, no new growth is triggered,
3125 	 * so any in-flight rehash will drain and the walker is guaranteed
3126 	 * to stop returning -EAGAIN. Treat -EAGAIN as "rehash in progress,
3127 	 * retry"; do not wait for the worker.
3128 	 */
3129 	WRITE_ONCE(rhtab->freeing_internal, true);
3130 
3131 	rhashtable_walk_enter(&rhtab->ht, &iter);
3132 	rhashtable_walk_start(&iter);
3133 
3134 	while ((elem = rhashtable_walk_next(&iter))) {
3135 		if (IS_ERR(elem)) {
3136 			if (PTR_ERR(elem) == -EAGAIN)
3137 				continue;
3138 			break;
3139 		}
3140 
3141 		bpf_map_free_internal_structs(map, rhtab_elem_value(elem, map->key_size));
3142 
3143 		if (need_resched()) { /* Avoid stalls on large maps */
3144 			rhashtable_walk_stop(&iter);
3145 			cond_resched();
3146 			rhashtable_walk_start(&iter);
3147 		}
3148 	}
3149 
3150 	rhashtable_walk_stop(&iter);
3151 	rhashtable_walk_exit(&iter);
3152 	WRITE_ONCE(rhtab->freeing_internal, false);
3153 }
3154 
3155 static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
3156 	__must_hold_shared(RCU)
3157 {
3158 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
3159 	struct rhtab_elem *elem;
3160 
3161 	elem = rhashtable_next_key(&rhtab->ht, key);
3162 
3163 	/* if not found, return the first key */
3164 	if (PTR_ERR(elem) == -ENOENT)
3165 		elem = rhashtable_next_key(&rhtab->ht, NULL);
3166 
3167 	if (IS_ERR(elem))
3168 		return PTR_ERR(elem);
3169 	if (!elem)
3170 		return -ENOENT;
3171 
3172 	memcpy(next_key, elem->data, map->key_size);
3173 	return 0;
3174 }
3175 
3176 static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m)
3177 {
3178 	void *value;
3179 
3180 	/* Guarantee that hashtab value is not freed */
3181 	guard(rcu)();
3182 
3183 	value = rhtab_map_lookup_elem(map, key);
3184 	if (!value)
3185 		return;
3186 
3187 	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
3188 	seq_puts(m, ": ");
3189 	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
3190 	seq_putc(m, '\n');
3191 }
3192 
3193 static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
3194 				void *callback_ctx, u64 flags)
3195 {
3196 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
3197 	void *prev_key = NULL;
3198 	struct rhtab_elem *elem;
3199 	int num_elems = 0;
3200 	u64 ret = 0;
3201 
3202 	cant_migrate();
3203 
3204 	if (flags != 0)
3205 		return -EINVAL;
3206 
3207 	rcu_read_lock();
3208 	/*
3209 	 * Best-effort iteration: if rhashtable is concurrently resized or
3210 	 * elements are deleted/inserted, there may be missed or duplicate
3211 	 * elements visited.
3212 	 */
3213 	while ((elem = rhashtable_next_key(&rhtab->ht, prev_key))) {
3214 		if (IS_ERR(elem))
3215 			break;
3216 		num_elems++;
3217 		ret = callback_fn((u64)(long)map,
3218 				  (u64)(long)elem->data,
3219 				  (u64)(long)rhtab_elem_value(elem, map->key_size),
3220 				  (u64)(long)callback_ctx, 0);
3221 		if (ret)
3222 			break;
3223 
3224 		prev_key = elem->data;	/* valid while RCU held */
3225 	}
3226 	rcu_read_unlock();
3227 
3228 	return num_elems;
3229 }
3230 
3231 static u64 rhtab_map_mem_usage(const struct bpf_map *map)
3232 {
3233 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
3234 	u64 num_entries;
3235 
3236 	/* Excludes rhashtable bucket overhead (~ nelems * sizeof(void *) at 75% load). */
3237 	num_entries = atomic_read(&rhtab->ht.nelems);
3238 	return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries;
3239 }
3240 
3241 static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map,
3242 					       const union bpf_attr *attr,
3243 					       union bpf_attr __user *uattr,
3244 					       bool do_delete)
3245 {
3246 	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
3247 	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
3248 	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
3249 	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
3250 	void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val;
3251 	struct rhtab_elem **del_elems = NULL;
3252 	u32 max_count, total, key_size, value_size, i;
3253 	bool has_next_cursor = false;
3254 	struct rhtab_elem *elem;
3255 	u64 elem_map_flags, map_flags;
3256 	int ret = 0;
3257 
3258 	elem_map_flags = attr->batch.elem_flags;
3259 	ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK);
3260 	if (ret)
3261 		return ret;
3262 
3263 	map_flags = attr->batch.flags;
3264 	if (map_flags)
3265 		return -EINVAL;
3266 
3267 	max_count = attr->batch.count;
3268 	if (!max_count)
3269 		return 0;
3270 
3271 	if (put_user(0, &uattr->batch.count))
3272 		return -EFAULT;
3273 
3274 	key_size = map->key_size;
3275 	value_size = map->value_size;
3276 
3277 	keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN);
3278 	values = kvmalloc_array(max_count, value_size, GFP_USER | __GFP_NOWARN);
3279 	if (do_delete)
3280 		del_elems = kvmalloc_array(max_count, sizeof(void *),
3281 					   GFP_USER | __GFP_NOWARN);
3282 	cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN);
3283 
3284 	if (!keys || !values || !cursor || (do_delete && !del_elems)) {
3285 		ret = -ENOMEM;
3286 		goto free;
3287 	}
3288 
3289 	if (ubatch && copy_from_user(cursor, ubatch, key_size)) {
3290 		ret = -EFAULT;
3291 		goto free;
3292 	}
3293 
3294 	dst_key = keys;
3295 	dst_val = values;
3296 	total = 0;
3297 
3298 	rcu_read_lock();
3299 
3300 	/*
3301 	 * Cursor stores the key of the next-to-process element (stashed by
3302 	 * the previous batch). Look it up directly so the element is included
3303 	 * here rather than skipped by next_key(). If the cursor was deleted
3304 	 * concurrently (or by the previous do_delete batch), return -EAGAIN
3305 	 * so userspace can distinguish a lost cursor from end-of-iteration
3306 	 * (-ENOENT) and restart from a NULL cursor.
3307 	 */
3308 	if (ubatch) {
3309 		elem = rhtab_lookup_elem(map, cursor);
3310 		if (!elem) {
3311 			rcu_read_unlock();
3312 			ret = -EAGAIN;
3313 			goto free;
3314 		}
3315 	} else {
3316 		elem = rhashtable_next_key(&rhtab->ht, NULL);
3317 	}
3318 
3319 	while (elem && !IS_ERR(elem) && total < max_count) {
3320 		memcpy(dst_key, elem->data, key_size);
3321 		rhtab_read_elem_value(map, dst_val, elem, elem_map_flags);
3322 		check_and_init_map_value(map, dst_val);
3323 
3324 		if (do_delete)
3325 			del_elems[total] = elem;
3326 
3327 		elem = rhashtable_next_key(&rhtab->ht, dst_key);
3328 		dst_key += key_size;
3329 		dst_val += value_size;
3330 		total++;
3331 
3332 		/* Bail to userspace to avoid stalls. */
3333 		if (need_resched())
3334 			break;
3335 	}
3336 
3337 	if (elem && !IS_ERR(elem)) {
3338 		/* Stash next-to-process key as cursor for the next batch. */
3339 		memcpy(cursor, elem->data, key_size);
3340 		has_next_cursor = true;
3341 	}
3342 
3343 	if (do_delete) {
3344 		for (i = 0; i < total; i++)
3345 			rhtab_delete_elem(rhtab, del_elems[i], NULL, 0);
3346 	}
3347 
3348 	rcu_read_unlock();
3349 
3350 	if (total == 0) {
3351 		ret = -ENOENT;
3352 		goto free;
3353 	}
3354 
3355 	/* No more elements after this batch. */
3356 	if (!has_next_cursor)
3357 		ret = -ENOENT;
3358 
3359 	if (copy_to_user(ukeys, keys, (size_t)total * key_size) ||
3360 	    copy_to_user(uvalues, values, (size_t)total * value_size) ||
3361 	    put_user(total, &uattr->batch.count) ||
3362 	    (has_next_cursor &&
3363 	     copy_to_user(u64_to_user_ptr(attr->batch.out_batch),
3364 			  cursor, key_size))) {
3365 		ret = -EFAULT;
3366 		goto free;
3367 	}
3368 
3369 free:
3370 	kfree(cursor);
3371 	kvfree(keys);
3372 	kvfree(values);
3373 	kvfree(del_elems);
3374 	return ret;
3375 }
3376 
3377 static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
3378 				  union bpf_attr __user *uattr)
3379 {
3380 	return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false);
3381 }
3382 
3383 static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr,
3384 					     union bpf_attr __user *uattr)
3385 {
3386 	return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true);
3387 }
3388 
3389 struct bpf_iter_seq_rhash_map_info {
3390 	struct bpf_map *map;
3391 	struct bpf_rhtab *rhtab;
3392 	struct rhashtable_iter iter;
3393 };
3394 
3395 static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos)
3396 	__acquires(RCU)
3397 {
3398 	struct bpf_iter_seq_rhash_map_info *info = seq->private;
3399 	struct rhtab_elem *elem;
3400 
3401 	rhashtable_walk_start(&info->iter);
3402 	/*
3403 	 * Re-deliver the element returned by walk_next() at the end of the
3404 	 * previous read() — bpf_seq_read may have stopped before show()
3405 	 * consumed it. Rehash rewinds the walker; retry on -EAGAIN.
3406 	 */
3407 	do {
3408 		elem = rhashtable_walk_peek(&info->iter);
3409 	} while (PTR_ERR(elem) == -EAGAIN);
3410 
3411 	if (IS_ERR(elem))
3412 		return NULL;
3413 
3414 	if (elem && *pos == 0)
3415 		++*pos;
3416 	return elem;
3417 }
3418 
3419 static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3420 {
3421 	struct bpf_iter_seq_rhash_map_info *info = seq->private;
3422 	struct rhtab_elem *elem;
3423 
3424 	++*pos;
3425 
3426 	/* Rehash rewinds the walker; retry until it stops returning -EAGAIN. */
3427 	do {
3428 		elem = rhashtable_walk_next(&info->iter);
3429 	} while (PTR_ERR(elem) == -EAGAIN);
3430 
3431 	if (IS_ERR(elem))
3432 		return NULL;
3433 	return elem;
3434 }
3435 
3436 static int __bpf_rhash_map_seq_show(struct seq_file *seq,
3437 				    struct rhtab_elem *elem)
3438 {
3439 	struct bpf_iter_seq_rhash_map_info *info = seq->private;
3440 	struct bpf_iter__bpf_map_elem ctx = {};
3441 	struct bpf_iter_meta meta;
3442 	struct bpf_prog *prog;
3443 	int ret = 0;
3444 
3445 	meta.seq = seq;
3446 	prog = bpf_iter_get_info(&meta, elem == NULL);
3447 	if (prog) {
3448 		ctx.meta = &meta;
3449 		ctx.map = info->map;
3450 		if (elem) {
3451 			ctx.key = elem->data;
3452 			ctx.value = rhtab_elem_value(elem, info->map->key_size);
3453 		}
3454 		ret = bpf_iter_run_prog(prog, &ctx);
3455 	}
3456 
3457 	return ret;
3458 }
3459 
3460 static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v)
3461 {
3462 	return __bpf_rhash_map_seq_show(seq, v);
3463 }
3464 
3465 static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v)
3466 	__releases(RCU)
3467 {
3468 	struct bpf_iter_seq_rhash_map_info *info = seq->private;
3469 
3470 	if (!v)
3471 		(void)__bpf_rhash_map_seq_show(seq, NULL);
3472 
3473 	rhashtable_walk_stop(&info->iter);
3474 }
3475 
3476 static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux)
3477 {
3478 	struct bpf_iter_seq_rhash_map_info *info = priv_data;
3479 	struct bpf_map *map = aux->map;
3480 
3481 	bpf_map_inc_with_uref(map);
3482 	info->map = map;
3483 	info->rhtab = container_of(map, struct bpf_rhtab, map);
3484 	rhashtable_walk_enter(&info->rhtab->ht, &info->iter);
3485 	return 0;
3486 }
3487 
3488 static void bpf_iter_fini_rhash_map(void *priv_data)
3489 {
3490 	struct bpf_iter_seq_rhash_map_info *info = priv_data;
3491 
3492 	rhashtable_walk_exit(&info->iter);
3493 	bpf_map_put_with_uref(info->map);
3494 }
3495 
3496 static const struct seq_operations bpf_rhash_map_seq_ops = {
3497 	.start = bpf_rhash_map_seq_start,
3498 	.next = bpf_rhash_map_seq_next,
3499 	.stop = bpf_rhash_map_seq_stop,
3500 	.show = bpf_rhash_map_seq_show,
3501 };
3502 
3503 static const struct bpf_iter_seq_info rhash_iter_seq_info = {
3504 	.seq_ops = &bpf_rhash_map_seq_ops,
3505 	.init_seq_private = bpf_iter_init_rhash_map,
3506 	.fini_seq_private = bpf_iter_fini_rhash_map,
3507 	.seq_priv_size = sizeof(struct bpf_iter_seq_rhash_map_info),
3508 };
3509 
3510 BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab)
3511 const struct bpf_map_ops rhtab_map_ops = {
3512 	.map_meta_equal = bpf_map_meta_equal,
3513 	.map_alloc_check = rhtab_map_alloc_check,
3514 	.map_alloc = rhtab_map_alloc,
3515 	.map_free = rhtab_map_free,
3516 	.map_get_next_key = rhtab_map_get_next_key,
3517 	.map_release_uref = rhtab_map_free_internal_structs,
3518 	.map_check_btf = rhtab_map_check_btf,
3519 	.map_lookup_elem = rhtab_map_lookup_elem,
3520 	.map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem,
3521 	.map_update_elem = rhtab_map_update_elem,
3522 	.map_delete_elem = rhtab_map_delete_elem,
3523 	.map_gen_lookup = rhtab_map_gen_lookup,
3524 	.map_seq_show_elem = rhtab_map_seq_show_elem,
3525 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
3526 	.map_for_each_callback = bpf_each_rhash_elem,
3527 	.map_mem_usage = rhtab_map_mem_usage,
3528 	BATCH_OPS(rhtab),
3529 	.map_btf_id = &rhtab_map_btf_ids[0],
3530 	.iter_seq_info = &rhash_iter_seq_info,
3531 };
3532