1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3 * Copyright (c) 2016,2017 Facebook
4 */
5 #include <linux/bpf.h>
6 #include <linux/btf.h>
7 #include <linux/err.h>
8 #include <linux/slab.h>
9 #include <linux/mm.h>
10 #include <linux/filter.h>
11 #include <linux/perf_event.h>
12 #include <uapi/linux/btf.h>
13 #include <linux/rcupdate_trace.h>
14 #include <linux/btf_ids.h>
15 #include <crypto/sha2.h>
16
17 #include "map_in_map.h"
18
/* Creation flags accepted by array_map_alloc_check(); a map_flags bit
 * outside this mask makes that check fail with -EINVAL.
 */
#define ARRAY_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
	 BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
22
bpf_array_free_percpu(struct bpf_array * array)23 static void bpf_array_free_percpu(struct bpf_array *array)
24 {
25 int i;
26
27 for (i = 0; i < array->map.max_entries; i++) {
28 free_percpu(array->pptrs[i]);
29 cond_resched();
30 }
31 }
32
bpf_array_alloc_percpu(struct bpf_array * array)33 static int bpf_array_alloc_percpu(struct bpf_array *array)
34 {
35 void __percpu *ptr;
36 int i;
37
38 for (i = 0; i < array->map.max_entries; i++) {
39 ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8,
40 GFP_USER | __GFP_NOWARN);
41 if (!ptr) {
42 bpf_array_free_percpu(array);
43 return -ENOMEM;
44 }
45 array->pptrs[i] = ptr;
46 cond_resched();
47 }
48
49 return 0;
50 }
51
52 /* Called from syscall */
array_map_alloc_check(union bpf_attr * attr)53 int array_map_alloc_check(union bpf_attr *attr)
54 {
55 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
56 int numa_node = bpf_map_attr_numa_node(attr);
57
58 /* check sanity of attributes */
59 if (attr->max_entries == 0 || attr->key_size != 4 ||
60 attr->value_size == 0 ||
61 attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
62 !bpf_map_flags_access_ok(attr->map_flags) ||
63 (percpu && numa_node != NUMA_NO_NODE))
64 return -EINVAL;
65
66 if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
67 attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
68 return -EINVAL;
69
70 if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
71 attr->map_flags & BPF_F_PRESERVE_ELEMS)
72 return -EINVAL;
73
74 /* avoid overflow on round_up(map->value_size) */
75 if (attr->value_size > INT_MAX)
76 return -E2BIG;
77 /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
78 if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
79 return -E2BIG;
80
81 return 0;
82 }
83
array_map_alloc(union bpf_attr * attr)84 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
85 {
86 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
87 int numa_node = bpf_map_attr_numa_node(attr);
88 u32 elem_size, index_mask, max_entries;
89 bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
90 u64 array_size, mask64;
91 struct bpf_array *array;
92
93 elem_size = round_up(attr->value_size, 8);
94
95 max_entries = attr->max_entries;
96
97 /* On 32 bit archs roundup_pow_of_two() with max_entries that has
98 * upper most bit set in u32 space is undefined behavior due to
99 * resulting 1U << 32, so do it manually here in u64 space.
100 */
101 mask64 = fls_long(max_entries - 1);
102 mask64 = 1ULL << mask64;
103 mask64 -= 1;
104
105 index_mask = mask64;
106 if (!bypass_spec_v1) {
107 /* round up array size to nearest power of 2,
108 * since cpu will speculate within index_mask limits
109 */
110 max_entries = index_mask + 1;
111 /* Check for overflows. */
112 if (max_entries < attr->max_entries)
113 return ERR_PTR(-E2BIG);
114 }
115
116 array_size = sizeof(*array);
117 if (percpu) {
118 array_size += (u64) max_entries * sizeof(void *);
119 } else {
120 /* rely on vmalloc() to return page-aligned memory and
121 * ensure array->value is exactly page-aligned
122 */
123 if (attr->map_flags & BPF_F_MMAPABLE) {
124 array_size = PAGE_ALIGN(array_size);
125 array_size += PAGE_ALIGN((u64) max_entries * elem_size);
126 } else {
127 array_size += (u64) max_entries * elem_size;
128 }
129 }
130
131 /* allocate all map elements and zero-initialize them */
132 if (attr->map_flags & BPF_F_MMAPABLE) {
133 void *data;
134
135 /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
136 data = bpf_map_area_mmapable_alloc(array_size, numa_node);
137 if (!data)
138 return ERR_PTR(-ENOMEM);
139 array = data + PAGE_ALIGN(sizeof(struct bpf_array))
140 - offsetof(struct bpf_array, value);
141 } else {
142 array = bpf_map_area_alloc(array_size, numa_node);
143 }
144 if (!array)
145 return ERR_PTR(-ENOMEM);
146 array->index_mask = index_mask;
147 array->map.bypass_spec_v1 = bypass_spec_v1;
148
149 /* copy mandatory map attributes */
150 bpf_map_init_from_attr(&array->map, attr);
151 array->elem_size = elem_size;
152
153 if (percpu && bpf_array_alloc_percpu(array)) {
154 bpf_map_area_free(array);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 return &array->map;
159 }
160
/* Address of slot @index inside @array->value; the byte offset is
 * computed in 64 bits. The index is neither bounds-checked nor masked.
 */
static void *array_map_elem_ptr(struct bpf_array *array, u32 index)
{
	u64 offset = (u64)array->elem_size * index;

	return array->value + offset;
}
165
166 /* Called from syscall or from eBPF program */
array_map_lookup_elem(struct bpf_map * map,void * key)167 static void *array_map_lookup_elem(struct bpf_map *map, void *key)
168 {
169 struct bpf_array *array = container_of(map, struct bpf_array, map);
170 u32 index = *(u32 *)key;
171
172 if (unlikely(index >= array->map.max_entries))
173 return NULL;
174
175 return array->value + (u64)array->elem_size * (index & array->index_mask);
176 }
177
/* Compute the SHA-256 digest of the map's value area.
 *
 * @map:           array map to hash
 * @hash_buf_size: size in bytes of @hash_buf
 * @hash_buf:      receives the digest
 *
 * The digest covers elem_size * max_entries bytes starting at
 * array->value and is also stored in map->sha.
 *
 * Return: 0 on success, -EINVAL if @hash_buf cannot hold a full digest.
 */
static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size,
			      void *hash_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	/* sha256() always stores SHA256_DIGEST_SIZE bytes into its output */
	if (hash_buf_size < SHA256_DIGEST_SIZE)
		return -EINVAL;

	sha256(array->value, (u64)array->elem_size * array->map.max_entries,
	       hash_buf);

	/* memcpy() must not be given overlapping buffers, so skip the copy
	 * when the digest was written straight into map->sha.
	 */
	if (hash_buf != (void *)array->map.sha)
		memcpy(array->map.sha, hash_buf, sizeof(array->map.sha));
	return 0;
}
188
/* Report the address of the value area of a single-entry array map.
 *
 * @map: map to query
 * @imm: receives the address of array->value
 * @off: offset to validate against value_size; it is not added to *@imm
 *
 * Return: 0 on success, -ENOTSUPP unless max_entries is exactly 1,
 * -EINVAL if @off is not below value_size.
 */
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
				       u32 off)
{
	struct bpf_array *array;

	if (map->max_entries != 1)
		return -ENOTSUPP;
	if (off >= map->value_size)
		return -EINVAL;

	array = container_of(map, struct bpf_array, map);
	*imm = (unsigned long)array->value;
	return 0;
}
202
/* Translate an address inside a single-entry array map's value back into
 * an offset from the start of that value.
 *
 * @map: map to query
 * @imm: address to translate
 * @off: receives @imm minus the address of array->value
 *
 * Return: 0 on success, -ENOTSUPP unless max_entries is exactly 1,
 * -ENOENT if @imm lies outside [value, value + elem_size).
 */
static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
				       u32 *off)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u64 start = (unsigned long)array->value;
	u64 end = start + array->elem_size;

	if (map->max_entries != 1)
		return -ENOTSUPP;
	if (imm < start || imm >= end)
		return -ENOENT;

	*off = imm - start;
	return 0;
}
218
/* emit BPF instructions equivalent to C code of array_map_lookup_elem()
 *
 * @map:      array map the lookup sequence is generated for
 * @insn_buf: buffer that receives the instructions
 *
 * The emitted code treats R1 as the map pointer and R2 as the pointer to
 * the u32 key, and leaves the element address (or 0) in R0. It embeds
 * map->max_entries, array->index_mask and elem_size as immediates.
 *
 * Return: number of instructions written to @insn_buf, or -EOPNOTSUPP
 * when the map has BPF_F_INNER_MAP set.
 */
static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_insn *insn = insn_buf;
	u32 elem_size = array->elem_size;
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	/* NOTE(review): presumably refused because max_entries is baked into
	 * the sequence while BPF_F_INNER_MAP maps may differ in max_entries
	 * (see array_map_meta_equal()) - confirm.
	 */
	if (map->map_flags & BPF_F_INNER_MAP)
		return -EOPNOTSUPP;

	/* map_ptr += offsetof(struct bpf_array, value) */
	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	/* ret = *(u32 *)index */
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (!map->bypass_spec_v1) {
		/* out of range: skip the 4 insns below, landing on "ret = 0" */
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		/* no masking insn, so only 3 insns to skip */
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
	}

	/* ret *= elem_size, as a shift when elem_size is a power of two */
	if (is_power_of_2(elem_size)) {
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	} else {
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	}
	/* ret += map_ptr */
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	/* in-range path: jump over the "ret = 0" below */
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);
	return insn - insn_buf;
}
251
252 /* Called from eBPF program */
percpu_array_map_lookup_elem(struct bpf_map * map,void * key)253 static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
254 {
255 struct bpf_array *array = container_of(map, struct bpf_array, map);
256 u32 index = *(u32 *)key;
257
258 if (unlikely(index >= array->map.max_entries))
259 return NULL;
260
261 return this_cpu_ptr(array->pptrs[index & array->index_mask]);
262 }
263
/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem()
 *
 * @map:      per-CPU array map the lookup sequence is generated for
 * @insn_buf: buffer that receives the instructions
 *
 * The emitted code treats R1 as the map pointer and R2 as the pointer to
 * the u32 key, and leaves the result (or 0) in R0.
 *
 * Return: number of instructions written to @insn_buf, or -EOPNOTSUPP
 * when bpf_jit_supports_percpu_insn() is false or the map has
 * BPF_F_INNER_MAP set.
 */
static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_insn *insn = insn_buf;

	if (!bpf_jit_supports_percpu_insn())
		return -EOPNOTSUPP;

	if (map->map_flags & BPF_F_INNER_MAP)
		return -EOPNOTSUPP;

	/* the map pointer in R1 is used as the bpf_array base address */
	BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0);
	/* R1 += offsetof(struct bpf_array, pptrs) */
	*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs));

	/* R0 = *(u32 *)R2 */
	*insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0);
	if (!map->bypass_spec_v1) {
		/* out of range: skip the 6 insns below, landing on "R0 = 0" */
		*insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6);
		*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask);
	} else {
		/* no masking insn, so only 5 insns to skip */
		*insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5);
	}

	/* R0 <<= 3: index times 8 - assumes 8-byte pptrs entries, TODO confirm */
	*insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
	*insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
	/* R0 = the 8-byte pptrs entry at that address */
	*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
	*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
	/* in-range path: jump over the "R0 = 0" below */
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(BPF_REG_0, 0);
	return insn - insn_buf;
}
295
/* Look up the value of one element on an explicitly chosen CPU.
 *
 * Return: per_cpu_ptr() of the element's per-CPU area for @cpu, or NULL
 * if @cpu is not below nr_cpu_ids or the u32 index at @key is not below
 * max_entries.
 */
static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 slot = *(u32 *)key;
	void __percpu *pptr;

	if (cpu >= nr_cpu_ids || unlikely(slot >= array->map.max_entries))
		return NULL;

	pptr = array->pptrs[slot & array->index_mask];
	return per_cpu_ptr(pptr, cpu);
}
309
/* Copy the value(s) of one per-CPU array element into @value.
 *
 * With BPF_F_CPU set in @map_flags only the CPU number held in the upper
 * 32 bits of @map_flags is copied. Otherwise the values of all possible
 * CPUs are stored back to back, elem_size bytes apart.
 *
 * NOTE(review): the CPU number taken from @map_flags is not range-checked
 * here - presumably validated by the caller, confirm.
 *
 * Return: 0 on success, -ENOENT if the u32 index at @key is not below
 * max_entries.
 */
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 slot = *(u32 *)key;
	void __percpu *pptr;
	void *dst = value;
	int cpu;

	if (unlikely(slot >= array->map.max_entries))
		return -ENOENT;

	rcu_read_lock();
	pptr = array->pptrs[slot & array->index_mask];
	if (map_flags & BPF_F_CPU) {
		cpu = map_flags >> 32;
		copy_map_value(map, dst, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, dst);
	} else {
		/* per_cpu areas are zero-filled and bpf programs can only
		 * access 'value_size' of them, so copying rounded areas
		 * will not leak any kernel data
		 */
		for_each_possible_cpu(cpu) {
			copy_map_value_long(map, dst, per_cpu_ptr(pptr, cpu));
			check_and_init_map_value(map, dst);
			dst += array->elem_size;
		}
	}
	rcu_read_unlock();

	return 0;
}
343
344 /* Called from syscall */
bpf_array_get_next_key(struct bpf_map * map,void * key,void * next_key)345 int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key)
346 {
347 u32 index = key ? *(u32 *)key : U32_MAX;
348 u32 *next = (u32 *)next_key;
349
350 if (index >= map->max_entries) {
351 *next = 0;
352 return 0;
353 }
354
355 if (index == map->max_entries - 1)
356 return -ENOENT;
357
358 *next = index + 1;
359 return 0;
360 }
361
362 /* Called from syscall or from eBPF program */
array_map_update_elem(struct bpf_map * map,void * key,void * value,u64 map_flags)363 static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
364 u64 map_flags)
365 {
366 struct bpf_array *array = container_of(map, struct bpf_array, map);
367 u32 index = *(u32 *)key;
368 char *val;
369
370 if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
371 /* unknown flags */
372 return -EINVAL;
373
374 if (unlikely(index >= array->map.max_entries))
375 /* all elements were pre-allocated, cannot insert a new one */
376 return -E2BIG;
377
378 if (unlikely(map_flags & BPF_NOEXIST))
379 /* all elements already exist */
380 return -EEXIST;
381
382 if (unlikely((map_flags & BPF_F_LOCK) &&
383 !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
384 return -EINVAL;
385
386 if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
387 val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
388 copy_map_value(map, val, value);
389 bpf_obj_free_fields(array->map.record, val);
390 } else {
391 val = array->value +
392 (u64)array->elem_size * (index & array->index_mask);
393 if (map_flags & BPF_F_LOCK)
394 copy_map_value_locked(map, val, value, false);
395 else
396 copy_map_value(map, val, value);
397 bpf_obj_free_fields(array->map.record, val);
398 }
399 return 0;
400 }
401
/* Update the per-CPU copies of one element from @value.
 *
 * With BPF_F_CPU set only the CPU number held in the upper 32 bits of
 * @map_flags is written. With BPF_F_ALL_CPUS the single value at @value
 * is written to every possible CPU. Otherwise @value is read as an array
 * with one elem_size-sized value per CPU, indexed by CPU number.
 *
 * NOTE(review): the CPU number taken from @map_flags is not range-checked
 * here - presumably validated by the caller, confirm.
 *
 * Return: 0 on success; -EINVAL if BPF_F_LOCK is set or the low 32 flag
 * bits exceed BPF_F_ALL_CPUS; -E2BIG if the index is not below
 * max_entries; -EEXIST if @map_flags equals BPF_NOEXIST.
 */
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 slot = *(u32 *)key;
	u32 stride = array->elem_size;
	void __percpu *pptr;
	void *dst, *src;
	int cpu;

	/* unknown flags */
	if (unlikely((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS))
		return -EINVAL;

	/* all elements were pre-allocated, cannot insert a new one */
	if (unlikely(slot >= array->map.max_entries))
		return -E2BIG;

	/* all elements already exist */
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	rcu_read_lock();
	pptr = array->pptrs[slot & array->index_mask];
	if (map_flags & BPF_F_CPU) {
		cpu = map_flags >> 32;
		dst = per_cpu_ptr(pptr, cpu);
		copy_map_value(map, dst, value);
		bpf_obj_free_fields(array->map.record, dst);
	} else {
		for_each_possible_cpu(cpu) {
			dst = per_cpu_ptr(pptr, cpu);
			src = value;
			if (!(map_flags & BPF_F_ALL_CPUS))
				src += stride * cpu;
			copy_map_value(map, dst, src);
			bpf_obj_free_fields(array->map.record, dst);
		}
	}
	rcu_read_unlock();

	return 0;
}
450
/* Called from syscall or from eBPF program.
 *
 * Deleting is not implemented for these maps: the call always fails.
 *
 * Return: -EINVAL, unconditionally.
 */
static long array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}
456
array_map_vmalloc_addr(struct bpf_array * array)457 static void *array_map_vmalloc_addr(struct bpf_array *array)
458 {
459 return (void *)round_down((unsigned long)array, PAGE_SIZE);
460 }
461
array_map_free_internal_structs(struct bpf_map * map)462 static void array_map_free_internal_structs(struct bpf_map *map)
463 {
464 struct bpf_array *array = container_of(map, struct bpf_array, map);
465 int i;
466
467 /* We only free internal structs on uref dropping to zero */
468 if (!bpf_map_has_internal_structs(map))
469 return;
470
471 for (i = 0; i < array->map.max_entries; i++)
472 bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i));
473 }
474
475 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
array_map_free(struct bpf_map * map)476 static void array_map_free(struct bpf_map *map)
477 {
478 struct bpf_array *array = container_of(map, struct bpf_array, map);
479 int i;
480
481 if (!IS_ERR_OR_NULL(map->record)) {
482 if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
483 for (i = 0; i < array->map.max_entries; i++) {
484 void __percpu *pptr = array->pptrs[i & array->index_mask];
485 int cpu;
486
487 for_each_possible_cpu(cpu) {
488 bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu));
489 cond_resched();
490 }
491 }
492 } else {
493 for (i = 0; i < array->map.max_entries; i++)
494 bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i));
495 }
496 }
497
498 if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
499 bpf_array_free_percpu(array);
500
501 if (array->map.map_flags & BPF_F_MMAPABLE)
502 bpf_map_area_free(array_map_vmalloc_addr(array));
503 else
504 bpf_map_area_free(array);
505 }
506
array_map_seq_show_elem(struct bpf_map * map,void * key,struct seq_file * m)507 static void array_map_seq_show_elem(struct bpf_map *map, void *key,
508 struct seq_file *m)
509 {
510 void *value;
511
512 rcu_read_lock();
513
514 value = array_map_lookup_elem(map, key);
515 if (!value) {
516 rcu_read_unlock();
517 return;
518 }
519
520 if (map->btf_key_type_id)
521 seq_printf(m, "%u: ", *(u32 *)key);
522 btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
523 seq_putc(m, '\n');
524
525 rcu_read_unlock();
526 }
527
percpu_array_map_seq_show_elem(struct bpf_map * map,void * key,struct seq_file * m)528 static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
529 struct seq_file *m)
530 {
531 struct bpf_array *array = container_of(map, struct bpf_array, map);
532 u32 index = *(u32 *)key;
533 void __percpu *pptr;
534 int cpu;
535
536 rcu_read_lock();
537
538 seq_printf(m, "%u: {\n", *(u32 *)key);
539 pptr = array->pptrs[index & array->index_mask];
540 for_each_possible_cpu(cpu) {
541 seq_printf(m, "\tcpu%d: ", cpu);
542 btf_type_seq_show(map->btf, map->btf_value_type_id,
543 per_cpu_ptr(pptr, cpu), m);
544 seq_putc(m, '\n');
545 }
546 seq_puts(m, "}\n");
547
548 rcu_read_unlock();
549 }
550
array_map_check_btf(const struct bpf_map * map,const struct btf * btf,const struct btf_type * key_type,const struct btf_type * value_type)551 static int array_map_check_btf(const struct bpf_map *map,
552 const struct btf *btf,
553 const struct btf_type *key_type,
554 const struct btf_type *value_type)
555 {
556 /* One exception for keyless BTF: .bss/.data/.rodata map */
557 if (btf_type_is_void(key_type)) {
558 if (map->map_type != BPF_MAP_TYPE_ARRAY ||
559 map->max_entries != 1)
560 return -EINVAL;
561
562 if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
563 return -EINVAL;
564
565 return 0;
566 }
567
568 /*
569 * Bpf array can only take a u32 key. This check makes sure
570 * that the btf matches the attr used during map_create.
571 */
572 if (!btf_type_is_i32(key_type))
573 return -EINVAL;
574
575 return 0;
576 }
577
array_map_mmap(struct bpf_map * map,struct vm_area_struct * vma)578 static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
579 {
580 struct bpf_array *array = container_of(map, struct bpf_array, map);
581 pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;
582
583 if (!(map->map_flags & BPF_F_MMAPABLE))
584 return -EINVAL;
585
586 if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
587 PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
588 return -EINVAL;
589
590 return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
591 vma->vm_pgoff + pgoff);
592 }
593
array_map_meta_equal(const struct bpf_map * meta0,const struct bpf_map * meta1)594 static bool array_map_meta_equal(const struct bpf_map *meta0,
595 const struct bpf_map *meta1)
596 {
597 if (!bpf_map_meta_equal(meta0, meta1))
598 return false;
599 return meta0->map_flags & BPF_F_INNER_MAP ? true :
600 meta0->max_entries == meta1->max_entries;
601 }
602
/* Private seq_file state of the array map iterator. */
struct bpf_iter_seq_array_map_info {
	/* map being iterated; set by bpf_iter_init_array_map() */
	struct bpf_map *map;
	/* buffer with room for one value per possible CPU; allocated only
	 * for BPF_MAP_TYPE_PERCPU_ARRAY
	 */
	void *percpu_value_buf;
	/* index of the current element */
	u32 index;
};
608
/* seq_file ->start: return the element at info->index, or NULL when that
 * index is not below max_entries. When percpu_value_buf is set the
 * element's per-CPU pointer is returned, cast to a plain pointer.
 */
static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_map *map = info->map;
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 slot;

	if (info->index >= map->max_entries)
		return NULL;

	if (!*pos)
		*pos = 1;

	slot = info->index & array->index_mask;
	if (!info->percpu_value_buf)
		return array_map_elem_ptr(array, slot);

	return (void *)(uintptr_t)array->pptrs[slot];
}
627
/* seq_file ->next: advance *@pos and info->index by one and return the
 * element at the new index, or NULL once it reaches max_entries. When
 * percpu_value_buf is set the element's per-CPU pointer is returned, cast
 * to a plain pointer.
 */
static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_map *map = info->map;
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 slot;

	(*pos)++;
	info->index++;
	if (info->index >= map->max_entries)
		return NULL;

	slot = info->index & array->index_mask;
	if (!info->percpu_value_buf)
		return array_map_elem_ptr(array, slot);

	return (void *)(uintptr_t)array->pptrs[slot];
}
646
__bpf_array_map_seq_show(struct seq_file * seq,void * v)647 static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
648 {
649 struct bpf_iter_seq_array_map_info *info = seq->private;
650 struct bpf_iter__bpf_map_elem ctx = {};
651 struct bpf_map *map = info->map;
652 struct bpf_array *array = container_of(map, struct bpf_array, map);
653 struct bpf_iter_meta meta;
654 struct bpf_prog *prog;
655 int off = 0, cpu = 0;
656 void __percpu *pptr;
657 u32 size;
658
659 meta.seq = seq;
660 prog = bpf_iter_get_info(&meta, v == NULL);
661 if (!prog)
662 return 0;
663
664 ctx.meta = &meta;
665 ctx.map = info->map;
666 if (v) {
667 ctx.key = &info->index;
668
669 if (!info->percpu_value_buf) {
670 ctx.value = v;
671 } else {
672 pptr = (void __percpu *)(uintptr_t)v;
673 size = array->elem_size;
674 for_each_possible_cpu(cpu) {
675 copy_map_value_long(map, info->percpu_value_buf + off,
676 per_cpu_ptr(pptr, cpu));
677 check_and_init_map_value(map, info->percpu_value_buf + off);
678 off += size;
679 }
680 ctx.value = info->percpu_value_buf;
681 }
682 }
683
684 return bpf_iter_run_prog(prog, &ctx);
685 }
686
/* seq_file ->show: forwards to __bpf_array_map_seq_show() and returns its
 * result.
 */
static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
	return __bpf_array_map_seq_show(seq, v);
}
691
bpf_array_map_seq_stop(struct seq_file * seq,void * v)692 static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
693 {
694 if (!v)
695 (void)__bpf_array_map_seq_show(seq, NULL);
696 }
697
bpf_iter_init_array_map(void * priv_data,struct bpf_iter_aux_info * aux)698 static int bpf_iter_init_array_map(void *priv_data,
699 struct bpf_iter_aux_info *aux)
700 {
701 struct bpf_iter_seq_array_map_info *seq_info = priv_data;
702 struct bpf_map *map = aux->map;
703 struct bpf_array *array = container_of(map, struct bpf_array, map);
704 void *value_buf;
705 u32 buf_size;
706
707 if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
708 buf_size = array->elem_size * num_possible_cpus();
709 value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
710 if (!value_buf)
711 return -ENOMEM;
712
713 seq_info->percpu_value_buf = value_buf;
714 }
715
716 /* bpf_iter_attach_map() acquires a map uref, and the uref may be
717 * released before or in the middle of iterating map elements, so
718 * acquire an extra map uref for iterator.
719 */
720 bpf_map_inc_with_uref(map);
721 seq_info->map = map;
722 return 0;
723 }
724
/* Undo bpf_iter_init_array_map(): drop the extra map uref taken there and
 * free the per-CPU value buffer.
 */
static void bpf_iter_fini_array_map(void *priv_data)
{
	struct bpf_iter_seq_array_map_info *seq_info = priv_data;

	bpf_map_put_with_uref(seq_info->map);
	/* percpu_value_buf is only allocated for per-CPU maps */
	kfree(seq_info->percpu_value_buf);
}
732
/* seq_file callbacks of the array map iterator */
static const struct seq_operations bpf_array_map_seq_ops = {
	.start = bpf_array_map_seq_start,
	.next = bpf_array_map_seq_next,
	.stop = bpf_array_map_seq_stop,
	.show = bpf_array_map_seq_show,
};
739
/* Iterator description shared by array_map_ops and percpu_array_map_ops:
 * seq_file callbacks plus setup/teardown of the per-iterator private state.
 */
static const struct bpf_iter_seq_info iter_seq_info = {
	.seq_ops = &bpf_array_map_seq_ops,
	.init_seq_private = bpf_iter_init_array_map,
	.fini_seq_private = bpf_iter_fini_array_map,
	.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
746
/* Invoke @callback_fn on the elements of @map in index order. For per-CPU
 * maps the current CPU's copy of each element is passed.
 *
 * The callback receives (map, &key, value, callback_ctx, 0); a non-zero
 * return value stops the walk.
 *
 * Return: number of elements the callback was invoked for, or -EINVAL if
 * @flags is non-zero.
 */
static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
				    void *callback_ctx, u64 flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	const bool is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	u32 slot, key, visited = 0;
	void *val;

	cant_migrate();

	if (flags != 0)
		return -EINVAL;

	for (slot = 0; slot < map->max_entries; slot++) {
		val = is_percpu ? this_cpu_ptr(array->pptrs[slot]) :
				  array_map_elem_ptr(array, slot);
		visited++;
		/* pass a copy so the callback cannot disturb the loop index */
		key = slot;
		/* return value: 0 - continue, 1 - stop and return */
		if (callback_fn((u64)(long)map, (u64)(long)&key,
				(u64)(long)val, (u64)(long)callback_ctx, 0))
			break;
	}

	return visited;
}
779
array_map_mem_usage(const struct bpf_map * map)780 static u64 array_map_mem_usage(const struct bpf_map *map)
781 {
782 struct bpf_array *array = container_of(map, struct bpf_array, map);
783 bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
784 u32 elem_size = array->elem_size;
785 u64 entries = map->max_entries;
786 u64 usage = sizeof(*array);
787
788 if (percpu) {
789 usage += entries * sizeof(void *);
790 usage += entries * elem_size * num_possible_cpus();
791 } else {
792 if (map->map_flags & BPF_F_MMAPABLE) {
793 usage = PAGE_ALIGN(usage);
794 usage += PAGE_ALIGN(entries * elem_size);
795 } else {
796 usage += entries * elem_size;
797 }
798 }
799 return usage;
800 }
801
/* BTF id of struct bpf_array, referenced by the ops tables via .map_btf_id */
BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)
/* Operations of the flat (non-per-CPU) array map.
 * NOTE(review): presumably the table registered for BPF_MAP_TYPE_ARRAY -
 * the registration is not visible from here, confirm.
 */
const struct bpf_map_ops array_map_ops = {
	.map_meta_equal = array_map_meta_equal,
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = bpf_array_get_next_key,
	.map_release_uref = array_map_free_internal_structs,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_gen_lookup = array_map_gen_lookup,
	.map_direct_value_addr = array_map_direct_value_addr,
	.map_direct_value_meta = array_map_direct_value_meta,
	.map_mmap = array_map_mmap,
	.map_seq_show_elem = array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
	.map_lookup_batch = generic_map_lookup_batch,
	.map_update_batch = generic_map_update_batch,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_array_elem,
	.map_mem_usage = array_map_mem_usage,
	.map_btf_id = &array_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
	.map_get_hash = &array_map_get_hash,
};
828
/* Operations of the per-CPU array map. Allocation, freeing, key iteration
 * and update/delete are shared with array_map_ops; the lookup and
 * seq_show callbacks are per-CPU variants. Unlike array_map_ops there are
 * no mmap, direct-value, release_uref or get_hash callbacks.
 */
const struct bpf_map_ops percpu_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = bpf_array_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_gen_lookup = percpu_array_map_gen_lookup,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
	.map_seq_show_elem = percpu_array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
	.map_lookup_batch = generic_map_lookup_batch,
	.map_update_batch = generic_map_update_batch,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_array_elem,
	.map_mem_usage = array_map_mem_usage,
	.map_btf_id = &array_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};
850
fd_array_map_alloc_check(union bpf_attr * attr)851 static int fd_array_map_alloc_check(union bpf_attr *attr)
852 {
853 /* only file descriptors can be stored in this type of map */
854 if (attr->value_size != sizeof(u32))
855 return -EINVAL;
856 /* Program read-only/write-only not supported for special maps yet. */
857 if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
858 return -EINVAL;
859 return array_map_alloc_check(attr);
860 }
861
fd_array_map_free(struct bpf_map * map)862 static void fd_array_map_free(struct bpf_map *map)
863 {
864 struct bpf_array *array = container_of(map, struct bpf_array, map);
865 int i;
866
867 /* make sure it's empty */
868 for (i = 0; i < array->map.max_entries; i++)
869 BUG_ON(array->ptrs[i] != NULL);
870
871 bpf_map_area_free(array);
872 }
873
/* Generic lookup callback for fd array maps: always fails. Lookups are
 * served by bpf_fd_array_map_lookup_elem() instead.
 *
 * Return: ERR_PTR(-EOPNOTSUPP), unconditionally.
 */
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}
878
879 /* only called from syscall */
bpf_fd_array_map_lookup_elem(struct bpf_map * map,void * key,u32 * value)880 int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
881 {
882 void **elem, *ptr;
883 int ret = 0;
884
885 if (!map->ops->map_fd_sys_lookup_elem)
886 return -ENOTSUPP;
887
888 rcu_read_lock();
889 elem = array_map_lookup_elem(map, key);
890 if (elem && (ptr = READ_ONCE(*elem)))
891 *value = map->ops->map_fd_sys_lookup_elem(ptr);
892 else
893 ret = -ENOENT;
894 rcu_read_unlock();
895
896 return ret;
897 }
898
899 /* only called from syscall */
bpf_fd_array_map_update_elem(struct bpf_map * map,struct file * map_file,void * key,void * value,u64 map_flags)900 int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
901 void *key, void *value, u64 map_flags)
902 {
903 struct bpf_array *array = container_of(map, struct bpf_array, map);
904 void *new_ptr, *old_ptr;
905 u32 index = *(u32 *)key, ufd;
906
907 if (map_flags != BPF_ANY)
908 return -EINVAL;
909
910 if (index >= array->map.max_entries)
911 return -E2BIG;
912
913 ufd = *(u32 *)value;
914 new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
915 if (IS_ERR(new_ptr))
916 return PTR_ERR(new_ptr);
917
918 if (map->ops->map_poke_run) {
919 mutex_lock(&array->aux->poke_mutex);
920 old_ptr = xchg(array->ptrs + index, new_ptr);
921 map->ops->map_poke_run(map, index, old_ptr, new_ptr);
922 mutex_unlock(&array->aux->poke_mutex);
923 } else {
924 old_ptr = xchg(array->ptrs + index, new_ptr);
925 }
926
927 if (old_ptr)
928 map->ops->map_fd_put_ptr(map, old_ptr, true);
929 return 0;
930 }
931
/* Clear the slot at @key and release the pointer it held through the
 * map's map_fd_put_ptr() callback; @need_defer is passed on to it.
 *
 * Return: 0 if a pointer was removed, -E2BIG if the index is not below
 * max_entries, -ENOENT if the slot was already empty.
 */
static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 slot = *(u32 *)key;
	void *outgoing;

	if (slot >= array->map.max_entries)
		return -E2BIG;

	if (!map->ops->map_poke_run) {
		outgoing = xchg(&array->ptrs[slot], NULL);
	} else {
		/* swap and poke callback run together under poke_mutex */
		mutex_lock(&array->aux->poke_mutex);
		outgoing = xchg(&array->ptrs[slot], NULL);
		map->ops->map_poke_run(map, slot, outgoing, NULL);
		mutex_unlock(&array->aux->poke_mutex);
	}

	if (!outgoing)
		return -ENOENT;

	map->ops->map_fd_put_ptr(map, outgoing, need_defer);
	return 0;
}
957
/* Delete callback for fd array maps: __fd_array_map_delete_elem() with
 * need_defer set to true. Returns that function's result.
 */
static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	return __fd_array_map_delete_elem(map, key, true);
}
962
prog_fd_array_get_ptr(struct bpf_map * map,struct file * map_file,int fd)963 static void *prog_fd_array_get_ptr(struct bpf_map *map,
964 struct file *map_file, int fd)
965 {
966 struct bpf_prog *prog = bpf_prog_get(fd);
967 bool is_extended;
968
969 if (IS_ERR(prog))
970 return prog;
971
972 if (prog->type == BPF_PROG_TYPE_EXT ||
973 !bpf_prog_map_compatible(map, prog)) {
974 bpf_prog_put(prog);
975 return ERR_PTR(-EINVAL);
976 }
977
978 mutex_lock(&prog->aux->ext_mutex);
979 is_extended = prog->aux->is_extended;
980 if (!is_extended)
981 prog->aux->prog_array_member_cnt++;
982 mutex_unlock(&prog->aux->ext_mutex);
983 if (is_extended) {
984 /* Extended prog can not be tail callee. It's to prevent a
985 * potential infinite loop like:
986 * tail callee prog entry -> tail callee prog subprog ->
987 * freplace prog entry --tailcall-> tail callee prog entry.
988 */
989 bpf_prog_put(prog);
990 return ERR_PTR(-EBUSY);
991 }
992
993 return prog;
994 }
995
/* Release a program previously stored in a prog array: undo the
 * prog_array_member_cnt accounting under ext_mutex, then drop the
 * reference. @map and @need_defer are not used here.
 */
static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
	struct bpf_prog *prog = ptr;
	struct bpf_prog_aux *prog_aux = prog->aux;

	mutex_lock(&prog_aux->ext_mutex);
	prog_aux->prog_array_member_cnt--;
	mutex_unlock(&prog_aux->ext_mutex);

	/* bpf_prog is freed after one RCU or tasks trace grace period */
	bpf_prog_put(prog);
}
1006
prog_fd_array_sys_lookup_elem(void * ptr)1007 static u32 prog_fd_array_sys_lookup_elem(void *ptr)
1008 {
1009 return ((struct bpf_prog *)ptr)->aux->id;
1010 }
1011
1012 /* decrement refcnt of all bpf_progs that are stored in this map */
bpf_fd_array_map_clear(struct bpf_map * map,bool need_defer)1013 static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
1014 {
1015 struct bpf_array *array = container_of(map, struct bpf_array, map);
1016 int i;
1017
1018 for (i = 0; i < array->map.max_entries; i++)
1019 __fd_array_map_delete_elem(map, &i, need_defer);
1020 }
1021
prog_array_map_seq_show_elem(struct bpf_map * map,void * key,struct seq_file * m)1022 static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
1023 struct seq_file *m)
1024 {
1025 void **elem, *ptr;
1026 u32 prog_id;
1027
1028 rcu_read_lock();
1029
1030 elem = array_map_lookup_elem(map, key);
1031 if (elem) {
1032 ptr = READ_ONCE(*elem);
1033 if (ptr) {
1034 seq_printf(m, "%u: ", *(u32 *)key);
1035 prog_id = prog_fd_array_sys_lookup_elem(ptr);
1036 btf_type_seq_show(map->btf, map->btf_value_type_id,
1037 &prog_id, m);
1038 seq_putc(m, '\n');
1039 }
1040 }
1041
1042 rcu_read_unlock();
1043 }
1044
1045 struct prog_poke_elem {
1046 struct list_head list;
1047 struct bpf_prog_aux *aux;
1048 };
1049
prog_array_map_poke_track(struct bpf_map * map,struct bpf_prog_aux * prog_aux)1050 static int prog_array_map_poke_track(struct bpf_map *map,
1051 struct bpf_prog_aux *prog_aux)
1052 {
1053 struct prog_poke_elem *elem;
1054 struct bpf_array_aux *aux;
1055 int ret = 0;
1056
1057 aux = container_of(map, struct bpf_array, map)->aux;
1058 mutex_lock(&aux->poke_mutex);
1059 list_for_each_entry(elem, &aux->poke_progs, list) {
1060 if (elem->aux == prog_aux)
1061 goto out;
1062 }
1063
1064 elem = kmalloc_obj(*elem);
1065 if (!elem) {
1066 ret = -ENOMEM;
1067 goto out;
1068 }
1069
1070 INIT_LIST_HEAD(&elem->list);
1071 /* We must track the program's aux info at this point in time
1072 * since the program pointer itself may not be stable yet, see
1073 * also comment in prog_array_map_poke_run().
1074 */
1075 elem->aux = prog_aux;
1076
1077 list_add_tail(&elem->list, &aux->poke_progs);
1078 out:
1079 mutex_unlock(&aux->poke_mutex);
1080 return ret;
1081 }
1082
prog_array_map_poke_untrack(struct bpf_map * map,struct bpf_prog_aux * prog_aux)1083 static void prog_array_map_poke_untrack(struct bpf_map *map,
1084 struct bpf_prog_aux *prog_aux)
1085 {
1086 struct prog_poke_elem *elem, *tmp;
1087 struct bpf_array_aux *aux;
1088
1089 aux = container_of(map, struct bpf_array, map)->aux;
1090 mutex_lock(&aux->poke_mutex);
1091 list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
1092 if (elem->aux == prog_aux) {
1093 list_del_init(&elem->list);
1094 kfree(elem);
1095 break;
1096 }
1097 }
1098 mutex_unlock(&aux->poke_mutex);
1099 }
1100
bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor * poke,struct bpf_prog * new,struct bpf_prog * old)1101 void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
1102 struct bpf_prog *new, struct bpf_prog *old)
1103 {
1104 WARN_ON_ONCE(1);
1105 }
1106
/* Propagate a change of slot @key in @map (from @old to @new) to every
 * tracked program.
 *
 * For each program on aux->poke_progs, every poke descriptor that is
 * stable, has reason BPF_POKE_REASON_TAIL_CALL and targets exactly
 * (@map, @key) is passed to bpf_arch_poke_desc_update(). The caller is
 * expected to hold aux->poke_mutex; a one-time warning fires otherwise.
 */
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
				    struct bpf_prog *old,
				    struct bpf_prog *new)
{
	struct bpf_array_aux *aux = container_of(map, struct bpf_array, map)->aux;
	struct prog_poke_elem *tracked;

	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));

	list_for_each_entry(tracked, &aux->poke_progs, list) {
		struct bpf_prog_aux *prog_aux = tracked->aux;
		int n;

		for (n = 0; n < prog_aux->size_poke_tab; n++) {
			struct bpf_jit_poke_descriptor *desc;

			desc = &prog_aux->poke_tab[n];

			/* Few things to be aware of:
			 *
			 * 1) Only aux may be touched in this context, never
			 *    aux->prog: it might not be stable yet and there
			 *    could be danger of use after free otherwise.
			 * 2) When tracking of aux starts, the program is not
			 *    JITed yet and has no kallsyms entry. Such
			 *    descriptors are skipped because
			 *    tailcall_target_stable is not set yet; the JIT
			 *    does the final fixup before marking them stable.
			 *    Descriptors are activated one after another, so
			 *    tail call updates can arrive here while the JIT
			 *    is still finishing its final fixup for entries
			 *    that are not activated yet.
			 * 3) A program reaching refcount zero while patching
			 *    is in progress is okay: we are protected by
			 *    poke_mutex and programs are untracked before the
			 *    JIT buffer is freed.
			 */
			if (!READ_ONCE(desc->tailcall_target_stable))
				continue;
			if (desc->reason != BPF_POKE_REASON_TAIL_CALL)
				continue;
			if (desc->tail_call.map != map)
				continue;
			if (desc->tail_call.key != key)
				continue;

			bpf_arch_poke_desc_update(desc, new, old);
		}
	}
}
1155
prog_array_map_clear_deferred(struct work_struct * work)1156 static void prog_array_map_clear_deferred(struct work_struct *work)
1157 {
1158 struct bpf_map *map = container_of(work, struct bpf_array_aux,
1159 work)->map;
1160 bpf_fd_array_map_clear(map, true);
1161 bpf_map_put(map);
1162 }
1163
prog_array_map_clear(struct bpf_map * map)1164 static void prog_array_map_clear(struct bpf_map *map)
1165 {
1166 struct bpf_array_aux *aux = container_of(map, struct bpf_array,
1167 map)->aux;
1168 bpf_map_inc(map);
1169 schedule_work(&aux->work);
1170 }
1171
prog_array_map_alloc(union bpf_attr * attr)1172 static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
1173 {
1174 struct bpf_array_aux *aux;
1175 struct bpf_map *map;
1176
1177 aux = kzalloc_obj(*aux, GFP_KERNEL_ACCOUNT);
1178 if (!aux)
1179 return ERR_PTR(-ENOMEM);
1180
1181 INIT_WORK(&aux->work, prog_array_map_clear_deferred);
1182 INIT_LIST_HEAD(&aux->poke_progs);
1183 mutex_init(&aux->poke_mutex);
1184
1185 map = array_map_alloc(attr);
1186 if (IS_ERR(map)) {
1187 kfree(aux);
1188 return map;
1189 }
1190
1191 container_of(map, struct bpf_array, map)->aux = aux;
1192 aux->map = map;
1193
1194 return map;
1195 }
1196
prog_array_map_free(struct bpf_map * map)1197 static void prog_array_map_free(struct bpf_map *map)
1198 {
1199 struct prog_poke_elem *elem, *tmp;
1200 struct bpf_array_aux *aux;
1201
1202 aux = container_of(map, struct bpf_array, map)->aux;
1203 list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
1204 list_del_init(&elem->list);
1205 kfree(elem);
1206 }
1207 kfree(aux);
1208 fd_array_map_free(map);
1209 }
1210
1211 /* prog_array->aux->{type,jited} is a runtime binding.
1212 * Doing static check alone in the verifier is not enough.
1213 * Thus, prog_array_map cannot be used as an inner_map
1214 * and map_meta_equal is not implemented.
1215 */
1216 const struct bpf_map_ops prog_array_map_ops = {
1217 .map_alloc_check = fd_array_map_alloc_check,
1218 .map_alloc = prog_array_map_alloc,
1219 .map_free = prog_array_map_free,
1220 .map_poke_track = prog_array_map_poke_track,
1221 .map_poke_untrack = prog_array_map_poke_untrack,
1222 .map_poke_run = prog_array_map_poke_run,
1223 .map_get_next_key = bpf_array_get_next_key,
1224 .map_lookup_elem = fd_array_map_lookup_elem,
1225 .map_delete_elem = fd_array_map_delete_elem,
1226 .map_fd_get_ptr = prog_fd_array_get_ptr,
1227 .map_fd_put_ptr = prog_fd_array_put_ptr,
1228 .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
1229 .map_release_uref = prog_array_map_clear,
1230 .map_seq_show_elem = prog_array_map_seq_show_elem,
1231 .map_mem_usage = array_map_mem_usage,
1232 .map_btf_id = &array_map_btf_ids[0],
1233 };
1234
bpf_event_entry_gen(struct file * perf_file,struct file * map_file)1235 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
1236 struct file *map_file)
1237 {
1238 struct bpf_event_entry *ee;
1239
1240 ee = kzalloc_obj(*ee);
1241 if (ee) {
1242 ee->event = perf_file->private_data;
1243 ee->perf_file = perf_file;
1244 ee->map_file = map_file;
1245 }
1246
1247 return ee;
1248 }
1249
__bpf_event_entry_free(struct rcu_head * rcu)1250 static void __bpf_event_entry_free(struct rcu_head *rcu)
1251 {
1252 struct bpf_event_entry *ee;
1253
1254 ee = container_of(rcu, struct bpf_event_entry, rcu);
1255 fput(ee->perf_file);
1256 kfree(ee);
1257 }
1258
bpf_event_entry_free_rcu(struct bpf_event_entry * ee)1259 static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
1260 {
1261 call_rcu(&ee->rcu, __bpf_event_entry_free);
1262 }
1263
perf_event_fd_array_get_ptr(struct bpf_map * map,struct file * map_file,int fd)1264 static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
1265 struct file *map_file, int fd)
1266 {
1267 struct bpf_event_entry *ee;
1268 struct perf_event *event;
1269 struct file *perf_file;
1270 u64 value;
1271
1272 perf_file = perf_event_get(fd);
1273 if (IS_ERR(perf_file))
1274 return perf_file;
1275
1276 ee = ERR_PTR(-EOPNOTSUPP);
1277 event = perf_file->private_data;
1278 if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
1279 goto err_out;
1280
1281 ee = bpf_event_entry_gen(perf_file, map_file);
1282 if (ee)
1283 return ee;
1284 ee = ERR_PTR(-ENOMEM);
1285 err_out:
1286 fput(perf_file);
1287 return ee;
1288 }
1289
/* ->map_fd_put_ptr() for perf event arrays. @map and @need_defer are
 * not used: the entry is always released through RCU.
 */
static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
	struct bpf_event_entry *entry = ptr;

	/* bpf_perf_event is freed after one RCU grace period */
	bpf_event_entry_free_rcu(entry);
}
1295
perf_event_fd_array_release(struct bpf_map * map,struct file * map_file)1296 static void perf_event_fd_array_release(struct bpf_map *map,
1297 struct file *map_file)
1298 {
1299 struct bpf_array *array = container_of(map, struct bpf_array, map);
1300 struct bpf_event_entry *ee;
1301 int i;
1302
1303 if (map->map_flags & BPF_F_PRESERVE_ELEMS)
1304 return;
1305
1306 rcu_read_lock();
1307 for (i = 0; i < array->map.max_entries; i++) {
1308 ee = READ_ONCE(array->ptrs[i]);
1309 if (ee && ee->map_file == map_file)
1310 __fd_array_map_delete_elem(map, &i, true);
1311 }
1312 rcu_read_unlock();
1313 }
1314
perf_event_fd_array_map_free(struct bpf_map * map)1315 static void perf_event_fd_array_map_free(struct bpf_map *map)
1316 {
1317 if (map->map_flags & BPF_F_PRESERVE_ELEMS)
1318 bpf_fd_array_map_clear(map, false);
1319 fd_array_map_free(map);
1320 }
1321
1322 const struct bpf_map_ops perf_event_array_map_ops = {
1323 .map_meta_equal = bpf_map_meta_equal,
1324 .map_alloc_check = fd_array_map_alloc_check,
1325 .map_alloc = array_map_alloc,
1326 .map_free = perf_event_fd_array_map_free,
1327 .map_get_next_key = bpf_array_get_next_key,
1328 .map_lookup_elem = fd_array_map_lookup_elem,
1329 .map_delete_elem = fd_array_map_delete_elem,
1330 .map_fd_get_ptr = perf_event_fd_array_get_ptr,
1331 .map_fd_put_ptr = perf_event_fd_array_put_ptr,
1332 .map_release = perf_event_fd_array_release,
1333 .map_check_btf = map_check_no_btf,
1334 .map_mem_usage = array_map_mem_usage,
1335 .map_btf_id = &array_map_btf_ids[0],
1336 };
1337
1338 #ifdef CONFIG_CGROUPS
/* ->map_fd_get_ptr() for cgroup arrays: returns whatever
 * cgroup_get_from_fd() yields for @fd. @map and @map_file are not used.
 */
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file,
				     int fd)
{
	struct cgroup *cgrp = cgroup_get_from_fd(fd);

	return cgrp;
}
1345
/* ->map_fd_put_ptr() for cgroup arrays: drop the cgroup reference held
 * by the slot. @map and @need_defer are not used.
 */
static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
	struct cgroup *cgrp = ptr;

	/* cgroup_put() frees the cgroup after an RCU grace period */
	cgroup_put(cgrp);
}
1351
cgroup_fd_array_free(struct bpf_map * map)1352 static void cgroup_fd_array_free(struct bpf_map *map)
1353 {
1354 bpf_fd_array_map_clear(map, false);
1355 fd_array_map_free(map);
1356 }
1357
1358 const struct bpf_map_ops cgroup_array_map_ops = {
1359 .map_meta_equal = bpf_map_meta_equal,
1360 .map_alloc_check = fd_array_map_alloc_check,
1361 .map_alloc = array_map_alloc,
1362 .map_free = cgroup_fd_array_free,
1363 .map_get_next_key = bpf_array_get_next_key,
1364 .map_lookup_elem = fd_array_map_lookup_elem,
1365 .map_delete_elem = fd_array_map_delete_elem,
1366 .map_fd_get_ptr = cgroup_fd_array_get_ptr,
1367 .map_fd_put_ptr = cgroup_fd_array_put_ptr,
1368 .map_check_btf = map_check_no_btf,
1369 .map_mem_usage = array_map_mem_usage,
1370 .map_btf_id = &array_map_btf_ids[0],
1371 };
1372 #endif
1373
array_of_map_alloc(union bpf_attr * attr)1374 static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
1375 {
1376 struct bpf_map *map, *inner_map_meta;
1377
1378 inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
1379 if (IS_ERR(inner_map_meta))
1380 return inner_map_meta;
1381
1382 map = array_map_alloc(attr);
1383 if (IS_ERR(map)) {
1384 bpf_map_meta_free(inner_map_meta);
1385 return map;
1386 }
1387
1388 map->inner_map_meta = inner_map_meta;
1389
1390 return map;
1391 }
1392
array_of_map_free(struct bpf_map * map)1393 static void array_of_map_free(struct bpf_map *map)
1394 {
1395 /* map->inner_map_meta is only accessed by syscall which
1396 * is protected by fdget/fdput.
1397 */
1398 bpf_map_meta_free(map->inner_map_meta);
1399 bpf_fd_array_map_clear(map, false);
1400 fd_array_map_free(map);
1401 }
1402
array_of_map_lookup_elem(struct bpf_map * map,void * key)1403 static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
1404 {
1405 struct bpf_map **inner_map = array_map_lookup_elem(map, key);
1406
1407 if (!inner_map)
1408 return NULL;
1409
1410 return READ_ONCE(*inner_map);
1411 }
1412
array_of_map_gen_lookup(struct bpf_map * map,struct bpf_insn * insn_buf)1413 static int array_of_map_gen_lookup(struct bpf_map *map,
1414 struct bpf_insn *insn_buf)
1415 {
1416 struct bpf_array *array = container_of(map, struct bpf_array, map);
1417 u32 elem_size = array->elem_size;
1418 struct bpf_insn *insn = insn_buf;
1419 const int ret = BPF_REG_0;
1420 const int map_ptr = BPF_REG_1;
1421 const int index = BPF_REG_2;
1422
1423 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
1424 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
1425 if (!map->bypass_spec_v1) {
1426 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
1427 *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
1428 } else {
1429 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
1430 }
1431 if (is_power_of_2(elem_size))
1432 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
1433 else
1434 *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
1435 *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
1436 *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
1437 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
1438 *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
1439 *insn++ = BPF_MOV64_IMM(ret, 0);
1440
1441 return insn - insn_buf;
1442 }
1443
1444 const struct bpf_map_ops array_of_maps_map_ops = {
1445 .map_alloc_check = fd_array_map_alloc_check,
1446 .map_alloc = array_of_map_alloc,
1447 .map_free = array_of_map_free,
1448 .map_get_next_key = bpf_array_get_next_key,
1449 .map_lookup_elem = array_of_map_lookup_elem,
1450 .map_delete_elem = fd_array_map_delete_elem,
1451 .map_fd_get_ptr = bpf_map_fd_get_ptr,
1452 .map_fd_put_ptr = bpf_map_fd_put_ptr,
1453 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
1454 .map_gen_lookup = array_of_map_gen_lookup,
1455 .map_lookup_batch = generic_map_lookup_batch,
1456 .map_update_batch = generic_map_update_batch,
1457 .map_check_btf = map_check_no_btf,
1458 .map_mem_usage = array_map_mem_usage,
1459 .map_btf_id = &array_map_btf_ids[0],
1460 };
1461