// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2023 Isovalent */ #include #include #include #include #include #include #include #include "map_percpu_stats.skel.h" #define MAX_ENTRIES 16384 #define MAX_ENTRIES_HASH_OF_MAPS 64 #define N_THREADS 8 #define MAX_MAP_KEY_SIZE 4 #define PCPU_MIN_UNIT_SIZE 32768 static void map_info(int map_fd, struct bpf_map_info *info) { __u32 len = sizeof(*info); int ret; memset(info, 0, sizeof(*info)); ret = bpf_obj_get_info_by_fd(map_fd, info, &len); CHECK(ret < 0, "bpf_obj_get_info_by_fd", "error: %s\n", strerror(errno)); } static const char *map_type_to_s(__u32 type) { switch (type) { case BPF_MAP_TYPE_HASH: return "HASH"; case BPF_MAP_TYPE_PERCPU_HASH: return "PERCPU_HASH"; case BPF_MAP_TYPE_LRU_HASH: return "LRU_HASH"; case BPF_MAP_TYPE_LRU_PERCPU_HASH: return "LRU_PERCPU_HASH"; case BPF_MAP_TYPE_HASH_OF_MAPS: return "BPF_MAP_TYPE_HASH_OF_MAPS"; default: return ""; } } static __u32 map_count_elements(__u32 type, int map_fd) { __u32 key = -1; int n = 0; while (!bpf_map_get_next_key(map_fd, &key, &key)) n++; return n; } #define BATCH true static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count) { static __u8 values[(8 << 10) * MAX_ENTRIES]; void *in_batch = NULL, *out_batch; __u32 save_count = count; int ret; ret = bpf_map_lookup_and_delete_batch(map_fd, &in_batch, &out_batch, keys, values, &count, NULL); /* * Despite what uapi header says, lookup_and_delete_batch will return * -ENOENT in case we successfully have deleted all elements, so check * this separately */ CHECK(ret < 0 && (errno != ENOENT || !count), "bpf_map_lookup_and_delete_batch", "error: %s\n", strerror(errno)); CHECK(count != save_count, "bpf_map_lookup_and_delete_batch", "deleted not all elements: removed=%u expected=%u\n", count, save_count); } static void delete_all_elements(__u32 type, int map_fd, bool batch) { static __u8 val[8 << 10]; /* enough for 1024 CPUs */ __u32 key = -1; void *keys; __u32 i, n; int ret; keys = calloc(MAX_MAP_KEY_SIZE, MAX_ENTRIES); CHECK(!keys, "calloc", "error: %s\n", strerror(errno)); for (n = 0; !bpf_map_get_next_key(map_fd, &key, &key); n++) memcpy(keys + n*MAX_MAP_KEY_SIZE, &key, MAX_MAP_KEY_SIZE); if (batch) { /* Can't mix delete_batch and delete_and_lookup_batch because * they have different semantics in relation to the keys * argument. However, delete_batch utilize map_delete_elem, * so we actually test it in non-batch scenario */ delete_and_lookup_batch(map_fd, keys, n); } else { /* Intentionally mix delete and lookup_and_delete so we can test both */ for (i = 0; i < n; i++) { void *keyp = keys + i*MAX_MAP_KEY_SIZE; if (i % 2 || type == BPF_MAP_TYPE_HASH_OF_MAPS) { ret = bpf_map_delete_elem(map_fd, keyp); CHECK(ret < 0, "bpf_map_delete_elem", "error: key %u: %s\n", i, strerror(errno)); } else { ret = bpf_map_lookup_and_delete_elem(map_fd, keyp, val); CHECK(ret < 0, "bpf_map_lookup_and_delete_elem", "error: key %u: %s\n", i, strerror(errno)); } } } free(keys); } static bool is_lru(__u32 map_type) { return map_type == BPF_MAP_TYPE_LRU_HASH || map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } static bool is_percpu(__u32 map_type) { return map_type == BPF_MAP_TYPE_PERCPU_HASH || map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } struct upsert_opts { __u32 map_type; int map_fd; __u32 n; bool retry_for_nomem; }; static int create_small_hash(void) { int map_fd; map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, "small", 4, 4, 4, NULL); CHECK(map_fd < 0, "bpf_map_create()", "error:%s (name=%s)\n", strerror(errno), "small"); return map_fd; } static bool retry_for_nomem_fn(int err) { return err == ENOMEM; } static void *patch_map_thread(void *arg) { /* 8KB is enough for 1024 CPUs. And it is shared between N_THREADS. */ static __u8 blob[8 << 10]; struct upsert_opts *opts = arg; void *val_ptr; int val; int ret; int i; for (i = 0; i < opts->n; i++) { if (opts->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { val = create_small_hash(); val_ptr = &val; } else if (is_percpu(opts->map_type)) { val_ptr = blob; } else { val = rand(); val_ptr = &val; } /* 2 seconds may be enough ? */ if (opts->retry_for_nomem) ret = map_update_retriable(opts->map_fd, &i, val_ptr, 0, 40, retry_for_nomem_fn); else ret = bpf_map_update_elem(opts->map_fd, &i, val_ptr, 0); CHECK(ret < 0, "bpf_map_update_elem", "key=%d error: %s\n", i, strerror(errno)); if (opts->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) close(val); } return NULL; } static void upsert_elements(struct upsert_opts *opts) { pthread_t threads[N_THREADS]; int ret; int i; for (i = 0; i < ARRAY_SIZE(threads); i++) { ret = pthread_create(&i[threads], NULL, patch_map_thread, opts); CHECK(ret != 0, "pthread_create", "error: %s\n", strerror(ret)); } for (i = 0; i < ARRAY_SIZE(threads); i++) { ret = pthread_join(i[threads], NULL); CHECK(ret != 0, "pthread_join", "error: %s\n", strerror(ret)); } } static __u32 read_cur_elements(int iter_fd) { char buf[64]; ssize_t n; __u32 ret; n = read(iter_fd, buf, sizeof(buf)-1); CHECK(n <= 0, "read", "error: %s\n", strerror(errno)); buf[n] = '\0'; errno = 0; ret = (__u32)strtol(buf, NULL, 10); CHECK(errno != 0, "strtol", "error: %s\n", strerror(errno)); return ret; } static __u32 get_cur_elements(int map_id) { struct map_percpu_stats *skel; struct bpf_link *link; __u32 n_elements; int iter_fd; int ret; skel = map_percpu_stats__open(); CHECK(skel == NULL, "map_percpu_stats__open", "error: %s", strerror(errno)); skel->bss->target_id = map_id; ret = map_percpu_stats__load(skel); CHECK(ret != 0, "map_percpu_stats__load", "error: %s", strerror(errno)); link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL); CHECK(!link, "bpf_program__attach_iter", "error: %s\n", strerror(errno)); iter_fd = bpf_iter_create(bpf_link__fd(link)); CHECK(iter_fd < 0, "bpf_iter_create", "error: %s\n", strerror(errno)); n_elements = read_cur_elements(iter_fd); close(iter_fd); bpf_link__destroy(link); map_percpu_stats__destroy(skel); return n_elements; } static void check_expected_number_elements(__u32 n_inserted, int map_fd, struct bpf_map_info *info) { __u32 n_real; __u32 n_iter; /* Count the current number of elements in the map by iterating through * all the map keys via bpf_get_next_key */ n_real = map_count_elements(info->type, map_fd); /* The "real" number of elements should be the same as the inserted * number of elements in all cases except LRU maps, where some elements * may have been evicted */ if (n_inserted == 0 || !is_lru(info->type)) CHECK(n_inserted != n_real, "map_count_elements", "n_real(%u) != n_inserted(%u)\n", n_real, n_inserted); /* Count the current number of elements in the map using an iterator */ n_iter = get_cur_elements(info->id); /* Both counts should be the same, as all updates are over */ CHECK(n_iter != n_real, "get_cur_elements", "n_iter=%u, expected %u (map_type=%s,map_flags=%08x)\n", n_iter, n_real, map_type_to_s(info->type), info->map_flags); } static void __test(int map_fd) { struct upsert_opts opts = { .map_fd = map_fd, }; struct bpf_map_info info; map_info(map_fd, &info); opts.map_type = info.type; opts.n = info.max_entries; /* Reduce the number of elements we are updating such that we don't * bump into -E2BIG from non-preallocated hash maps, but still will * have some evictions for LRU maps */ if (opts.map_type != BPF_MAP_TYPE_HASH_OF_MAPS) opts.n -= 512; else opts.n /= 2; /* per-cpu bpf memory allocator may not be able to allocate per-cpu * pointer successfully and it can not refill free llist timely, and * bpf_map_update_elem() will return -ENOMEM. so just retry to mitigate * the problem temporarily. */ opts.retry_for_nomem = is_percpu(opts.map_type) && (info.map_flags & BPF_F_NO_PREALLOC); /* * Upsert keys [0, n) under some competition: with random values from * N_THREADS threads. Check values, then delete all elements and check * values again. */ upsert_elements(&opts); check_expected_number_elements(opts.n, map_fd, &info); delete_all_elements(info.type, map_fd, !BATCH); check_expected_number_elements(0, map_fd, &info); /* Now do the same, but using batch delete operations */ upsert_elements(&opts); check_expected_number_elements(opts.n, map_fd, &info); delete_all_elements(info.type, map_fd, BATCH); check_expected_number_elements(0, map_fd, &info); close(map_fd); } static int map_create_opts(__u32 type, const char *name, struct bpf_map_create_opts *map_opts, __u32 key_size, __u32 val_size) { int max_entries; int map_fd; if (type == BPF_MAP_TYPE_HASH_OF_MAPS) max_entries = MAX_ENTRIES_HASH_OF_MAPS; else max_entries = MAX_ENTRIES; map_fd = bpf_map_create(type, name, key_size, val_size, max_entries, map_opts); CHECK(map_fd < 0, "bpf_map_create()", "error:%s (name=%s)\n", strerror(errno), name); return map_fd; } static int map_create(__u32 type, const char *name, struct bpf_map_create_opts *map_opts) { return map_create_opts(type, name, map_opts, sizeof(int), sizeof(int)); } static int create_hash(void) { LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = BPF_F_NO_PREALLOC); return map_create(BPF_MAP_TYPE_HASH, "hash", &map_opts); } static int create_percpu_hash(void) { LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = BPF_F_NO_PREALLOC); return map_create(BPF_MAP_TYPE_PERCPU_HASH, "percpu_hash", &map_opts); } static int create_hash_prealloc(void) { return map_create(BPF_MAP_TYPE_HASH, "hash", NULL); } static int create_percpu_hash_prealloc(void) { return map_create(BPF_MAP_TYPE_PERCPU_HASH, "percpu_hash_prealloc", NULL); } static int create_lru_hash(__u32 type, __u32 map_flags) { LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = map_flags); return map_create(type, "lru_hash", &map_opts); } static int create_hash_of_maps(void) { LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = BPF_F_NO_PREALLOC, .inner_map_fd = create_small_hash(), ); int ret; ret = map_create_opts(BPF_MAP_TYPE_HASH_OF_MAPS, "hash_of_maps", &map_opts, sizeof(int), sizeof(int)); close(map_opts.inner_map_fd); return ret; } static void map_percpu_stats_hash(void) { __test(create_hash()); printf("test_%s:PASS\n", __func__); } static void map_percpu_stats_percpu_hash(void) { __test(create_percpu_hash()); printf("test_%s:PASS\n", __func__); } static void map_percpu_stats_hash_prealloc(void) { __test(create_hash_prealloc()); printf("test_%s:PASS\n", __func__); } static void map_percpu_stats_percpu_hash_prealloc(void) { __test(create_percpu_hash_prealloc()); printf("test_%s:PASS\n", __func__); } static void map_percpu_stats_lru_hash(void) { __test(create_lru_hash(BPF_MAP_TYPE_LRU_HASH, 0)); printf("test_%s:PASS\n", __func__); } static void map_percpu_stats_lru_hash_no_common(void) { __test(create_lru_hash(BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU)); printf("test_%s:PASS\n", __func__); } static void map_percpu_stats_percpu_lru_hash(void) { __test(create_lru_hash(BPF_MAP_TYPE_LRU_PERCPU_HASH, 0)); printf("test_%s:PASS\n", __func__); } static void map_percpu_stats_percpu_lru_hash_no_common(void) { __test(create_lru_hash(BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_F_NO_COMMON_LRU)); printf("test_%s:PASS\n", __func__); } static void map_percpu_stats_hash_of_maps(void) { __test(create_hash_of_maps()); printf("test_%s:PASS\n", __func__); } static void map_percpu_stats_map_value_size(void) { int fd; int value_sz = PCPU_MIN_UNIT_SIZE + 1; struct bpf_map_create_opts opts = { .sz = sizeof(opts) }; enum bpf_map_type map_types[] = { BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH }; for (int i = 0; i < ARRAY_SIZE(map_types); i++) { fd = bpf_map_create(map_types[i], NULL, sizeof(__u32), value_sz, 1, &opts); CHECK(fd < 0 && errno != E2BIG, "percpu map value size", "error: %s\n", strerror(errno)); } printf("test_%s:PASS\n", __func__); } void test_map_percpu_stats(void) { map_percpu_stats_hash(); map_percpu_stats_percpu_hash(); map_percpu_stats_hash_prealloc(); map_percpu_stats_percpu_hash_prealloc(); map_percpu_stats_lru_hash(); map_percpu_stats_lru_hash_no_common(); map_percpu_stats_percpu_lru_hash(); map_percpu_stats_percpu_lru_hash_no_common(); map_percpu_stats_hash_of_maps(); map_percpu_stats_map_value_size(); }