xref: /linux/kernel/bpf/bpf_local_storage.c (revision 0be08389c7f2f9db0194ee666d2e4870886e6ffb)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2019 Facebook  */
3 #include <linux/rculist.h>
4 #include <linux/list.h>
5 #include <linux/hash.h>
6 #include <linux/types.h>
7 #include <linux/spinlock.h>
8 #include <linux/bpf.h>
9 #include <linux/btf_ids.h>
10 #include <linux/bpf_local_storage.h>
11 #include <net/sock.h>
12 #include <uapi/linux/sock_diag.h>
13 #include <uapi/linux/btf.h>
14 #include <linux/rcupdate.h>
15 #include <linux/rcupdate_trace.h>
16 #include <linux/rcupdate_wait.h>
17 
18 #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
19 
20 static struct bpf_local_storage_map_bucket *
21 select_bucket(struct bpf_local_storage_map *smap,
22 	      struct bpf_local_storage *local_storage)
23 {
24 	return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)];
25 }
26 
27 static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
28 {
29 	struct bpf_map *map = &smap->map;
30 
31 	if (!map->ops->map_local_storage_charge)
32 		return 0;
33 
34 	return map->ops->map_local_storage_charge(smap, owner, size);
35 }
36 
37 static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
38 			 u32 size)
39 {
40 	struct bpf_map *map = &smap->map;
41 
42 	if (map->ops->map_local_storage_uncharge)
43 		map->ops->map_local_storage_uncharge(smap, owner, size);
44 }
45 
46 static struct bpf_local_storage __rcu **
47 owner_storage(struct bpf_local_storage_map *smap, void *owner)
48 {
49 	struct bpf_map *map = &smap->map;
50 
51 	return map->ops->map_owner_storage_ptr(owner);
52 }
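
/*
 * The charge, uncharge and owner_storage helpers above all dispatch through
 * smap->map.ops, which keeps this file owner agnostic: each storage flavour
 * (e.g. sk, task, inode or cgroup local storage) supplies its own
 * map_local_storage_charge/uncharge and map_owner_storage_ptr callbacks.
 * As a rough sketch only (the struct and field names here are hypothetical,
 * the real implementations live in files such as net/core/bpf_sk_storage.c),
 * an owner_storage_ptr callback boils down to:
 *
 *	static struct bpf_local_storage __rcu **
 *	example_owner_storage_ptr(void *owner)
 *	{
 *		struct example_owner *eo = owner;
 *
 *		return &eo->bpf_storage;
 *	}
 */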
53 
54 static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
55 {
56 	return !hlist_unhashed_lockless(&selem->snode);
57 }
58 
59 static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
60 {
61 	return !hlist_unhashed(&selem->snode);
62 }
63 
64 static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
65 {
66 	return !hlist_unhashed(&selem->map_node);
67 }
68 
69 struct bpf_local_storage_elem *
70 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
71 		void *value, bool swap_uptrs, gfp_t gfp_flags)
72 {
73 	struct bpf_local_storage_elem *selem;
74 
75 	if (mem_charge(smap, owner, smap->elem_size))
76 		return NULL;
77 
78 	if (smap->use_kmalloc_nolock) {
79 		selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
80 					       __GFP_ZERO, NUMA_NO_NODE);
81 	} else {
82 		selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
83 					gfp_flags | __GFP_NOWARN);
84 	}
85 
86 	if (selem) {
87 		RCU_INIT_POINTER(SDATA(selem)->smap, smap);
88 		atomic_set(&selem->state, 0);
89 		selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
90 
91 		if (value) {
92 			/* No need to call check_and_init_map_value as memory is zero init */
93 			copy_map_value(&smap->map, SDATA(selem)->data, value);
94 			if (swap_uptrs)
95 				bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
96 		}
97 		return selem;
98 	}
99 
100 	mem_uncharge(smap, owner, smap->elem_size);
101 
102 	return NULL;
103 }
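
/*
 * bpf_selem_alloc() only allocates and fills the element; it does not link
 * it anywhere.  The typical caller in this file, bpf_local_storage_update(),
 * roughly follows this sequence (a sketch of the code further below, with
 * locking and error handling omitted):
 *
 *	selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
 *	bpf_selem_link_map_nolock(b, selem);
 *	bpf_selem_link_storage_nolock(local_storage, selem);
 *
 * Both allocation paths return zeroed memory, which is why copy_map_value()
 * can be used directly without check_and_init_map_value().
 */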
104 
105 /* rcu tasks trace callback for use_kmalloc_nolock == false */
106 static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
107 {
108 	struct bpf_local_storage *local_storage;
109 
110 	/* If RCU Tasks Trace grace period implies RCU grace period, do
111 	 * kfree(), else do kfree_rcu().
112 	 */
113 	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
114 	if (rcu_trace_implies_rcu_gp())
115 		kfree(local_storage);
116 	else
117 		kfree_rcu(local_storage, rcu);
118 }
119 
120 /* Handle use_kmalloc_nolock == false */
121 static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
122 				     bool vanilla_rcu)
123 {
124 	if (vanilla_rcu)
125 		kfree_rcu(local_storage, rcu);
126 	else
127 		call_rcu_tasks_trace(&local_storage->rcu,
128 				     __bpf_local_storage_free_trace_rcu);
129 }
130 
131 static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
132 {
133 	struct bpf_local_storage *local_storage;
134 
135 	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
136 	kfree_nolock(local_storage);
137 }
138 
139 static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
140 {
141 	if (rcu_trace_implies_rcu_gp())
142 		bpf_local_storage_free_rcu(rcu);
143 	else
144 		call_rcu(rcu, bpf_local_storage_free_rcu);
145 }
146 
147 static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
148 				   bool reuse_now)
149 {
150 	if (!local_storage)
151 		return;
152 
153 	if (!local_storage->use_kmalloc_nolock) {
154 		__bpf_local_storage_free(local_storage, reuse_now);
155 		return;
156 	}
157 
158 	if (reuse_now) {
159 		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
160 		return;
161 	}
162 
163 	call_rcu_tasks_trace(&local_storage->rcu,
164 			     bpf_local_storage_free_trace_rcu);
165 }
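
/*
 * Summary of the freeing strategy implemented above:
 *
 *	use_kmalloc_nolock  reuse_now  freeing path
 *	------------------  ---------  -------------------------------------
 *	false               true       kfree_rcu()
 *	false               false      tasks trace gp (+ regular RCU gp if
 *	                               not implied), then kfree()
 *	true                true       regular RCU gp, then kfree_nolock()
 *	true                false      tasks trace gp (+ regular RCU gp if
 *	                               not implied), then kfree_nolock()
 *
 * reuse_now == false keeps the object around for sleepable bpf programs
 * that may still hold a pointer to it under rcu_read_lock_trace().
 */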
166 
167 /* rcu tasks trace callback for use_kmalloc_nolock == false */
168 static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
169 {
170 	struct bpf_local_storage_elem *selem;
171 
172 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
173 	if (rcu_trace_implies_rcu_gp())
174 		kfree(selem);
175 	else
176 		kfree_rcu(selem, rcu);
177 }
178 
179 /* Handle use_kmalloc_nolock == false */
180 static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
181 			     bool vanilla_rcu)
182 {
183 	if (vanilla_rcu)
184 		kfree_rcu(selem, rcu);
185 	else
186 		call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
187 }
188 
189 static void bpf_selem_free_rcu(struct rcu_head *rcu)
190 {
191 	struct bpf_local_storage_elem *selem;
192 	struct bpf_local_storage_map *smap;
193 
194 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
195 	/* The bpf_local_storage_map_free will wait for rcu_barrier */
196 	smap = rcu_dereference_check(SDATA(selem)->smap, 1);
197 
198 	if (smap) {
199 		migrate_disable();
200 		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
201 		migrate_enable();
202 	}
203 	kfree_nolock(selem);
204 }
205 
206 static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
207 {
208 	if (rcu_trace_implies_rcu_gp())
209 		bpf_selem_free_rcu(rcu);
210 	else
211 		call_rcu(rcu, bpf_selem_free_rcu);
212 }
213 
214 void bpf_selem_free(struct bpf_local_storage_elem *selem,
215 		    bool reuse_now)
216 {
217 	struct bpf_local_storage_map *smap;
218 
219 	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
220 
221 	if (!selem->use_kmalloc_nolock) {
222 		/*
223 		 * No uptr will be unpinned even when reuse_now == false since uptr
224 		 * is only supported in task local storage, where
225 		 * smap->use_kmalloc_nolock == true.
226 		 */
227 		if (smap)
228 			bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
229 		__bpf_selem_free(selem, reuse_now);
230 		return;
231 	}
232 
233 	if (reuse_now) {
234 		/*
235 		 * While it is okay to call bpf_obj_free_fields() that unpins uptr when
236 		 * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
237 		 */
238 		call_rcu(&selem->rcu, bpf_selem_free_rcu);
239 		return;
240 	}
241 
242 	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
243 }
244 
245 static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
246 {
247 	struct bpf_local_storage_elem *selem;
248 	struct hlist_node *n;
249 
250 	/* The "_safe" iteration is needed.
251 	 * The loop is not removing the selem from the list
252 	 * but bpf_selem_free will use the selem->rcu_head
253 	 * which is in a union with the selem->free_node.
254 	 */
255 	hlist_for_each_entry_safe(selem, n, list, free_node)
256 		bpf_selem_free(selem, reuse_now);
257 }
258 
259 static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
260 						 struct bpf_local_storage_map *smap,
261 						 struct bpf_local_storage *local_storage,
262 						 bool free_local_storage, bool pin_owner)
263 {
264 	void *owner = local_storage->owner;
265 	u32 uncharge = smap->elem_size;
266 
267 	if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
268 	    SDATA(selem))
269 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
270 
271 	if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt))
272 		return;
273 
274 	uncharge += free_local_storage ? sizeof(*local_storage) : 0;
275 	mem_uncharge(smap, local_storage->owner, uncharge);
276 	local_storage->mem_charge -= uncharge;
277 
278 	if (free_local_storage) {
279 		local_storage->owner = NULL;
280 
281 		/* After this RCU_INIT, owner may be freed and cannot be used */
282 		RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
283 	}
284 
285 	if (pin_owner)
286 		refcount_dec(&local_storage->owner_refcnt);
287 }
288 
289 /* local_storage->lock must be held and selem->local_storage == local_storage.
290  * The caller must ensure selem->smap is still valid to be
291  * dereferenced for its smap->elem_size and smap->cache_idx.
292  */
293 static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
294 					    struct bpf_local_storage_elem *selem,
295 					    struct hlist_head *free_selem_list)
296 {
297 	struct bpf_local_storage_map *smap;
298 	bool free_local_storage;
299 
300 	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
301 
302 	free_local_storage = hlist_is_singular_node(&selem->snode,
303 						    &local_storage->list);
304 
305 	bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage,
306 					     free_local_storage, false);
307 
308 	hlist_del_init_rcu(&selem->snode);
309 
310 	hlist_add_head(&selem->free_node, free_selem_list);
311 
312 	return free_local_storage;
313 }
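
/*
 * The return value of bpf_selem_unlink_storage_nolock() tells the caller
 * whether the last selem was just removed and the bpf_local_storage itself
 * must be freed.  The freeing cannot happen under local_storage->lock, so
 * callers follow the pattern used by bpf_selem_unlink() below:
 *
 *	raw_res_spin_lock_irqsave(&local_storage->lock, flags);
 *	free_local_storage = bpf_selem_unlink_storage_nolock(local_storage,
 *							     selem, &free_list);
 *	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
 *	bpf_selem_free_list(&free_list, false);
 *	if (free_local_storage)
 *		bpf_local_storage_free(local_storage, false);
 */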
314 
315 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
316 				   struct bpf_local_storage_elem *selem)
317 {
318 	struct bpf_local_storage_map *smap;
319 
320 	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
321 	local_storage->mem_charge += smap->elem_size;
322 
323 	RCU_INIT_POINTER(selem->local_storage, local_storage);
324 	hlist_add_head_rcu(&selem->snode, &local_storage->list);
325 }
326 
327 static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
328 {
329 	struct bpf_local_storage *local_storage;
330 	struct bpf_local_storage_map *smap;
331 	struct bpf_local_storage_map_bucket *b;
332 	unsigned long flags;
333 	int err;
334 
335 	local_storage = rcu_dereference_check(selem->local_storage,
336 					      bpf_rcu_lock_held());
337 	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
338 	b = select_bucket(smap, local_storage);
339 	err = raw_res_spin_lock_irqsave(&b->lock, flags);
340 	if (err)
341 		return err;
342 
343 	hlist_del_init_rcu(&selem->map_node);
344 	raw_res_spin_unlock_irqrestore(&b->lock, flags);
345 
346 	return 0;
347 }
348 
349 static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
350 {
351 	hlist_del_init_rcu(&selem->map_node);
352 }
353 
354 int bpf_selem_link_map(struct bpf_local_storage_map *smap,
355 		       struct bpf_local_storage *local_storage,
356 		       struct bpf_local_storage_elem *selem)
357 {
358 	struct bpf_local_storage_map_bucket *b;
359 	unsigned long flags;
360 	int err;
361 
362 	b = select_bucket(smap, local_storage);
363 
364 	err = raw_res_spin_lock_irqsave(&b->lock, flags);
365 	if (err)
366 		return err;
367 
368 	hlist_add_head_rcu(&selem->map_node, &b->list);
369 	raw_res_spin_unlock_irqrestore(&b->lock, flags);
370 
371 	return 0;
372 }
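
/*
 * b->lock and local_storage->lock are rqspinlocks (resilient queued
 * spinlocks), so raw_res_spin_lock_irqsave() can fail, e.g. when a deadlock
 * or timeout is detected, instead of spinning forever.  That is why
 * bpf_selem_link_map() and bpf_selem_unlink_map() return an int and every
 * caller has to be prepared to bail out with the selem left as it was.
 */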
373 
374 static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
375 				      struct bpf_local_storage_elem *selem)
376 {
377 	hlist_add_head_rcu(&selem->map_node, &b->list);
378 }
379 
380 /*
381  * Unlink an selem from map and local storage with lock held.
382  * This is the common path used by local storages to delete an selem.
383  */
384 int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
385 {
386 	struct bpf_local_storage *local_storage;
387 	bool free_local_storage = false;
388 	HLIST_HEAD(selem_free_list);
389 	unsigned long flags;
390 	int err;
391 
392 	if (unlikely(!selem_linked_to_storage_lockless(selem)))
393 		/* selem has already been unlinked from sk */
394 		return 0;
395 
396 	local_storage = rcu_dereference_check(selem->local_storage,
397 					      bpf_rcu_lock_held());
398 
399 	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
400 	if (err)
401 		return err;
402 
403 	if (likely(selem_linked_to_storage(selem))) {
404 		/* Always unlink from map before unlinking from local_storage
405 		 * because selem will be freed after successfully unlinked from
406 		 * the local_storage.
407 		 */
408 		err = bpf_selem_unlink_map(selem);
409 		if (err)
410 			goto out;
411 
412 		free_local_storage = bpf_selem_unlink_storage_nolock(
413 			local_storage, selem, &selem_free_list);
414 	}
415 out:
416 	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
417 
418 	bpf_selem_free_list(&selem_free_list, false);
419 
420 	if (free_local_storage)
421 		bpf_local_storage_free(local_storage, false);
422 
423 	return err;
424 }
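
/*
 * Lock ordering in the common delete path above: local_storage->lock is
 * taken first, then b->lock inside bpf_selem_unlink_map().  While the locks
 * are held the selem is only collected on selem_free_list; the actual
 * bpf_selem_free() and bpf_local_storage_free() calls happen after both
 * locks have been dropped.
 */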
425 
426 /*
427  * Unlink an selem from map and local storage with lockless fallback if callers
428  * are racing or rqspinlock returns an error. It should only be called by
429  * bpf_local_storage_destroy() or bpf_local_storage_map_free().
430  */
431 static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
432 				    struct bpf_local_storage_map_bucket *b)
433 {
434 	bool in_map_free = !!b, free_storage = false;
435 	struct bpf_local_storage *local_storage;
436 	struct bpf_local_storage_map *smap;
437 	unsigned long flags;
438 	int err, unlink = 0;
439 
440 	local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
441 	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
442 
443 	if (smap) {
444 		b = b ? : select_bucket(smap, local_storage);
445 		err = raw_res_spin_lock_irqsave(&b->lock, flags);
446 		if (!err) {
447 			/*
448 			 * Call bpf_obj_free_fields() under b->lock to make sure it is done
449 			 * exactly once for an selem. Safe to free special fields immediately
450 			 * as no BPF program should be referencing the selem.
451 			 */
452 			if (likely(selem_linked_to_map(selem))) {
453 				hlist_del_init_rcu(&selem->map_node);
454 				bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
455 				unlink++;
456 			}
457 			raw_res_spin_unlock_irqrestore(&b->lock, flags);
458 		}
459 		/*
460 		 * Highly unlikely scenario: resource leak
461 		 *
462 		 * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
463 		 * and both selems belong to the same bucket, if destroy(selem2) acquires
464 		 * b->lock and holds it for too long, neither map_free(selem1) nor
465 		 * destroy(selem1) will be able to free the special field associated
466 		 * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
467 		 */
468 		WARN_ON_ONCE(err && in_map_free);
469 		if (!err || in_map_free)
470 			RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
471 	}
472 
473 	if (local_storage) {
474 		err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
475 		if (!err) {
476 			if (likely(selem_linked_to_storage(selem))) {
477 				free_storage = hlist_is_singular_node(&selem->snode,
478 								      &local_storage->list);
479 				 /*
480 				  * Okay to skip clearing owner_storage and storage->owner in
481 				  * destroy() since the owner is going away. No user or bpf
482 				  * programs should be able to reference it.
483 				  */
484 				if (smap && in_map_free)
485 					bpf_selem_unlink_storage_nolock_misc(
486 						selem, smap, local_storage,
487 						free_storage, true);
488 				hlist_del_init_rcu(&selem->snode);
489 				unlink++;
490 			}
491 			raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
492 		}
493 		if (!err || !in_map_free)
494 			RCU_INIT_POINTER(selem->local_storage, NULL);
495 	}
496 
497 	if (unlink != 2)
498 		atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state);
499 
500 	/*
501 	 * Normally, an selem can be unlinked under local_storage->lock and b->lock, and
502 	 * then freed after an RCU grace period. However, if destroy() and map_free() are
503 	 * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
504 	 * the selem only after both map_free() and destroy() see the selem.
505 	 */
506 	if (unlink == 2 ||
507 	    atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED)
508 		bpf_selem_free(selem, true);
509 
510 	if (free_storage)
511 		bpf_local_storage_free(local_storage, true);
512 }
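
/*
 * bpf_selem_unlink_nofail() can run twice for the same selem: once from
 * bpf_local_storage_map_free() (b != NULL) and once from
 * bpf_local_storage_destroy() (b == NULL).  When a single call cannot
 * complete both unlinks (unlink != 2), selem->state makes sure the selem is
 * freed exactly once: each side ORs in its SELEM_MAP_UNLINKED or
 * SELEM_STORAGE_UNLINKED bit, and the side that then observes the combined
 * SELEM_UNLINKED value and wins the cmpxchg to SELEM_TOFREE does the free.
 * This description assumes SELEM_UNLINKED is the OR of the two per-side
 * bits as defined in include/linux/bpf_local_storage.h.
 */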
513 
514 void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
515 				      struct bpf_local_storage_map *smap,
516 				      struct bpf_local_storage_elem *selem)
517 {
518 	unsigned long flags;
519 	int err;
520 
521 	/* spinlock is needed to avoid racing with the
522 	 * parallel delete.  Otherwise, publishing an already
523 	 * deleted sdata to the cache will become a use-after-free
524 	 * problem in the next bpf_local_storage_lookup().
525 	 */
526 	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
527 	if (err)
528 		return;
529 
530 	if (selem_linked_to_storage(selem))
531 		rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
532 	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
533 }
534 
535 static int check_flags(const struct bpf_local_storage_data *old_sdata,
536 		       u64 map_flags)
537 {
538 	if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
539 		/* elem already exists */
540 		return -EEXIST;
541 
542 	if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
543 		/* elem doesn't exist, cannot update it */
544 		return -ENOENT;
545 
546 	return 0;
547 }
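
/*
 * check_flags() implements the usual bpf map update flag semantics against
 * an optional existing element:
 *
 *	map_flags (BPF_F_LOCK ignored)  old_sdata == NULL  old_sdata != NULL
 *	------------------------------  -----------------  -----------------
 *	BPF_ANY                         ok                 ok
 *	BPF_NOEXIST                     ok                 -EEXIST
 *	BPF_EXIST                       -ENOENT            ok
 */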
548 
549 int bpf_local_storage_alloc(void *owner,
550 			    struct bpf_local_storage_map *smap,
551 			    struct bpf_local_storage_elem *first_selem,
552 			    gfp_t gfp_flags)
553 {
554 	struct bpf_local_storage *prev_storage, *storage;
555 	struct bpf_local_storage **owner_storage_ptr;
556 	struct bpf_local_storage_map_bucket *b;
557 	unsigned long flags;
558 	int err;
559 
560 	err = mem_charge(smap, owner, sizeof(*storage));
561 	if (err)
562 		return err;
563 
564 	if (smap->use_kmalloc_nolock)
565 		storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
566 						 __GFP_ZERO, NUMA_NO_NODE);
567 	else
568 		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
569 					  gfp_flags | __GFP_NOWARN);
570 	if (!storage) {
571 		err = -ENOMEM;
572 		goto uncharge;
573 	}
574 
575 	INIT_HLIST_HEAD(&storage->list);
576 	raw_res_spin_lock_init(&storage->lock);
577 	storage->owner = owner;
578 	storage->mem_charge = sizeof(*storage);
579 	storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
580 	refcount_set(&storage->owner_refcnt, 1);
581 
582 	bpf_selem_link_storage_nolock(storage, first_selem);
583 
584 	b = select_bucket(smap, storage);
585 	err = raw_res_spin_lock_irqsave(&b->lock, flags);
586 	if (err)
587 		goto uncharge;
588 
589 	bpf_selem_link_map_nolock(b, first_selem);
590 
591 	owner_storage_ptr =
592 		(struct bpf_local_storage **)owner_storage(smap, owner);
593 	/* Publish storage to the owner.
594 	 * Instead of using any lock of the kernel object (i.e. owner),
595 	 * cmpxchg will work with any kernel object regardless what
596 	 * cmpxchg will work with any kernel object regardless of what
597 	 * the running context is: bh, irq, etc.
598 	 * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
599 	 * is protected by the storage->lock.  Hence, when freeing
600 	 * the owner->storage, the storage->lock must be held before
601 	 * setting owner->storage ptr to NULL.
602 	 */
603 	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
604 	if (unlikely(prev_storage)) {
605 		bpf_selem_unlink_map_nolock(first_selem);
606 		raw_res_spin_unlock_irqrestore(&b->lock, flags);
607 		err = -EAGAIN;
608 		goto uncharge;
609 	}
610 	raw_res_spin_unlock_irqrestore(&b->lock, flags);
611 
612 	return 0;
613 
614 uncharge:
615 	bpf_local_storage_free(storage, true);
616 	mem_uncharge(smap, owner, sizeof(*storage));
617 	return err;
618 }
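
/*
 * The cmpxchg() above is the only point where a new bpf_local_storage is
 * published to the owner.  Losing the race (prev_storage != NULL) means
 * another CPU already attached a storage to this owner: the half-initialized
 * storage is freed with reuse_now == true, the charge is undone and -EAGAIN
 * is returned for the caller to handle.
 */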
619 
620 /* sk cannot be going away because it is linking a new elem
621  * to sk->sk_bpf_storage (i.e. sk->sk_refcnt cannot be 0).
622  * Otherwise, the new elem will become a leak (and cause other
623  * memory issues during map destruction).
624  */
625 struct bpf_local_storage_data *
626 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
627 			 void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
628 {
629 	struct bpf_local_storage_data *old_sdata = NULL;
630 	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
631 	struct bpf_local_storage *local_storage;
632 	struct bpf_local_storage_map_bucket *b;
633 	HLIST_HEAD(old_selem_free_list);
634 	unsigned long flags, b_flags;
635 	int err;
636 
637 	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
638 	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
639 	    /* BPF_F_LOCK can only be used in a value with spin_lock */
640 	    unlikely((map_flags & BPF_F_LOCK) &&
641 		     !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
642 		return ERR_PTR(-EINVAL);
643 
644 	if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
645 		return ERR_PTR(-EINVAL);
646 
647 	local_storage = rcu_dereference_check(*owner_storage(smap, owner),
648 					      bpf_rcu_lock_held());
649 	if (!local_storage || hlist_empty(&local_storage->list)) {
650 		/* Very first elem for the owner */
651 		err = check_flags(NULL, map_flags);
652 		if (err)
653 			return ERR_PTR(err);
654 
655 		selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
656 		if (!selem)
657 			return ERR_PTR(-ENOMEM);
658 
659 		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
660 		if (err) {
661 			bpf_selem_free(selem, true);
662 			mem_uncharge(smap, owner, smap->elem_size);
663 			return ERR_PTR(err);
664 		}
665 
666 		return SDATA(selem);
667 	}
668 
669 	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
670 		/* Hoping to find an old_sdata to do an inline update
671 		 * such that it can avoid taking the local_storage->lock
672 		 * and changing the lists.
673 		 */
674 		old_sdata =
675 			bpf_local_storage_lookup(local_storage, smap, false);
676 		err = check_flags(old_sdata, map_flags);
677 		if (err)
678 			return ERR_PTR(err);
679 		if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
680 			copy_map_value_locked(&smap->map, old_sdata->data,
681 					      value, false);
682 			return old_sdata;
683 		}
684 	}
685 
686 	/* A lookup has just been done before and concluded that a new selem
687 	 * is needed, so an unnecessary alloc is unlikely.
688 	 */
689 	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
690 	if (!alloc_selem)
691 		return ERR_PTR(-ENOMEM);
692 
693 	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
694 	if (err)
695 		goto free_selem;
696 
697 	/* Recheck local_storage->list under local_storage->lock */
698 	if (unlikely(hlist_empty(&local_storage->list))) {
699 		/* A parallel del is happening and local_storage is going
700 		 * away.  It has just been checked before, so very
701 		 * unlikely.  Return instead of retry to keep things
702 		 * simple.
703 		 */
704 		err = -EAGAIN;
705 		goto unlock;
706 	}
707 
708 	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
709 	err = check_flags(old_sdata, map_flags);
710 	if (err)
711 		goto unlock;
712 
713 	if (old_sdata && (map_flags & BPF_F_LOCK)) {
714 		copy_map_value_locked(&smap->map, old_sdata->data, value,
715 				      false);
716 		selem = SELEM(old_sdata);
717 		goto unlock;
718 	}
719 
720 	b = select_bucket(smap, local_storage);
721 
722 	err = raw_res_spin_lock_irqsave(&b->lock, b_flags);
723 	if (err)
724 		goto unlock;
725 
726 	alloc_selem = NULL;
727 	/* First, link the new selem to the map */
728 	bpf_selem_link_map_nolock(b, selem);
729 
730 	/* Second, link (and publish) the new selem to local_storage */
731 	bpf_selem_link_storage_nolock(local_storage, selem);
732 
733 	/* Third, remove old selem, SELEM(old_sdata) */
734 	if (old_sdata) {
735 		bpf_selem_unlink_map_nolock(SELEM(old_sdata));
736 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
737 						&old_selem_free_list);
738 	}
739 
740 	raw_res_spin_unlock_irqrestore(&b->lock, b_flags);
741 unlock:
742 	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
743 free_selem:
744 	bpf_selem_free_list(&old_selem_free_list, false);
745 	if (alloc_selem) {
746 		mem_uncharge(smap, owner, smap->elem_size);
747 		bpf_selem_free(alloc_selem, true);
748 	}
749 	return err ? ERR_PTR(err) : SDATA(selem);
750 }
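
/*
 * Overall flow of bpf_local_storage_update():
 *
 * 1. No storage yet (or an empty list): allocate the first selem plus the
 *    bpf_local_storage and publish the storage to the owner.
 * 2. BPF_F_LOCK without BPF_NOEXIST: try a lockless lookup and update the
 *    existing value in place under the value's spin_lock.
 * 3. Otherwise: allocate a new selem, then under local_storage->lock and
 *    b->lock link it to the map and the storage and unlink the old selem,
 *    which is freed only after the locks are dropped.
 */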
751 
752 static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
753 {
754 	u64 min_usage = U64_MAX;
755 	u16 i, res = 0;
756 
757 	spin_lock(&cache->idx_lock);
758 
759 	for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
760 		if (cache->idx_usage_counts[i] < min_usage) {
761 			min_usage = cache->idx_usage_counts[i];
762 			res = i;
763 
764 			/* Found a free cache_idx */
765 			if (!min_usage)
766 				break;
767 		}
768 	}
769 	cache->idx_usage_counts[res]++;
770 
771 	spin_unlock(&cache->idx_lock);
772 
773 	return res;
774 }
775 
776 static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
777 					     u16 idx)
778 {
779 	spin_lock(&cache->idx_lock);
780 	cache->idx_usage_counts[idx]--;
781 	spin_unlock(&cache->idx_lock);
782 }
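
/*
 * Every local storage map gets one of the BPF_LOCAL_STORAGE_CACHE_SIZE slots
 * in the per-owner cache array (local_storage->cache[]).  The least-used
 * slot is picked at map creation time and released on map free, so that
 * unrelated maps are spread across slots and a lookup can usually be served
 * from local_storage->cache[smap->cache_idx] instead of walking
 * local_storage->list.
 */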
783 
784 int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
785 {
786 	if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
787 	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
788 	    attr->max_entries ||
789 	    attr->key_size != sizeof(int) || !attr->value_size ||
790 	    /* Enforce BTF for userspace sk dumping */
791 	    !attr->btf_key_type_id || !attr->btf_value_type_id)
792 		return -EINVAL;
793 
794 	if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
795 		return -E2BIG;
796 
797 	return 0;
798 }
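
/*
 * From userspace, a map that passes the checks above is typically declared
 * through libbpf along these lines (illustrative only, "struct my_value" and
 * "my_storage" are made-up names):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
 *		__uint(map_flags, BPF_F_NO_PREALLOC);
 *		__type(key, int);
 *		__type(value, struct my_value);
 *	} my_storage SEC(".maps");
 *
 * i.e. BPF_F_NO_PREALLOC is mandatory, the key must be a 4-byte int,
 * max_entries stays 0, and BTF for both key and value must be present
 * (which the __type() macros provide).
 */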
799 
800 int bpf_local_storage_map_check_btf(const struct bpf_map *map,
801 				    const struct btf *btf,
802 				    const struct btf_type *key_type,
803 				    const struct btf_type *value_type)
804 {
805 	if (!btf_type_is_i32(key_type))
806 		return -EINVAL;
807 
808 	return 0;
809 }
810 
811 /*
812  * Destroy local storage when the owner is going away. Caller must uncharge memory
813  * if memory charging is used.
814  */
815 u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
816 {
817 	struct bpf_local_storage_elem *selem;
818 
819 	/* Neither the bpf_prog nor the bpf_map's syscall
820 	 * could be modifying the local_storage->list now.
821 	 * Thus, no elem can be added to or deleted from the
822 	 * local_storage->list by the bpf_prog or by the bpf_map's syscall.
823 	 *
824 	 * It is racing with bpf_local_storage_map_free() alone
825 	 * when unlinking elem from the local_storage->list and
826 	 * the map's bucket->list.
827 	 */
828 	hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
829 		bpf_selem_unlink_nofail(selem, NULL);
830 
831 	if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
832 		while (refcount_read(&local_storage->owner_refcnt))
833 			cpu_relax();
834 		/*
835 		 * Paired with refcount_dec() in bpf_selem_unlink_nofail()
836 		 * to make sure destroy() sees the correct local_storage->mem_charge.
837 		 */
838 		smp_mb();
839 	}
840 
841 	return local_storage->mem_charge;
842 }
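
/*
 * bpf_local_storage_destroy() runs when the owner itself is being freed
 * (e.g. from bpf_sk_storage_free()), so nothing can add new selems anymore
 * and only bpf_local_storage_map_free() can race with it.  The returned
 * mem_charge lets the caller uncharge whatever is still accounted to the
 * owner; the owner_refcnt spin above makes sure a racing
 * bpf_selem_unlink_nofail() has finished updating mem_charge before it is
 * read.
 */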
843 
844 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
845 {
846 	struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
847 	u64 usage = sizeof(*smap);
848 
849 	/* The dynamically allocated selems are not counted currently. */
850 	usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
851 	return usage;
852 }
853 
854 struct bpf_map *
855 bpf_local_storage_map_alloc(union bpf_attr *attr,
856 			    struct bpf_local_storage_cache *cache,
857 			    bool use_kmalloc_nolock)
858 {
859 	struct bpf_local_storage_map *smap;
860 	unsigned int i;
861 	u32 nbuckets;
862 	int err;
863 
864 	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
865 	if (!smap)
866 		return ERR_PTR(-ENOMEM);
867 	bpf_map_init_from_attr(&smap->map, attr);
868 
869 	nbuckets = roundup_pow_of_two(num_possible_cpus());
870 	/* Use at least 2 buckets; select_bucket() is undefined behavior with 1 bucket */
871 	nbuckets = max_t(u32, 2, nbuckets);
872 	smap->bucket_log = ilog2(nbuckets);
873 
874 	smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
875 					 sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
876 	if (!smap->buckets) {
877 		err = -ENOMEM;
878 		goto free_smap;
879 	}
880 
881 	for (i = 0; i < nbuckets; i++) {
882 		INIT_HLIST_HEAD(&smap->buckets[i].list);
883 		raw_res_spin_lock_init(&smap->buckets[i].lock);
884 	}
885 
886 	smap->elem_size = offsetof(struct bpf_local_storage_elem,
887 				   sdata.data[attr->value_size]);
888 
889 	/* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in a
890 	 * non-preemptible context. Thus, enforce all storages to use
891 	 * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
892 	 */
893 	smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
894 
895 	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
896 	return &smap->map;
897 
898 free_smap:
899 	kvfree(smap->buckets);
900 	bpf_map_area_free(smap);
901 	return ERR_PTR(err);
902 }
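
/*
 * Bucket sizing above scales with the number of possible CPUs: e.g. with 6
 * possible CPUs, roundup_pow_of_two(6) = 8 buckets and bucket_log = 3; with
 * a single CPU the minimum of 2 buckets (bucket_log = 1) is used.
 * smap->elem_size covers the selem header plus attr->value_size bytes of
 * map value, since sdata.data is the flexible array at the end of
 * struct bpf_local_storage_elem.
 */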
903 
904 void bpf_local_storage_map_free(struct bpf_map *map,
905 				struct bpf_local_storage_cache *cache)
906 {
907 	struct bpf_local_storage_map_bucket *b;
908 	struct bpf_local_storage_elem *selem;
909 	struct bpf_local_storage_map *smap;
910 	unsigned int i;
911 
912 	smap = (struct bpf_local_storage_map *)map;
913 	bpf_local_storage_cache_idx_free(cache, smap->cache_idx);
914 
915 	/* Note that this map might be concurrently cloned from
916 	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
917 	 * RCU read section to finish before proceeding. New RCU
918 	 * read sections should be prevented via bpf_map_inc_not_zero.
919 	 */
920 	synchronize_rcu();
921 
922 	/* bpf prog and the userspace can no longer access this map
923 	 * now.  No new selem (of this map) can be added
924 	 * to the owner->storage or to the map bucket's list.
925 	 *
926 	 * The elem of this map can be cleaned up here
927 	 * or when the storage is freed e.g.
928 	 * by bpf_sk_storage_free() during __sk_destruct().
929 	 */
930 	for (i = 0; i < (1U << smap->bucket_log); i++) {
931 		b = &smap->buckets[i];
932 
933 		rcu_read_lock();
934 		/* No one is adding to b->list now */
935 restart:
936 		hlist_for_each_entry_rcu(selem, &b->list, map_node) {
937 			bpf_selem_unlink_nofail(selem, b);
938 
939 			if (need_resched()) {
940 				cond_resched_rcu();
941 				goto restart;
942 			}
943 		}
944 		rcu_read_unlock();
945 	}
946 
947 	/* While freeing the storage we may still need to access the map.
948 	 *
949 	 * e.g. when bpf_sk_storage_free() has unlinked selem from the map
950 	 * which then made the above hlist_for_each_entry_rcu() loop
951 	 * exit immediately.
952 	 *
953 	 * However, while freeing the storage one still needs to access the
954 	 * smap->elem_size to do the uncharging in
955 	 * bpf_selem_unlink_storage_nolock().
956 	 *
957 	 * Hence, wait another rcu grace period for the storage to be freed.
958 	 */
959 	synchronize_rcu();
960 
961 	if (smap->use_kmalloc_nolock) {
962 		rcu_barrier_tasks_trace();
963 		rcu_barrier();
964 	}
965 	kvfree(smap->buckets);
966 	bpf_map_area_free(smap);
967 }
968