xref: /linux/kernel/bpf/bpf_local_storage.c (revision a075082a15e7f5c4889d0cbb51a4041c332cb00c)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2019 Facebook  */
3 #include <linux/rculist.h>
4 #include <linux/list.h>
5 #include <linux/hash.h>
6 #include <linux/types.h>
7 #include <linux/spinlock.h>
8 #include <linux/bpf.h>
9 #include <linux/btf_ids.h>
10 #include <linux/bpf_local_storage.h>
11 #include <net/sock.h>
12 #include <uapi/linux/sock_diag.h>
13 #include <uapi/linux/btf.h>
14 #include <linux/rcupdate.h>
15 #include <linux/rcupdate_trace.h>
16 #include <linux/rcupdate_wait.h>
17 
18 #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
19 
20 static struct bpf_local_storage_map_bucket *
21 select_bucket(struct bpf_local_storage_map *smap,
22 	      struct bpf_local_storage *local_storage)
23 {
24 	return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)];
25 }
26 
27 static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
28 {
29 	struct bpf_map *map = &smap->map;
30 
31 	if (!map->ops->map_local_storage_charge)
32 		return 0;
33 
34 	return map->ops->map_local_storage_charge(smap, owner, size);
35 }
36 
37 static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
38 			 u32 size)
39 {
40 	struct bpf_map *map = &smap->map;
41 
42 	if (map->ops->map_local_storage_uncharge)
43 		map->ops->map_local_storage_uncharge(smap, owner, size);
44 }
45 
46 static struct bpf_local_storage __rcu **
47 owner_storage(struct bpf_local_storage_map *smap, void *owner)
48 {
49 	struct bpf_map *map = &smap->map;
50 
51 	return map->ops->map_owner_storage_ptr(owner);
52 }
53 
/* True if @selem is hashed into a local_storage->list. Lockless read;
 * may be called without holding local_storage->lock.
 */
static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed_lockless(&selem->snode);
}
58 
/* True if @selem is hashed into a local_storage->list. Caller must hold
 * local_storage->lock for a stable answer.
 */
static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->snode);
}
63 
/* True if @selem is hashed into a map bucket's list. Caller must hold
 * the bucket lock for a stable answer.
 */
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->map_node);
}
68 
/* Allocate and initialize a new map element for @owner.
 *
 * Memory is charged to @owner up front and uncharged again if the
 * allocation fails.  Returns the new element, or NULL on charge or
 * allocation failure.  The caller owns the element until it is linked
 * into a map bucket / local_storage.
 */
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
		void *value, bool swap_uptrs, gfp_t gfp_flags)
{
	struct bpf_local_storage_elem *selem;

	if (mem_charge(smap, owner, smap->elem_size))
		return NULL;

	if (smap->use_kmalloc_nolock) {
		/* kmalloc_nolock() path ignores gfp_flags; always zero init */
		selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
					       __GFP_ZERO, NUMA_NO_NODE);
	} else {
		selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
					gfp_flags | __GFP_NOWARN);
	}

	if (selem) {
		RCU_INIT_POINTER(SDATA(selem)->smap, smap);
		/* state tracks unlink progress, see bpf_selem_unlink_nofail() */
		atomic_set(&selem->state, 0);
		/* Remember the allocator so bpf_selem_free() frees accordingly */
		selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;

		if (value) {
			/* No need to call check_and_init_map_value as memory is zero init */
			copy_map_value(&smap->map, SDATA(selem)->data, value);
			if (swap_uptrs)
				bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
		}
		return selem;
	}

	/* Allocation failed: revert the charge taken above */
	mem_uncharge(smap, owner, smap->elem_size);

	return NULL;
}
104 
/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage *local_storage;

	/*
	 * RCU Tasks Trace grace period implies RCU grace period, do
	 * kfree() directly.
	 */
	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
	kfree(local_storage);
}
117 
118 /* Handle use_kmalloc_nolock == false */
119 static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
120 				     bool vanilla_rcu)
121 {
122 	if (vanilla_rcu)
123 		kfree_rcu(local_storage, rcu);
124 	else
125 		call_rcu_tasks_trace(&local_storage->rcu,
126 				     __bpf_local_storage_free_trace_rcu);
127 }
128 
/* rcu callback for use_kmalloc_nolock == true: must use kfree_nolock() */
static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage *local_storage;

	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
	kfree_nolock(local_storage);
}
136 
/* rcu tasks trace callback for use_kmalloc_nolock == true */
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
	/*
	 * RCU Tasks Trace grace period implies RCU grace period, do
	 * kfree() directly.
	 */
	bpf_local_storage_free_rcu(rcu);
}
145 
146 static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
147 				   bool reuse_now)
148 {
149 	if (!local_storage)
150 		return;
151 
152 	if (!local_storage->use_kmalloc_nolock) {
153 		__bpf_local_storage_free(local_storage, reuse_now);
154 		return;
155 	}
156 
157 	if (reuse_now) {
158 		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
159 		return;
160 	}
161 
162 	call_rcu_tasks_trace(&local_storage->rcu,
163 			     bpf_local_storage_free_trace_rcu);
164 }
165 
/* rcu callback for use_kmalloc_nolock == false */
static void __bpf_selem_free_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;

	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
	/* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */
	smap = rcu_dereference_check(SDATA(selem)->smap, 1);

	if (smap)
		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
	kfree(selem);
}
180 
/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
	/*
	 * RCU Tasks Trace grace period implies RCU grace period, do
	 * kfree() directly.
	 */
	__bpf_selem_free_rcu(rcu);
}
190 
191 /* Handle use_kmalloc_nolock == false */
192 static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
193 			     bool vanilla_rcu)
194 {
195 	if (vanilla_rcu)
196 		call_rcu(&selem->rcu, __bpf_selem_free_rcu);
197 	else
198 		call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
199 }
200 
/* rcu callback for use_kmalloc_nolock == true: must use kfree_nolock() */
static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;

	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
	/* The bpf_local_storage_map_free will wait for rcu_barrier */
	smap = rcu_dereference_check(SDATA(selem)->smap, 1);

	if (smap)
		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
	kfree_nolock(selem);
}
214 
/* rcu tasks trace callback for use_kmalloc_nolock == true */
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
	/*
	 * RCU Tasks Trace grace period implies RCU grace period, do
	 * kfree() directly.
	 */
	bpf_selem_free_rcu(rcu);
}
223 
224 void bpf_selem_free(struct bpf_local_storage_elem *selem,
225 		    bool reuse_now)
226 {
227 	if (!selem->use_kmalloc_nolock) {
228 		/*
229 		 * No uptr will be unpin even when reuse_now == false since uptr
230 		 * is only supported in task local storage, where
231 		 * smap->use_kmalloc_nolock == true.
232 		 */
233 		__bpf_selem_free(selem, reuse_now);
234 		return;
235 	}
236 
237 	if (reuse_now) {
238 		/*
239 		 * While it is okay to call bpf_obj_free_fields() that unpins uptr when
240 		 * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
241 		 */
242 		call_rcu(&selem->rcu, bpf_selem_free_rcu);
243 		return;
244 	}
245 
246 	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
247 }
248 
/* Free every selem collected on @list (chained via selem->free_node). */
static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
{
	struct bpf_local_storage_elem *selem;
	struct hlist_node *n;

	/* The "_safe" iteration is needed.
	 * The loop is not removing the selem from the list
	 * but bpf_selem_free will use the selem->rcu_head
	 * which is union-ized with the selem->free_node.
	 */
	hlist_for_each_entry_safe(selem, n, list, free_node)
		bpf_selem_free(selem, reuse_now);
}
262 
/* Bookkeeping common to unlinking @selem from @local_storage:
 * invalidate the lookup cache slot, uncharge memory and, when the last
 * elem is going away (@free_local_storage), detach the storage from its
 * owner.
 *
 * With pin_owner == true (the map_free path of
 * bpf_selem_unlink_nofail()), owner_refcnt is taken around the
 * uncharge; if it is already zero, the destroy path has taken over the
 * accounting (see bpf_local_storage_destroy()) and nothing is done.
 */
static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
						 struct bpf_local_storage_map *smap,
						 struct bpf_local_storage *local_storage,
						 bool free_local_storage, bool pin_owner)
{
	void *owner = local_storage->owner;
	u32 uncharge = smap->elem_size;

	if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
	    SDATA(selem))
		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);

	if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt))
		return;

	/* The last elem also uncharges the storage struct itself */
	uncharge += free_local_storage ? sizeof(*local_storage) : 0;
	mem_uncharge(smap, local_storage->owner, uncharge);
	local_storage->mem_charge -= uncharge;

	if (free_local_storage) {
		local_storage->owner = NULL;

		/* After this RCU_INIT, owner may be freed and cannot be used */
		RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
	}

	if (pin_owner)
		refcount_dec(&local_storage->owner_refcnt);
}
292 
/* local_storage->lock must be held and selem->local_storage == local_storage.
 * The caller must ensure selem->smap is still valid to be
 * dereferenced for its smap->elem_size and smap->cache_idx.
 *
 * Removes @selem from the storage list, does the uncharge/cache
 * bookkeeping and parks the selem on @free_selem_list for the caller to
 * free outside the lock.  Returns true when this was the last elem and
 * the caller must also free @local_storage.
 */
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
					    struct bpf_local_storage_elem *selem,
					    struct hlist_head *free_selem_list)
{
	struct bpf_local_storage_map *smap;
	bool free_local_storage;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());

	free_local_storage = hlist_is_singular_node(&selem->snode,
						    &local_storage->list);

	bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage,
					     free_local_storage, false);

	hlist_del_init_rcu(&selem->snode);

	hlist_add_head(&selem->free_node, free_selem_list);

	return free_local_storage;
}
318 
/* Link @selem into @local_storage->list and account its size into
 * local_storage->mem_charge.  Caller must hold local_storage->lock, or
 * otherwise own the storage exclusively (initial alloc path).
 */
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
				   struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map *smap;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
	local_storage->mem_charge += smap->elem_size;

	RCU_INIT_POINTER(selem->local_storage, local_storage);
	hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
330 
/* Unlink @selem from its map bucket under the bucket lock.
 * Returns 0 on success, or the error from raw_res_spin_lock_irqsave()
 * if the bucket lock cannot be acquired.
 */
static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map *smap;
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	local_storage = rcu_dereference_check(selem->local_storage,
					      bpf_rcu_lock_held());
	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
	b = select_bucket(smap, local_storage);
	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		return err;

	hlist_del_init_rcu(&selem->map_node);
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
}
352 
/* Unlink @selem from its map bucket; caller already holds the bucket lock. */
static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
{
	hlist_del_init_rcu(&selem->map_node);
}
357 
/* Link @selem into the map bucket chosen by @local_storage under the
 * bucket lock.  Returns 0 on success, or the error from
 * raw_res_spin_lock_irqsave().
 */
int bpf_selem_link_map(struct bpf_local_storage_map *smap,
		       struct bpf_local_storage *local_storage,
		       struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	b = select_bucket(smap, local_storage);

	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		return err;

	hlist_add_head_rcu(&selem->map_node, &b->list);
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
}
377 
/* Link @selem into bucket @b; caller already holds b->lock. */
static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
				      struct bpf_local_storage_elem *selem)
{
	hlist_add_head_rcu(&selem->map_node, &b->list);
}
383 
/*
 * Unlink an selem from map and local storage with lock held.
 * This is the common path used by local storages to delete an selem.
 *
 * Returns 0 on success (including when the selem was already unlinked),
 * or an rqspinlock error if a lock could not be taken, in which case
 * the selem is left in place.
 */
int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage *local_storage;
	bool free_local_storage = false;
	HLIST_HEAD(selem_free_list);
	unsigned long flags;
	int err;

	if (unlikely(!selem_linked_to_storage_lockless(selem)))
		/* selem has already been unlinked from sk */
		return 0;

	local_storage = rcu_dereference_check(selem->local_storage,
					      bpf_rcu_lock_held());

	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		return err;

	/* Recheck under the lock: a parallel unlink may have won the race */
	if (likely(selem_linked_to_storage(selem))) {
		/* Always unlink from map before unlinking from local_storage
		 * because selem will be freed after successfully unlinked from
		 * the local_storage.
		 */
		err = bpf_selem_unlink_map(selem);
		if (err)
			goto out;

		free_local_storage = bpf_selem_unlink_storage_nolock(
			local_storage, selem, &selem_free_list);
	}
out:
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);

	/* Free outside the lock, after the full RCU Tasks Trace grace period */
	bpf_selem_free_list(&selem_free_list, false);

	if (free_local_storage)
		bpf_local_storage_free(local_storage, false);

	return err;
}
429 
/*
 * Unlink an selem from map and local storage with lockless fallback if callers
 * are racing or rqspinlock returns error. It should only be called by
 * bpf_local_storage_destroy() or bpf_local_storage_map_free().
 *
 * @b is non-NULL only on the map_free() path (the caller already knows
 * the bucket); destroy() passes NULL and the bucket is derived from
 * selem->smap.  The selem is freed once both sides (map and storage)
 * have been unlinked, tracked via selem->state when racing.
 */
static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
				    struct bpf_local_storage_map_bucket *b)
{
	bool in_map_free = !!b, free_storage = false;
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map *smap;
	unsigned long flags;
	int err, unlink = 0;

	local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());

	if (smap) {
		b = b ? : select_bucket(smap, local_storage);
		err = raw_res_spin_lock_irqsave(&b->lock, flags);
		if (!err) {
			/*
			 * Call bpf_obj_free_fields() under b->lock to make sure it is done
			 * exactly once for an selem. Safe to free special fields immediately
			 * as no BPF program should be referencing the selem.
			 */
			if (likely(selem_linked_to_map(selem))) {
				hlist_del_init_rcu(&selem->map_node);
				bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
				unlink++;
			}
			raw_res_spin_unlock_irqrestore(&b->lock, flags);
		}
		/*
		 * Highly unlikely scenario: resource leak
		 *
		 * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
		 * and both selem belong to the same bucket, if destroy(selem2) acquired
		 * b->lock and block for too long, neither map_free(selem1) and
		 * destroy(selem1) will be able to free the special field associated
		 * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
		 */
		WARN_ON_ONCE(err && in_map_free);
		if (!err || in_map_free)
			RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
	}

	if (local_storage) {
		err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
		if (!err) {
			if (likely(selem_linked_to_storage(selem))) {
				free_storage = hlist_is_singular_node(&selem->snode,
								      &local_storage->list);
				 /*
				  * Okay to skip clearing owner_storage and storage->owner in
				  * destroy() since the owner is going away. No user or bpf
				  * programs should be able to reference it.
				  */
				if (smap && in_map_free)
					bpf_selem_unlink_storage_nolock_misc(
						selem, smap, local_storage,
						free_storage, true);
				hlist_del_init_rcu(&selem->snode);
				unlink++;
			}
			raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
		}
		if (!err || !in_map_free)
			RCU_INIT_POINTER(selem->local_storage, NULL);
	}

	/* Record which side this call managed to unlink, for the race below */
	if (unlink != 2)
		atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state);

	/*
	 * Normally, an selem can be unlinked under local_storage->lock and b->lock, and
	 * then freed after an RCU grace period. However, if destroy() and map_free() are
	 * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
	 * the selem only after both map_free() and destroy() see the selem.
	 */
	if (unlink == 2 ||
	    atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED)
		bpf_selem_free(selem, true);

	if (free_storage)
		bpf_local_storage_free(local_storage, true);
}
517 
/* Publish @selem's sdata into @local_storage's per-map lookup cache slot.
 * Best effort: silently skipped if the storage lock cannot be taken or
 * the selem has been unlinked meanwhile.
 */
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
				      struct bpf_local_storage_map *smap,
				      struct bpf_local_storage_elem *selem)
{
	unsigned long flags;
	int err;

	/* spinlock is needed to avoid racing with the
	 * parallel delete.  Otherwise, publishing an already
	 * deleted sdata to the cache will become a use-after-free
	 * problem in the next bpf_local_storage_lookup().
	 */
	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		return;

	if (selem_linked_to_storage(selem))
		rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
}
538 
539 static int check_flags(const struct bpf_local_storage_data *old_sdata,
540 		       u64 map_flags)
541 {
542 	if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
543 		/* elem already exists */
544 		return -EEXIST;
545 
546 	if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
547 		/* elem doesn't exist, cannot update it */
548 		return -ENOENT;
549 
550 	return 0;
551 }
552 
/* Allocate the per-owner bpf_local_storage, link @first_selem into it
 * and publish it to the owner via cmpxchg.
 *
 * On failure the storage (if allocated) is freed and the storage-size
 * charge is reverted; the caller still owns @first_selem and must free
 * and uncharge it (see bpf_local_storage_update()).
 *
 * Returns 0 on success, -ENOMEM, -EAGAIN (lost the publish race) or an
 * rqspinlock error.
 */
int bpf_local_storage_alloc(void *owner,
			    struct bpf_local_storage_map *smap,
			    struct bpf_local_storage_elem *first_selem,
			    gfp_t gfp_flags)
{
	struct bpf_local_storage *prev_storage, *storage;
	struct bpf_local_storage **owner_storage_ptr;
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	err = mem_charge(smap, owner, sizeof(*storage));
	if (err)
		return err;

	if (smap->use_kmalloc_nolock)
		storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
						 __GFP_ZERO, NUMA_NO_NODE);
	else
		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
					  gfp_flags | __GFP_NOWARN);
	if (!storage) {
		err = -ENOMEM;
		goto uncharge;
	}

	INIT_HLIST_HEAD(&storage->list);
	raw_res_spin_lock_init(&storage->lock);
	storage->owner = owner;
	storage->mem_charge = sizeof(*storage);
	storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
	refcount_set(&storage->owner_refcnt, 1);

	/* No lock needed: the storage is not published yet */
	bpf_selem_link_storage_nolock(storage, first_selem);

	b = select_bucket(smap, storage);
	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		goto uncharge;

	bpf_selem_link_map_nolock(b, first_selem);

	owner_storage_ptr =
		(struct bpf_local_storage **)owner_storage(smap, owner);
	/* Publish storage to the owner.
	 * Instead of using any lock of the kernel object (i.e. owner),
	 * cmpxchg will work with any kernel object regardless what
	 * the running context is, bh, irq...etc.
	 *
	 * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
	 * is protected by the storage->lock.  Hence, when freeing
	 * the owner->storage, the storage->lock must be held before
	 * setting owner->storage ptr to NULL.
	 */
	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
	if (unlikely(prev_storage)) {
		/* Lost the race: another storage was published first */
		bpf_selem_unlink_map_nolock(first_selem);
		raw_res_spin_unlock_irqrestore(&b->lock, flags);
		err = -EAGAIN;
		goto uncharge;
	}
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;

uncharge:
	bpf_local_storage_free(storage, true);
	mem_uncharge(smap, owner, sizeof(*storage));
	return err;
}
623 
/* sk cannot be going away because it is linking new elem
 * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
 * Otherwise, it will become a leak (and other memory issues
 * during map destruction).
 */
/* Create or update @owner's element of @smap with @value.
 *
 * Handles: very first elem (allocates the storage), BPF_F_LOCK inline
 * update, and the general replace path which links the new selem before
 * unlinking the old one so readers always see a value.
 *
 * Returns the live sdata on success, or ERR_PTR on invalid flags,
 * -ENOMEM, -EAGAIN (storage concurrently going away) or lock errors.
 */
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
			 void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
{
	struct bpf_local_storage_data *old_sdata = NULL;
	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map_bucket *b;
	HLIST_HEAD(old_selem_free_list);
	unsigned long flags, b_flags;
	int err;

	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
	    /* BPF_F_LOCK can only be used in a value with spin_lock */
	    unlikely((map_flags & BPF_F_LOCK) &&
		     !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
		return ERR_PTR(-EINVAL);

	if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
		return ERR_PTR(-EINVAL);

	local_storage = rcu_dereference_check(*owner_storage(smap, owner),
					      bpf_rcu_lock_held());
	if (!local_storage || hlist_empty(&local_storage->list)) {
		/* Very first elem for the owner */
		err = check_flags(NULL, map_flags);
		if (err)
			return ERR_PTR(err);

		selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
		if (!selem)
			return ERR_PTR(-ENOMEM);

		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
		if (err) {
			bpf_selem_free(selem, true);
			mem_uncharge(smap, owner, smap->elem_size);
			return ERR_PTR(err);
		}

		return SDATA(selem);
	}

	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
		/* Hoping to find an old_sdata to do inline update
		 * such that it can avoid taking the local_storage->lock
		 * and changing the lists.
		 */
		old_sdata =
			bpf_local_storage_lookup(local_storage, smap, false);
		err = check_flags(old_sdata, map_flags);
		if (err)
			return ERR_PTR(err);
		if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
			copy_map_value_locked(&smap->map, old_sdata->data,
					      value, false);
			return old_sdata;
		}
	}

	/* A lookup has just been done before and concluded a new selem is
	 * needed. The chance of an unnecessary alloc is unlikely.
	 */
	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
	if (!alloc_selem)
		return ERR_PTR(-ENOMEM);

	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		goto free_selem;

	/* Recheck local_storage->list under local_storage->lock */
	if (unlikely(hlist_empty(&local_storage->list))) {
		/* A parallel del is happening and local_storage is going
		 * away.  It has just been checked before, so very
		 * unlikely.  Return instead of retry to keep things
		 * simple.
		 */
		err = -EAGAIN;
		goto unlock;
	}

	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
	err = check_flags(old_sdata, map_flags);
	if (err)
		goto unlock;

	if (old_sdata && (map_flags & BPF_F_LOCK)) {
		copy_map_value_locked(&smap->map, old_sdata->data, value,
				      false);
		selem = SELEM(old_sdata);
		goto unlock;
	}

	b = select_bucket(smap, local_storage);

	err = raw_res_spin_lock_irqsave(&b->lock, b_flags);
	if (err)
		goto unlock;

	/* From here on the new selem is committed; nothing to free on exit */
	alloc_selem = NULL;
	/* First, link the new selem to the map */
	bpf_selem_link_map_nolock(b, selem);

	/* Second, link (and publish) the new selem to local_storage */
	bpf_selem_link_storage_nolock(local_storage, selem);

	/* Third, remove old selem, SELEM(old_sdata) */
	if (old_sdata) {
		bpf_selem_unlink_map_nolock(SELEM(old_sdata));
		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
						&old_selem_free_list);
	}

	raw_res_spin_unlock_irqrestore(&b->lock, b_flags);
unlock:
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
free_selem:
	bpf_selem_free_list(&old_selem_free_list, false);
	if (alloc_selem) {
		/* The new selem was never linked: revert alloc + charge */
		mem_uncharge(smap, owner, smap->elem_size);
		bpf_selem_free(alloc_selem, true);
	}
	return err ? ERR_PTR(err) : SDATA(selem);
}
755 
756 static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
757 {
758 	u64 min_usage = U64_MAX;
759 	u16 i, res = 0;
760 
761 	spin_lock(&cache->idx_lock);
762 
763 	for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
764 		if (cache->idx_usage_counts[i] < min_usage) {
765 			min_usage = cache->idx_usage_counts[i];
766 			res = i;
767 
768 			/* Found a free cache_idx */
769 			if (!min_usage)
770 				break;
771 		}
772 	}
773 	cache->idx_usage_counts[res]++;
774 
775 	spin_unlock(&cache->idx_lock);
776 
777 	return res;
778 }
779 
/* Drop the usage count of cache slot @idx taken by
 * bpf_local_storage_cache_idx_get().
 */
static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
					     u16 idx)
{
	spin_lock(&cache->idx_lock);
	cache->idx_usage_counts[idx]--;
	spin_unlock(&cache->idx_lock);
}
787 
788 int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
789 {
790 	if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
791 	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
792 	    attr->max_entries ||
793 	    attr->key_size != sizeof(int) || !attr->value_size ||
794 	    /* Enforce BTF for userspace sk dumping */
795 	    !attr->btf_key_type_id || !attr->btf_value_type_id)
796 		return -EINVAL;
797 
798 	if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
799 		return -E2BIG;
800 
801 	return 0;
802 }
803 
/* BTF check for local storage maps: the key must be a 32-bit integer
 * (matches key_size == sizeof(int) in alloc_check); any value type is
 * accepted here.
 */
int bpf_local_storage_map_check_btf(struct bpf_map *map,
				    const struct btf *btf,
				    const struct btf_type *key_type,
				    const struct btf_type *value_type)
{
	if (!btf_type_is_i32(key_type))
		return -EINVAL;

	return 0;
}
814 
/*
 * Destroy local storage when the owner is going away. Caller must uncharge memory
 * if memory charging is used.
 *
 * Returns the remaining local_storage->mem_charge for the caller to
 * uncharge.
 */
u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
	struct bpf_local_storage_elem *selem;

	/* Neither the bpf_prog nor the bpf_map's syscall
	 * could be modifying the local_storage->list now.
	 * Thus, no elem can be added to or deleted from the
	 * local_storage->list by the bpf_prog or by the bpf_map's syscall.
	 *
	 * It is racing with bpf_local_storage_map_free() alone
	 * when unlinking elem from the local_storage->list and
	 * the map's bucket->list.
	 */
	hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
		bpf_selem_unlink_nofail(selem, NULL);

	/* Wait for any map_free() path still holding a pin
	 * (see bpf_selem_unlink_storage_nolock_misc()) to finish its
	 * uncharge accounting before reading mem_charge below.
	 */
	if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
		while (refcount_read(&local_storage->owner_refcnt))
			cpu_relax();
		/*
		 * Paired with refcount_dec() in bpf_selem_unlink_nofail()
		 * to make sure destroy() sees the correct local_storage->mem_charge.
		 */
		smp_mb();
	}

	return local_storage->mem_charge;
}
847 
848 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
849 {
850 	struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
851 	u64 usage = sizeof(*smap);
852 
853 	/* The dynamically callocated selems are not counted currently. */
854 	usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
855 	return usage;
856 }
857 
/* Allocate a local storage map: the smap struct, its per-CPU-scaled
 * bucket array, and a lookup cache slot from @cache.
 * Returns the embedded &smap->map or ERR_PTR(-ENOMEM).
 */
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
			    struct bpf_local_storage_cache *cache,
			    bool use_kmalloc_nolock)
{
	struct bpf_local_storage_map *smap;
	unsigned int i;
	u32 nbuckets;
	int err;

	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
	if (!smap)
		return ERR_PTR(-ENOMEM);
	bpf_map_init_from_attr(&smap->map, attr);

	/* Scale bucket count with the number of possible CPUs */
	nbuckets = roundup_pow_of_two(num_possible_cpus());
	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
	nbuckets = max_t(u32, 2, nbuckets);
	smap->bucket_log = ilog2(nbuckets);

	smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
					 sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
	if (!smap->buckets) {
		err = -ENOMEM;
		goto free_smap;
	}

	for (i = 0; i < nbuckets; i++) {
		INIT_HLIST_HEAD(&smap->buckets[i].list);
		raw_res_spin_lock_init(&smap->buckets[i].lock);
	}

	/* Element = header + inline value of attr->value_size bytes */
	smap->elem_size = offsetof(struct bpf_local_storage_elem,
				   sdata.data[attr->value_size]);

	/* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
	 * preemptible context. Thus, enforce all storages to use
	 * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
	 */
	smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;

	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
	return &smap->map;

free_smap:
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
907 
/* Tear down a local storage map: unlink and free every remaining selem
 * in every bucket, then wait for all outstanding RCU callbacks before
 * freeing the bucket array and the smap itself.
 */
void bpf_local_storage_map_free(struct bpf_map *map,
				struct bpf_local_storage_cache *cache)
{
	struct bpf_local_storage_map_bucket *b;
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;
	unsigned int i;

	smap = (struct bpf_local_storage_map *)map;
	bpf_local_storage_cache_idx_free(cache, smap->cache_idx);

	/* Note that this map might be concurrently cloned from
	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
	 * RCU read section to finish before proceeding. New RCU
	 * read sections should be prevented via bpf_map_inc_not_zero.
	 */
	synchronize_rcu();

	/* bpf prog and the userspace can no longer access this map
	 * now.  No new selem (of this map) can be added
	 * to the owner->storage or to the map bucket's list.
	 *
	 * The elem of this map can be cleaned up here
	 * or when the storage is freed e.g.
	 * by bpf_sk_storage_free() during __sk_destruct().
	 */
	for (i = 0; i < (1U << smap->bucket_log); i++) {
		b = &smap->buckets[i];

		rcu_read_lock();
		/* No one is adding to b->list now */
restart:
		hlist_for_each_entry_rcu(selem, &b->list, map_node) {
			bpf_selem_unlink_nofail(selem, b);

			/* Restart from the list head after rescheduling:
			 * already-unlinked selems are gone from b->list.
			 */
			if (need_resched()) {
				cond_resched_rcu();
				goto restart;
			}
		}
		rcu_read_unlock();
	}

	/* While freeing the storage we may still need to access the map.
	 *
	 * e.g. when bpf_sk_storage_free() has unlinked selem from the map
	 * which then made the above while((selem = ...)) loop
	 * exit immediately.
	 *
	 * However, while freeing the storage one still needs to access the
	 * smap->elem_size to do the uncharging in
	 * bpf_selem_unlink_storage_nolock().
	 *
	 * Hence, wait another rcu grace period for the storage to be freed.
	 */
	synchronize_rcu();

	/* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */
	rcu_barrier_tasks_trace();
	rcu_barrier();
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
}
971