1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2019 Facebook */
3 #include <linux/rculist.h>
4 #include <linux/list.h>
5 #include <linux/hash.h>
6 #include <linux/types.h>
7 #include <linux/spinlock.h>
8 #include <linux/bpf.h>
9 #include <linux/btf_ids.h>
10 #include <linux/bpf_local_storage.h>
11 #include <net/sock.h>
12 #include <uapi/linux/sock_diag.h>
13 #include <uapi/linux/btf.h>
14 #include <linux/rcupdate.h>
15 #include <linux/rcupdate_trace.h>
16 #include <linux/rcupdate_wait.h>
17
18 #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
19
20 static struct bpf_local_storage_map_bucket *
select_bucket(struct bpf_local_storage_map * smap,struct bpf_local_storage * local_storage)21 select_bucket(struct bpf_local_storage_map *smap,
22 struct bpf_local_storage *local_storage)
23 {
24 return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)];
25 }
26
/*
 * Charge @size bytes against @owner through the map's charge callback.
 * Maps without a charge callback always succeed (return 0).
 */
static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
{
	const struct bpf_map_ops *ops = smap->map.ops;

	return ops->map_local_storage_charge ?
	       ops->map_local_storage_charge(smap, owner, size) : 0;
}
36
/* Revert a previous mem_charge() of @size bytes against @owner, if supported. */
static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
			 u32 size)
{
	const struct bpf_map_ops *ops = smap->map.ops;

	if (!ops->map_local_storage_uncharge)
		return;

	ops->map_local_storage_uncharge(smap, owner, size);
}
45
46 static struct bpf_local_storage __rcu **
owner_storage(struct bpf_local_storage_map * smap,void * owner)47 owner_storage(struct bpf_local_storage_map *smap, void *owner)
48 {
49 struct bpf_map *map = &smap->map;
50
51 return map->ops->map_owner_storage_ptr(owner);
52 }
53
/* Lockless check: is @selem still on a local_storage->list? */
static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed_lockless(&selem->snode);
}
58
/* Is @selem on a local_storage->list? Caller holds local_storage->lock. */
static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->snode);
}
63
/* Is @selem on a map bucket's list? Caller holds the bucket lock. */
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->map_node);
}
68
69 struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map * smap,void * owner,void * value,bool swap_uptrs,gfp_t gfp_flags)70 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
71 void *value, bool swap_uptrs, gfp_t gfp_flags)
72 {
73 struct bpf_local_storage_elem *selem;
74
75 if (mem_charge(smap, owner, smap->elem_size))
76 return NULL;
77
78 if (smap->use_kmalloc_nolock) {
79 selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
80 __GFP_ZERO, NUMA_NO_NODE);
81 } else {
82 selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
83 gfp_flags | __GFP_NOWARN);
84 }
85
86 if (selem) {
87 RCU_INIT_POINTER(SDATA(selem)->smap, smap);
88 atomic_set(&selem->state, 0);
89 selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
90
91 if (value) {
92 /* No need to call check_and_init_map_value as memory is zero init */
93 copy_map_value(&smap->map, SDATA(selem)->data, value);
94 if (swap_uptrs)
95 bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
96 }
97 return selem;
98 }
99
100 mem_uncharge(smap, owner, smap->elem_size);
101
102 return NULL;
103 }
104
105 /* rcu tasks trace callback for use_kmalloc_nolock == false */
__bpf_local_storage_free_trace_rcu(struct rcu_head * rcu)106 static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
107 {
108 struct bpf_local_storage *local_storage;
109
110 /*
111 * RCU Tasks Trace grace period implies RCU grace period, do
112 * kfree() directly.
113 */
114 local_storage = container_of(rcu, struct bpf_local_storage, rcu);
115 kfree(local_storage);
116 }
117
118 /* Handle use_kmalloc_nolock == false */
__bpf_local_storage_free(struct bpf_local_storage * local_storage,bool vanilla_rcu)119 static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
120 bool vanilla_rcu)
121 {
122 if (vanilla_rcu)
123 kfree_rcu(local_storage, rcu);
124 else
125 call_rcu_tasks_trace(&local_storage->rcu,
126 __bpf_local_storage_free_trace_rcu);
127 }
128
/* RCU callback for use_kmalloc_nolock == true: free with kfree_nolock(). */
static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage *local_storage;

	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
	kfree_nolock(local_storage);
}
136
/* RCU Tasks Trace callback for use_kmalloc_nolock == true. */
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
	/*
	 * RCU Tasks Trace grace period implies RCU grace period, do
	 * kfree() directly.
	 */
	bpf_local_storage_free_rcu(rcu);
}
145
/*
 * Free @local_storage after the grace period(s) matching its allocator and
 * @reuse_now.  When @reuse_now is false an RCU Tasks Trace grace period is
 * awaited as well, so sleepable bpf progs cannot still touch the memory.
 * NULL is a no-op.
 */
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
				   bool reuse_now)
{
	if (!local_storage)
		return;

	if (!local_storage->use_kmalloc_nolock) {
		__bpf_local_storage_free(local_storage, reuse_now);
		return;
	}

	if (reuse_now) {
		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
		return;
	}

	call_rcu_tasks_trace(&local_storage->rcu,
			     bpf_local_storage_free_trace_rcu);
}
165
166 /* rcu callback for use_kmalloc_nolock == false */
__bpf_selem_free_rcu(struct rcu_head * rcu)167 static void __bpf_selem_free_rcu(struct rcu_head *rcu)
168 {
169 struct bpf_local_storage_elem *selem;
170 struct bpf_local_storage_map *smap;
171
172 selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
173 /* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */
174 smap = rcu_dereference_check(SDATA(selem)->smap, 1);
175
176 if (smap)
177 bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
178 kfree(selem);
179 }
180
181 /* rcu tasks trace callback for use_kmalloc_nolock == false */
__bpf_selem_free_trace_rcu(struct rcu_head * rcu)182 static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
183 {
184 /*
185 * RCU Tasks Trace grace period implies RCU grace period, do
186 * kfree() directly.
187 */
188 __bpf_selem_free_rcu(rcu);
189 }
190
191 /* Handle use_kmalloc_nolock == false */
__bpf_selem_free(struct bpf_local_storage_elem * selem,bool vanilla_rcu)192 static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
193 bool vanilla_rcu)
194 {
195 if (vanilla_rcu)
196 call_rcu(&selem->rcu, __bpf_selem_free_rcu);
197 else
198 call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
199 }
200
/* RCU callback for use_kmalloc_nolock == true: free fields, then the elem. */
static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;

	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
	/* The bpf_local_storage_map_free will wait for rcu_barrier */
	smap = rcu_dereference_check(SDATA(selem)->smap, 1);

	if (smap)
		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
	kfree_nolock(selem);
}
214
/* RCU Tasks Trace callback for use_kmalloc_nolock == true. */
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
	/*
	 * RCU Tasks Trace grace period implies RCU grace period, do
	 * kfree() directly.
	 */
	bpf_selem_free_rcu(rcu);
}
223
/*
 * Schedule @selem to be freed after the appropriate grace period(s).
 * @reuse_now: true when no sleepable bpf prog can still hold a reference,
 * so a plain RCU grace period suffices.
 */
void bpf_selem_free(struct bpf_local_storage_elem *selem,
		    bool reuse_now)
{
	if (!selem->use_kmalloc_nolock) {
		/*
		 * No uptr will be unpin even when reuse_now == false since uptr
		 * is only supported in task local storage, where
		 * smap->use_kmalloc_nolock == true.
		 */
		__bpf_selem_free(selem, reuse_now);
		return;
	}

	if (reuse_now) {
		/*
		 * While it is okay to call bpf_obj_free_fields() that unpins uptr when
		 * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
		 */
		call_rcu(&selem->rcu, bpf_selem_free_rcu);
		return;
	}

	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
}
248
/* Free every selem collected (via free_node) on @list. */
static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
{
	struct bpf_local_storage_elem *selem;
	struct hlist_node *n;

	/* The "_safe" iteration is needed.
	 * The loop is not removing the selem from the list
	 * but bpf_selem_free will use the selem->rcu_head
	 * which is union-ized with the selem->free_node.
	 */
	hlist_for_each_entry_safe(selem, n, list, free_node)
		bpf_selem_free(selem, reuse_now);
}
262
/*
 * Bookkeeping when unlinking @selem from @local_storage: invalidate the
 * lookup cache slot, uncharge memory, and — when this was the last selem —
 * clear the owner's storage pointer.
 *
 * @pin_owner: take a temporary owner_refcnt reference so a concurrent
 * bpf_local_storage_destroy() does not proceed while the uncharge is in
 * flight; if the refcount already hit zero the owner is going away and the
 * uncharge/unpublish steps are skipped.
 */
static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
						 struct bpf_local_storage_map *smap,
						 struct bpf_local_storage *local_storage,
						 bool free_local_storage, bool pin_owner)
{
	void *owner = local_storage->owner;
	u32 uncharge = smap->elem_size;

	if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
	    SDATA(selem))
		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);

	if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt))
		return;

	/* The last selem also releases the storage struct itself. */
	uncharge += free_local_storage ? sizeof(*local_storage) : 0;
	mem_uncharge(smap, local_storage->owner, uncharge);
	local_storage->mem_charge -= uncharge;

	if (free_local_storage) {
		local_storage->owner = NULL;

		/* After this RCU_INIT, owner may be freed and cannot be used */
		RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
	}

	if (pin_owner)
		refcount_dec(&local_storage->owner_refcnt);
}
292
/* local_storage->lock must be held and selem->local_storage == local_storage.
 * The caller must ensure selem->smap is still valid to be
 * dereferenced for its smap->elem_size and smap->cache_idx.
 *
 * The selem is moved onto @free_selem_list for the caller to free after
 * dropping the lock.  Returns true when this was the last selem, i.e. the
 * caller must also free the local_storage itself.
 */
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
					    struct bpf_local_storage_elem *selem,
					    struct hlist_head *free_selem_list)
{
	struct bpf_local_storage_map *smap;
	bool free_local_storage;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());

	free_local_storage = hlist_is_singular_node(&selem->snode,
						    &local_storage->list);

	bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage,
					     free_local_storage, false);

	hlist_del_init_rcu(&selem->snode);

	/* Defer the actual free until the caller drops local_storage->lock. */
	hlist_add_head(&selem->free_node, free_selem_list);

	return free_local_storage;
}
318
/*
 * Link @selem into @local_storage->list and account its charge.
 * Caller holds local_storage->lock (or owns the storage exclusively).
 */
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
				   struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map *smap;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
	local_storage->mem_charge += smap->elem_size;

	RCU_INIT_POINTER(selem->local_storage, local_storage);
	hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
330
/*
 * Unlink @selem from its map bucket under the bucket lock.
 * Returns 0, or the rqspinlock error (e.g. deadlock/timeout) without
 * having unlinked anything.
 */
static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map *smap;
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	local_storage = rcu_dereference_check(selem->local_storage,
					      bpf_rcu_lock_held());
	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
	b = select_bucket(smap, local_storage);
	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		return err;

	hlist_del_init_rcu(&selem->map_node);
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
}
352
/* Unlink @selem from its map bucket; caller already holds the bucket lock. */
static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
{
	hlist_del_init_rcu(&selem->map_node);
}
357
/*
 * Link @selem into the map bucket chosen for @local_storage.
 * Returns 0, or the rqspinlock error without having linked anything.
 */
int bpf_selem_link_map(struct bpf_local_storage_map *smap,
		       struct bpf_local_storage *local_storage,
		       struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	b = select_bucket(smap, local_storage);

	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		return err;

	hlist_add_head_rcu(&selem->map_node, &b->list);
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
}
377
/* Link @selem into bucket @b; caller already holds b->lock. */
static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
				      struct bpf_local_storage_elem *selem)
{
	hlist_add_head_rcu(&selem->map_node, &b->list);
}
383
384 /*
385 * Unlink an selem from map and local storage with lock held.
386 * This is the common path used by local storages to delete an selem.
387 */
bpf_selem_unlink(struct bpf_local_storage_elem * selem)388 int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
389 {
390 struct bpf_local_storage *local_storage;
391 bool free_local_storage = false;
392 HLIST_HEAD(selem_free_list);
393 unsigned long flags;
394 int err;
395
396 if (unlikely(!selem_linked_to_storage_lockless(selem)))
397 /* selem has already been unlinked from sk */
398 return 0;
399
400 local_storage = rcu_dereference_check(selem->local_storage,
401 bpf_rcu_lock_held());
402
403 err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
404 if (err)
405 return err;
406
407 if (likely(selem_linked_to_storage(selem))) {
408 /* Always unlink from map before unlinking from local_storage
409 * because selem will be freed after successfully unlinked from
410 * the local_storage.
411 */
412 err = bpf_selem_unlink_map(selem);
413 if (err)
414 goto out;
415
416 free_local_storage = bpf_selem_unlink_storage_nolock(
417 local_storage, selem, &selem_free_list);
418 }
419 out:
420 raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
421
422 bpf_selem_free_list(&selem_free_list, false);
423
424 if (free_local_storage)
425 bpf_local_storage_free(local_storage, false);
426
427 return err;
428 }
429
430 /*
431 * Unlink an selem from map and local storage with lockless fallback if callers
432 * are racing or rqspinlock returns error. It should only be called by
433 * bpf_local_storage_destroy() or bpf_local_storage_map_free().
434 */
bpf_selem_unlink_nofail(struct bpf_local_storage_elem * selem,struct bpf_local_storage_map_bucket * b)435 static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
436 struct bpf_local_storage_map_bucket *b)
437 {
438 bool in_map_free = !!b, free_storage = false;
439 struct bpf_local_storage *local_storage;
440 struct bpf_local_storage_map *smap;
441 unsigned long flags;
442 int err, unlink = 0;
443
444 local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
445 smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
446
447 if (smap) {
448 b = b ? : select_bucket(smap, local_storage);
449 err = raw_res_spin_lock_irqsave(&b->lock, flags);
450 if (!err) {
451 /*
452 * Call bpf_obj_free_fields() under b->lock to make sure it is done
453 * exactly once for an selem. Safe to free special fields immediately
454 * as no BPF program should be referencing the selem.
455 */
456 if (likely(selem_linked_to_map(selem))) {
457 hlist_del_init_rcu(&selem->map_node);
458 bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
459 unlink++;
460 }
461 raw_res_spin_unlock_irqrestore(&b->lock, flags);
462 }
463 /*
464 * Highly unlikely scenario: resource leak
465 *
466 * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
467 * and both selem belong to the same bucket, if destroy(selem2) acquired
468 * b->lock and block for too long, neither map_free(selem1) and
469 * destroy(selem1) will be able to free the special field associated
470 * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
471 */
472 WARN_ON_ONCE(err && in_map_free);
473 if (!err || in_map_free)
474 RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
475 }
476
477 if (local_storage) {
478 err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
479 if (!err) {
480 if (likely(selem_linked_to_storage(selem))) {
481 free_storage = hlist_is_singular_node(&selem->snode,
482 &local_storage->list);
483 /*
484 * Okay to skip clearing owner_storage and storage->owner in
485 * destroy() since the owner is going away. No user or bpf
486 * programs should be able to reference it.
487 */
488 if (smap && in_map_free)
489 bpf_selem_unlink_storage_nolock_misc(
490 selem, smap, local_storage,
491 free_storage, true);
492 hlist_del_init_rcu(&selem->snode);
493 unlink++;
494 }
495 raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
496 }
497 if (!err || !in_map_free)
498 RCU_INIT_POINTER(selem->local_storage, NULL);
499 }
500
501 if (unlink != 2)
502 atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state);
503
504 /*
505 * Normally, an selem can be unlinked under local_storage->lock and b->lock, and
506 * then freed after an RCU grace period. However, if destroy() and map_free() are
507 * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
508 * the selem only after both map_free() and destroy() see the selem.
509 */
510 if (unlink == 2 ||
511 atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED)
512 bpf_selem_free(selem, true);
513
514 if (free_storage)
515 bpf_local_storage_free(local_storage, true);
516 }
517
/*
 * Publish @selem into the per-storage lookup cache slot for @smap.
 * Best effort: silently skipped if the storage lock cannot be taken or the
 * selem has been unlinked in the meantime.
 */
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
				      struct bpf_local_storage_map *smap,
				      struct bpf_local_storage_elem *selem)
{
	unsigned long flags;
	int err;

	/* spinlock is needed to avoid racing with the
	 * parallel delete. Otherwise, publishing an already
	 * deleted sdata to the cache will become a use-after-free
	 * problem in the next bpf_local_storage_lookup().
	 */
	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		return;

	if (selem_linked_to_storage(selem))
		rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
}
538
/*
 * Validate BPF_EXIST/BPF_NOEXIST update flags against whether an element
 * already exists (@old_sdata != NULL).
 */
static int check_flags(const struct bpf_local_storage_data *old_sdata,
		       u64 map_flags)
{
	u64 op = map_flags & ~BPF_F_LOCK;

	if (op == BPF_NOEXIST && old_sdata)
		return -EEXIST;		/* elem already exists */

	if (op == BPF_EXIST && !old_sdata)
		return -ENOENT;		/* elem doesn't exist, cannot update it */

	return 0;
}
552
/*
 * Allocate a bpf_local_storage for @owner, link @first_selem into it, and
 * publish it to the owner via cmpxchg.  On any failure the storage (and its
 * charge) is released; the caller remains responsible for @first_selem.
 * Returns 0, -ENOMEM, -EAGAIN (lost the publish race), or a lock error.
 */
int bpf_local_storage_alloc(void *owner,
			    struct bpf_local_storage_map *smap,
			    struct bpf_local_storage_elem *first_selem,
			    gfp_t gfp_flags)
{
	struct bpf_local_storage *prev_storage, *storage;
	struct bpf_local_storage **owner_storage_ptr;
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	err = mem_charge(smap, owner, sizeof(*storage));
	if (err)
		return err;

	if (smap->use_kmalloc_nolock)
		storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
						 __GFP_ZERO, NUMA_NO_NODE);
	else
		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
					  gfp_flags | __GFP_NOWARN);
	if (!storage) {
		err = -ENOMEM;
		goto uncharge;
	}

	INIT_HLIST_HEAD(&storage->list);
	raw_res_spin_lock_init(&storage->lock);
	storage->owner = owner;
	storage->mem_charge = sizeof(*storage);
	storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
	refcount_set(&storage->owner_refcnt, 1);

	/* No lock needed: the storage is not yet visible to anyone else. */
	bpf_selem_link_storage_nolock(storage, first_selem);

	b = select_bucket(smap, storage);
	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		goto uncharge;

	bpf_selem_link_map_nolock(b, first_selem);

	owner_storage_ptr =
		(struct bpf_local_storage **)owner_storage(smap, owner);
	/* Publish storage to the owner.
	 * Instead of using any lock of the kernel object (i.e. owner),
	 * cmpxchg will work with any kernel object regardless what
	 * the running context is, bh, irq...etc.
	 *
	 * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
	 * is protected by the storage->lock. Hence, when freeing
	 * the owner->storage, the storage->lock must be held before
	 * setting owner->storage ptr to NULL.
	 */
	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
	if (unlikely(prev_storage)) {
		/* Lost the race: someone else installed a storage first. */
		bpf_selem_unlink_map_nolock(first_selem);
		raw_res_spin_unlock_irqrestore(&b->lock, flags);
		err = -EAGAIN;
		goto uncharge;
	}
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;

uncharge:
	/* bpf_local_storage_free() tolerates a NULL storage. */
	bpf_local_storage_free(storage, true);
	mem_uncharge(smap, owner, sizeof(*storage));
	return err;
}
623
/* sk cannot be going away because it is linking new elem
 * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
 * Otherwise, it will become a leak (and other memory issues
 * during map destruction).
 *
 * Create or update the selem of @smap for @owner with @value, honoring
 * BPF_EXIST/BPF_NOEXIST/BPF_F_LOCK in @map_flags.  Returns the resulting
 * sdata or an ERR_PTR.
 */
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
			 void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
{
	struct bpf_local_storage_data *old_sdata = NULL;
	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map_bucket *b;
	HLIST_HEAD(old_selem_free_list);
	unsigned long flags, b_flags;
	int err;

	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
	    /* BPF_F_LOCK can only be used in a value with spin_lock */
	    unlikely((map_flags & BPF_F_LOCK) &&
		     !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
		return ERR_PTR(-EINVAL);

	/* GFP_KERNEL callers may only create new elements (BPF_NOEXIST). */
	if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
		return ERR_PTR(-EINVAL);

	local_storage = rcu_dereference_check(*owner_storage(smap, owner),
					      bpf_rcu_lock_held());
	if (!local_storage || hlist_empty(&local_storage->list)) {
		/* Very first elem for the owner */
		err = check_flags(NULL, map_flags);
		if (err)
			return ERR_PTR(err);

		selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
		if (!selem)
			return ERR_PTR(-ENOMEM);

		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
		if (err) {
			bpf_selem_free(selem, true);
			mem_uncharge(smap, owner, smap->elem_size);
			return ERR_PTR(err);
		}

		return SDATA(selem);
	}

	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
		/* Hoping to find an old_sdata to do inline update
		 * such that it can avoid taking the local_storage->lock
		 * and changing the lists.
		 */
		old_sdata =
			bpf_local_storage_lookup(local_storage, smap, false);
		err = check_flags(old_sdata, map_flags);
		if (err)
			return ERR_PTR(err);
		if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
			copy_map_value_locked(&smap->map, old_sdata->data,
					      value, false);
			return old_sdata;
		}
	}

	/* A lookup has just been done before and concluded a new selem is
	 * needed. The chance of an unnecessary alloc is unlikely.
	 */
	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
	if (!alloc_selem)
		return ERR_PTR(-ENOMEM);

	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		goto free_selem;

	/* Recheck local_storage->list under local_storage->lock */
	if (unlikely(hlist_empty(&local_storage->list))) {
		/* A parallel del is happening and local_storage is going
		 * away. It has just been checked before, so very
		 * unlikely. Return instead of retry to keep things
		 * simple.
		 */
		err = -EAGAIN;
		goto unlock;
	}

	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
	err = check_flags(old_sdata, map_flags);
	if (err)
		goto unlock;

	if (old_sdata && (map_flags & BPF_F_LOCK)) {
		copy_map_value_locked(&smap->map, old_sdata->data, value,
				      false);
		selem = SELEM(old_sdata);
		goto unlock;
	}

	b = select_bucket(smap, local_storage);

	err = raw_res_spin_lock_irqsave(&b->lock, b_flags);
	if (err)
		goto unlock;

	/* Ownership transfers to the lists; don't free on the way out. */
	alloc_selem = NULL;
	/* First, link the new selem to the map */
	bpf_selem_link_map_nolock(b, selem);

	/* Second, link (and publish) the new selem to local_storage */
	bpf_selem_link_storage_nolock(local_storage, selem);

	/* Third, remove old selem, SELEM(old_sdata) */
	if (old_sdata) {
		bpf_selem_unlink_map_nolock(SELEM(old_sdata));
		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
						&old_selem_free_list);
	}

	raw_res_spin_unlock_irqrestore(&b->lock, b_flags);
unlock:
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
free_selem:
	bpf_selem_free_list(&old_selem_free_list, false);
	if (alloc_selem) {
		/* The new selem was never linked; revert charge and free it. */
		mem_uncharge(smap, owner, smap->elem_size);
		bpf_selem_free(alloc_selem, true);
	}
	return err ? ERR_PTR(err) : SDATA(selem);
}
755
bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache * cache)756 static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
757 {
758 u64 min_usage = U64_MAX;
759 u16 i, res = 0;
760
761 spin_lock(&cache->idx_lock);
762
763 for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
764 if (cache->idx_usage_counts[i] < min_usage) {
765 min_usage = cache->idx_usage_counts[i];
766 res = i;
767
768 /* Found a free cache_idx */
769 if (!min_usage)
770 break;
771 }
772 }
773 cache->idx_usage_counts[res]++;
774
775 spin_unlock(&cache->idx_lock);
776
777 return res;
778 }
779
/* Release a cache slot index obtained from bpf_local_storage_cache_idx_get(). */
static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
					     u16 idx)
{
	spin_lock(&cache->idx_lock);
	cache->idx_usage_counts[idx]--;
	spin_unlock(&cache->idx_lock);
}
787
bpf_local_storage_map_alloc_check(union bpf_attr * attr)788 int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
789 {
790 if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
791 !(attr->map_flags & BPF_F_NO_PREALLOC) ||
792 attr->max_entries ||
793 attr->key_size != sizeof(int) || !attr->value_size ||
794 /* Enforce BTF for userspace sk dumping */
795 !attr->btf_key_type_id || !attr->btf_value_type_id)
796 return -EINVAL;
797
798 if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
799 return -E2BIG;
800
801 return 0;
802 }
803
/* BTF check: local storage map keys must be 32-bit integers. */
int bpf_local_storage_map_check_btf(struct bpf_map *map,
				    const struct btf *btf,
				    const struct btf_type *key_type,
				    const struct btf_type *value_type)
{
	if (!btf_type_is_i32(key_type))
		return -EINVAL;

	return 0;
}
814
815 /*
816 * Destroy local storage when the owner is going away. Caller must uncharge memory
817 * if memory charging is used.
818 */
bpf_local_storage_destroy(struct bpf_local_storage * local_storage)819 u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
820 {
821 struct bpf_local_storage_elem *selem;
822
823 /* Neither the bpf_prog nor the bpf_map's syscall
824 * could be modifying the local_storage->list now.
825 * Thus, no elem can be added to or deleted from the
826 * local_storage->list by the bpf_prog or by the bpf_map's syscall.
827 *
828 * It is racing with bpf_local_storage_map_free() alone
829 * when unlinking elem from the local_storage->list and
830 * the map's bucket->list.
831 */
832 hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
833 bpf_selem_unlink_nofail(selem, NULL);
834
835 if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
836 while (refcount_read(&local_storage->owner_refcnt))
837 cpu_relax();
838 /*
839 * Paired with refcount_dec() in bpf_selem_unlink_nofail()
840 * to make sure destroy() sees the correct local_storage->mem_charge.
841 */
842 smp_mb();
843 }
844
845 return local_storage->mem_charge;
846 }
847
bpf_local_storage_map_mem_usage(const struct bpf_map * map)848 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
849 {
850 struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
851 u64 usage = sizeof(*smap);
852
853 /* The dynamically callocated selems are not counted currently. */
854 usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
855 return usage;
856 }
857
/*
 * Allocate and initialize a local storage map: bucket array sized by the
 * number of possible CPUs, a reserved lookup-cache slot, and the allocator
 * mode (kmalloc_nolock vs kzalloc).  Returns the embedded bpf_map or an
 * ERR_PTR.
 */
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
			    struct bpf_local_storage_cache *cache,
			    bool use_kmalloc_nolock)
{
	struct bpf_local_storage_map *smap;
	unsigned int i;
	u32 nbuckets;
	int err;

	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
	if (!smap)
		return ERR_PTR(-ENOMEM);
	bpf_map_init_from_attr(&smap->map, attr);

	nbuckets = roundup_pow_of_two(num_possible_cpus());
	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
	nbuckets = max_t(u32, 2, nbuckets);
	smap->bucket_log = ilog2(nbuckets);

	smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
					 sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
	if (!smap->buckets) {
		err = -ENOMEM;
		goto free_smap;
	}

	for (i = 0; i < nbuckets; i++) {
		INIT_HLIST_HEAD(&smap->buckets[i].list);
		raw_res_spin_lock_init(&smap->buckets[i].lock);
	}

	/* Element size = header + inline value area of attr->value_size bytes. */
	smap->elem_size = offsetof(struct bpf_local_storage_elem,
				   sdata.data[attr->value_size]);

	/* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
	 * preemptible context. Thus, enforce all storages to use
	 * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
	 */
	smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;

	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
	return &smap->map;

free_smap:
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
907
/*
 * Tear down a local storage map: release its cache slot, unlink every
 * remaining selem from all buckets, then wait for the RCU grace periods /
 * barriers needed before the bucket array and map struct can be freed.
 */
void bpf_local_storage_map_free(struct bpf_map *map,
				struct bpf_local_storage_cache *cache)
{
	struct bpf_local_storage_map_bucket *b;
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;
	unsigned int i;

	smap = (struct bpf_local_storage_map *)map;
	bpf_local_storage_cache_idx_free(cache, smap->cache_idx);

	/* Note that this map might be concurrently cloned from
	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
	 * RCU read section to finish before proceeding. New RCU
	 * read sections should be prevented via bpf_map_inc_not_zero.
	 */
	synchronize_rcu();

	/* bpf prog and the userspace can no longer access this map
	 * now. No new selem (of this map) can be added
	 * to the owner->storage or to the map bucket's list.
	 *
	 * The elem of this map can be cleaned up here
	 * or when the storage is freed e.g.
	 * by bpf_sk_storage_free() during __sk_destruct().
	 */
	for (i = 0; i < (1U << smap->bucket_log); i++) {
		b = &smap->buckets[i];

		rcu_read_lock();
		/* No one is adding to b->list now */
restart:
		hlist_for_each_entry_rcu(selem, &b->list, map_node) {
			bpf_selem_unlink_nofail(selem, b);

			/* Restart after yielding: the list may have changed. */
			if (need_resched()) {
				cond_resched_rcu();
				goto restart;
			}
		}
		rcu_read_unlock();
	}

	/* While freeing the storage we may still need to access the map.
	 *
	 * e.g. when bpf_sk_storage_free() has unlinked selem from the map
	 * which then made the above while((selem = ...)) loop
	 * exit immediately.
	 *
	 * However, while freeing the storage one still needs to access the
	 * smap->elem_size to do the uncharging in
	 * bpf_selem_unlink_storage_nolock().
	 *
	 * Hence, wait another rcu grace period for the storage to be freed.
	 */
	synchronize_rcu();

	/* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */
	rcu_barrier_tasks_trace();
	rcu_barrier();
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
}
971