12874c5fdSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later 27eb95156SPavel Emelyanov /* 37eb95156SPavel Emelyanov * inet fragments management 47eb95156SPavel Emelyanov * 57eb95156SPavel Emelyanov * Authors: Pavel Emelyanov <xemul@openvz.org> 67eb95156SPavel Emelyanov * Started as consolidation of ipv4/ip_fragment.c, 77eb95156SPavel Emelyanov * ipv6/reassembly. and ipv6 nf conntrack reassembly 87eb95156SPavel Emelyanov */ 97eb95156SPavel Emelyanov 107eb95156SPavel Emelyanov #include <linux/list.h> 117eb95156SPavel Emelyanov #include <linux/spinlock.h> 127eb95156SPavel Emelyanov #include <linux/module.h> 137eb95156SPavel Emelyanov #include <linux/timer.h> 147eb95156SPavel Emelyanov #include <linux/mm.h> 15321a3a99SPavel Emelyanov #include <linux/random.h> 161e4b8287SPavel Emelyanov #include <linux/skbuff.h> 171e4b8287SPavel Emelyanov #include <linux/rtnetlink.h> 185a0e3ad6STejun Heo #include <linux/slab.h> 190eb71a9dSNeilBrown #include <linux/rhashtable.h> 207eb95156SPavel Emelyanov 215a3da1feSHannes Frederic Sowa #include <net/sock.h> 227eb95156SPavel Emelyanov #include <net/inet_frag.h> 23be991971SHannes Frederic Sowa #include <net/inet_ecn.h> 24c23f35d1SPeter Oskolkov #include <net/ip.h> 25c23f35d1SPeter Oskolkov #include <net/ipv6.h> 26c23f35d1SPeter Oskolkov 2718685451SFlorian Westphal #include "../core/sock_destructor.h" 2818685451SFlorian Westphal 29c23f35d1SPeter Oskolkov /* Use skb->cb to track consecutive/adjacent fragments coming at 30c23f35d1SPeter Oskolkov * the end of the queue. Nodes in the rb-tree queue will 31c23f35d1SPeter Oskolkov * contain "runs" of one or more adjacent fragments. 32c23f35d1SPeter Oskolkov * 33c23f35d1SPeter Oskolkov * Invariants: 34c23f35d1SPeter Oskolkov * - next_frag is NULL at the tail of a "run"; 35c23f35d1SPeter Oskolkov * - the head of a "run" has the sum of all fragment lengths in frag_run_len. 36c23f35d1SPeter Oskolkov */ 37c23f35d1SPeter Oskolkov struct ipfrag_skb_cb { 38c23f35d1SPeter Oskolkov union { 39c23f35d1SPeter Oskolkov struct inet_skb_parm h4; 40c23f35d1SPeter Oskolkov struct inet6_skb_parm h6; 41c23f35d1SPeter Oskolkov }; 42c23f35d1SPeter Oskolkov struct sk_buff *next_frag; 43c23f35d1SPeter Oskolkov int frag_run_len; 4418685451SFlorian Westphal int ip_defrag_offset; 45c23f35d1SPeter Oskolkov }; 46c23f35d1SPeter Oskolkov 47c23f35d1SPeter Oskolkov #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) 48c23f35d1SPeter Oskolkov 49c23f35d1SPeter Oskolkov static void fragcb_clear(struct sk_buff *skb) 50c23f35d1SPeter Oskolkov { 51c23f35d1SPeter Oskolkov RB_CLEAR_NODE(&skb->rbnode); 52c23f35d1SPeter Oskolkov FRAG_CB(skb)->next_frag = NULL; 53c23f35d1SPeter Oskolkov FRAG_CB(skb)->frag_run_len = skb->len; 54c23f35d1SPeter Oskolkov } 55c23f35d1SPeter Oskolkov 56c23f35d1SPeter Oskolkov /* Append skb to the last "run". */ 57c23f35d1SPeter Oskolkov static void fragrun_append_to_last(struct inet_frag_queue *q, 58c23f35d1SPeter Oskolkov struct sk_buff *skb) 59c23f35d1SPeter Oskolkov { 60c23f35d1SPeter Oskolkov fragcb_clear(skb); 61c23f35d1SPeter Oskolkov 62c23f35d1SPeter Oskolkov FRAG_CB(q->last_run_head)->frag_run_len += skb->len; 63c23f35d1SPeter Oskolkov FRAG_CB(q->fragments_tail)->next_frag = skb; 64c23f35d1SPeter Oskolkov q->fragments_tail = skb; 65c23f35d1SPeter Oskolkov } 66c23f35d1SPeter Oskolkov 67c23f35d1SPeter Oskolkov /* Create a new "run" with the skb. */ 68c23f35d1SPeter Oskolkov static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) 69c23f35d1SPeter Oskolkov { 70c23f35d1SPeter Oskolkov BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); 71c23f35d1SPeter Oskolkov fragcb_clear(skb); 72c23f35d1SPeter Oskolkov 73c23f35d1SPeter Oskolkov if (q->last_run_head) 74c23f35d1SPeter Oskolkov rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, 75c23f35d1SPeter Oskolkov &q->last_run_head->rbnode.rb_right); 76c23f35d1SPeter Oskolkov else 77c23f35d1SPeter Oskolkov rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); 78c23f35d1SPeter Oskolkov rb_insert_color(&skb->rbnode, &q->rb_fragments); 79c23f35d1SPeter Oskolkov 80c23f35d1SPeter Oskolkov q->fragments_tail = skb; 81c23f35d1SPeter Oskolkov q->last_run_head = skb; 82c23f35d1SPeter Oskolkov } 83be991971SHannes Frederic Sowa 84be991971SHannes Frederic Sowa /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements 85be991971SHannes Frederic Sowa * Value : 0xff if frame should be dropped. 86be991971SHannes Frederic Sowa * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field 87be991971SHannes Frederic Sowa */ 88be991971SHannes Frederic Sowa const u8 ip_frag_ecn_table[16] = { 89be991971SHannes Frederic Sowa /* at least one fragment had CE, and others ECT_0 or ECT_1 */ 90be991971SHannes Frederic Sowa [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, 91be991971SHannes Frederic Sowa [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, 92be991971SHannes Frederic Sowa [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, 93be991971SHannes Frederic Sowa 94be991971SHannes Frederic Sowa /* invalid combinations : drop frame */ 95be991971SHannes Frederic Sowa [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, 96be991971SHannes Frederic Sowa [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, 97be991971SHannes Frederic Sowa [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, 98be991971SHannes Frederic Sowa [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, 99be991971SHannes Frederic Sowa [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, 100be991971SHannes Frederic Sowa [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, 101be991971SHannes Frederic Sowa [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, 102be991971SHannes Frederic Sowa }; 103be991971SHannes Frederic Sowa EXPORT_SYMBOL(ip_frag_ecn_table); 1047eb95156SPavel Emelyanov 105d4ad4d22SNikolay Aleksandrov int inet_frags_init(struct inet_frags *f) 1067eb95156SPavel Emelyanov { 107d4ad4d22SNikolay Aleksandrov f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, 108d4ad4d22SNikolay Aleksandrov NULL); 109d4ad4d22SNikolay Aleksandrov if (!f->frags_cachep) 110d4ad4d22SNikolay Aleksandrov return -ENOMEM; 111d4ad4d22SNikolay Aleksandrov 112dc93f46bSEric Dumazet refcount_set(&f->refcnt, 1); 113dc93f46bSEric Dumazet init_completion(&f->completion); 114d4ad4d22SNikolay Aleksandrov return 0; 1157eb95156SPavel Emelyanov } 1167eb95156SPavel Emelyanov EXPORT_SYMBOL(inet_frags_init); 1177eb95156SPavel Emelyanov 1187eb95156SPavel Emelyanov void inet_frags_fini(struct inet_frags *f) 1197eb95156SPavel Emelyanov { 120dc93f46bSEric Dumazet if (refcount_dec_and_test(&f->refcnt)) 121dc93f46bSEric Dumazet complete(&f->completion); 122dc93f46bSEric Dumazet 123dc93f46bSEric Dumazet wait_for_completion(&f->completion); 124648700f7SEric Dumazet 125d4ad4d22SNikolay Aleksandrov kmem_cache_destroy(f->frags_cachep); 126648700f7SEric Dumazet f->frags_cachep = NULL; 1277eb95156SPavel Emelyanov } 1287eb95156SPavel Emelyanov EXPORT_SYMBOL(inet_frags_fini); 129277e650dSPavel Emelyanov 1303c8fc878SEric Dumazet /* called from rhashtable_free_and_destroy() at netns_frags dismantle */ 131648700f7SEric Dumazet static void inet_frags_free_cb(void *ptr, void *arg) 132648700f7SEric Dumazet { 133648700f7SEric Dumazet struct inet_frag_queue *fq = ptr; 1343c8fc878SEric Dumazet int count; 135648700f7SEric Dumazet 1363c8fc878SEric Dumazet count = del_timer_sync(&fq->timer) ? 1 : 0; 137648700f7SEric Dumazet 138648700f7SEric Dumazet spin_lock_bh(&fq->lock); 13977adfd3aSEric Dumazet fq->flags |= INET_FRAG_DROP; 140648700f7SEric Dumazet if (!(fq->flags & INET_FRAG_COMPLETE)) { 141648700f7SEric Dumazet fq->flags |= INET_FRAG_COMPLETE; 1423c8fc878SEric Dumazet count++; 1433c8fc878SEric Dumazet } else if (fq->flags & INET_FRAG_HASH_DEAD) { 1443c8fc878SEric Dumazet count++; 145648700f7SEric Dumazet } 146648700f7SEric Dumazet spin_unlock_bh(&fq->lock); 147648700f7SEric Dumazet 1483c8fc878SEric Dumazet if (refcount_sub_and_test(count, &fq->refcnt)) 1493c8fc878SEric Dumazet inet_frag_destroy(fq); 1503c8fc878SEric Dumazet } 1513c8fc878SEric Dumazet 1520b9b2414SSeongJae Park static LLIST_HEAD(fqdir_free_list); 1533c8fc878SEric Dumazet 1540b9b2414SSeongJae Park static void fqdir_free_fn(struct work_struct *work) 1550b9b2414SSeongJae Park { 1560b9b2414SSeongJae Park struct llist_node *kill_list; 1570b9b2414SSeongJae Park struct fqdir *fqdir, *tmp; 1580b9b2414SSeongJae Park struct inet_frags *f; 1590b9b2414SSeongJae Park 1600b9b2414SSeongJae Park /* Atomically snapshot the list of fqdirs to free */ 1610b9b2414SSeongJae Park kill_list = llist_del_all(&fqdir_free_list); 162dc93f46bSEric Dumazet 163dc93f46bSEric Dumazet /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) 164dc93f46bSEric Dumazet * have completed, since they need to dereference fqdir. 165dc93f46bSEric Dumazet * Would it not be nice to have kfree_rcu_barrier() ? :) 166dc93f46bSEric Dumazet */ 167dc93f46bSEric Dumazet rcu_barrier(); 168dc93f46bSEric Dumazet 1690b9b2414SSeongJae Park llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) { 1700b9b2414SSeongJae Park f = fqdir->f; 171dc93f46bSEric Dumazet if (refcount_dec_and_test(&f->refcnt)) 172dc93f46bSEric Dumazet complete(&f->completion); 173dc93f46bSEric Dumazet 1743c8fc878SEric Dumazet kfree(fqdir); 175648700f7SEric Dumazet } 1760b9b2414SSeongJae Park } 1770b9b2414SSeongJae Park 178*802e12ffSEric Dumazet static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn); 1790b9b2414SSeongJae Park 1800b9b2414SSeongJae Park static void fqdir_work_fn(struct work_struct *work) 1810b9b2414SSeongJae Park { 1820b9b2414SSeongJae Park struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); 1830b9b2414SSeongJae Park 1840b9b2414SSeongJae Park rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); 1850b9b2414SSeongJae Park 1860b9b2414SSeongJae Park if (llist_add(&fqdir->free_list, &fqdir_free_list)) 187*802e12ffSEric Dumazet queue_delayed_work(system_wq, &fqdir_free_work, HZ); 1880b9b2414SSeongJae Park } 189648700f7SEric Dumazet 1906b73d197SEric Dumazet int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) 1916b73d197SEric Dumazet { 1926b73d197SEric Dumazet struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); 1936b73d197SEric Dumazet int res; 1946b73d197SEric Dumazet 1956b73d197SEric Dumazet if (!fqdir) 1966b73d197SEric Dumazet return -ENOMEM; 1976b73d197SEric Dumazet fqdir->f = f; 1986b73d197SEric Dumazet fqdir->net = net; 1996b73d197SEric Dumazet res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); 2006b73d197SEric Dumazet if (res < 0) { 2016b73d197SEric Dumazet kfree(fqdir); 2026b73d197SEric Dumazet return res; 2036b73d197SEric Dumazet } 204dc93f46bSEric Dumazet refcount_inc(&f->refcnt); 2056b73d197SEric Dumazet *fqdirp = fqdir; 2066b73d197SEric Dumazet return 0; 2076b73d197SEric Dumazet } 2086b73d197SEric Dumazet EXPORT_SYMBOL(fqdir_init); 2096b73d197SEric Dumazet 2100b9b2414SSeongJae Park static struct workqueue_struct *inet_frag_wq; 2110b9b2414SSeongJae Park 2120b9b2414SSeongJae Park static int __init inet_frag_wq_init(void) 2130b9b2414SSeongJae Park { 2140b9b2414SSeongJae Park inet_frag_wq = create_workqueue("inet_frag_wq"); 2150b9b2414SSeongJae Park if (!inet_frag_wq) 2160b9b2414SSeongJae Park panic("Could not create inet frag workq"); 2170b9b2414SSeongJae Park return 0; 2180b9b2414SSeongJae Park } 2190b9b2414SSeongJae Park 2200b9b2414SSeongJae Park pure_initcall(inet_frag_wq_init); 2210b9b2414SSeongJae Park 22289fb9005SEric Dumazet void fqdir_exit(struct fqdir *fqdir) 22381566e83SPavel Emelyanov { 224d5dd8879SEric Dumazet INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); 2250b9b2414SSeongJae Park queue_work(inet_frag_wq, &fqdir->destroy_work); 22681566e83SPavel Emelyanov } 22789fb9005SEric Dumazet EXPORT_SYMBOL(fqdir_exit); 22881566e83SPavel Emelyanov 229093ba729SEric Dumazet void inet_frag_kill(struct inet_frag_queue *fq) 230277e650dSPavel Emelyanov { 231277e650dSPavel Emelyanov if (del_timer(&fq->timer)) 232edcb6918SReshetova, Elena refcount_dec(&fq->refcnt); 233277e650dSPavel Emelyanov 23406aa8b8aSNikolay Aleksandrov if (!(fq->flags & INET_FRAG_COMPLETE)) { 2356ce3b4dcSEric Dumazet struct fqdir *fqdir = fq->fqdir; 236648700f7SEric Dumazet 237648700f7SEric Dumazet fq->flags |= INET_FRAG_COMPLETE; 2383c8fc878SEric Dumazet rcu_read_lock(); 23932707c4dSHerbert Xu /* The RCU read lock provides a memory barrier 24032707c4dSHerbert Xu * guaranteeing that if fqdir->dead is false then 24132707c4dSHerbert Xu * the hash table destruction will not start until 24291341fa0SEric Dumazet * after we unlock. Paired with fqdir_pre_exit(). 2433c8fc878SEric Dumazet */ 24491341fa0SEric Dumazet if (!READ_ONCE(fqdir->dead)) { 2453c8fc878SEric Dumazet rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, 2463c8fc878SEric Dumazet fqdir->f->rhash_params); 247edcb6918SReshetova, Elena refcount_dec(&fq->refcnt); 2483c8fc878SEric Dumazet } else { 2493c8fc878SEric Dumazet fq->flags |= INET_FRAG_HASH_DEAD; 2503c8fc878SEric Dumazet } 2513c8fc878SEric Dumazet rcu_read_unlock(); 252277e650dSPavel Emelyanov } 253277e650dSPavel Emelyanov } 254277e650dSPavel Emelyanov EXPORT_SYMBOL(inet_frag_kill); 2551e4b8287SPavel Emelyanov 256648700f7SEric Dumazet static void inet_frag_destroy_rcu(struct rcu_head *head) 257648700f7SEric Dumazet { 258648700f7SEric Dumazet struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, 259648700f7SEric Dumazet rcu); 2606ce3b4dcSEric Dumazet struct inet_frags *f = q->fqdir->f; 261648700f7SEric Dumazet 262648700f7SEric Dumazet if (f->destructor) 263648700f7SEric Dumazet f->destructor(q); 264648700f7SEric Dumazet kmem_cache_free(f->frags_cachep, q); 265648700f7SEric Dumazet } 266648700f7SEric Dumazet 26777adfd3aSEric Dumazet unsigned int inet_frag_rbtree_purge(struct rb_root *root, 26877adfd3aSEric Dumazet enum skb_drop_reason reason) 269c23f35d1SPeter Oskolkov { 270c23f35d1SPeter Oskolkov struct rb_node *p = rb_first(root); 271c23f35d1SPeter Oskolkov unsigned int sum = 0; 272c23f35d1SPeter Oskolkov 273c23f35d1SPeter Oskolkov while (p) { 274c23f35d1SPeter Oskolkov struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 275c23f35d1SPeter Oskolkov 276c23f35d1SPeter Oskolkov p = rb_next(p); 277c23f35d1SPeter Oskolkov rb_erase(&skb->rbnode, root); 278c23f35d1SPeter Oskolkov while (skb) { 279c23f35d1SPeter Oskolkov struct sk_buff *next = FRAG_CB(skb)->next_frag; 280c23f35d1SPeter Oskolkov 281c23f35d1SPeter Oskolkov sum += skb->truesize; 28277adfd3aSEric Dumazet kfree_skb_reason(skb, reason); 283c23f35d1SPeter Oskolkov skb = next; 284c23f35d1SPeter Oskolkov } 285c23f35d1SPeter Oskolkov } 286c23f35d1SPeter Oskolkov return sum; 287c23f35d1SPeter Oskolkov } 288c23f35d1SPeter Oskolkov EXPORT_SYMBOL(inet_frag_rbtree_purge); 289c23f35d1SPeter Oskolkov 290093ba729SEric Dumazet void inet_frag_destroy(struct inet_frag_queue *q) 2911e4b8287SPavel Emelyanov { 292d433673eSJesper Dangaard Brouer unsigned int sum, sum_truesize = 0; 29377adfd3aSEric Dumazet enum skb_drop_reason reason; 294093ba729SEric Dumazet struct inet_frags *f; 29577adfd3aSEric Dumazet struct fqdir *fqdir; 2961e4b8287SPavel Emelyanov 29706aa8b8aSNikolay Aleksandrov WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); 29877adfd3aSEric Dumazet reason = (q->flags & INET_FRAG_DROP) ? 29977adfd3aSEric Dumazet SKB_DROP_REASON_FRAG_REASM_TIMEOUT : 30077adfd3aSEric Dumazet SKB_CONSUMED; 301547b792cSIlpo Järvinen WARN_ON(del_timer(&q->timer) != 0); 3021e4b8287SPavel Emelyanov 3031e4b8287SPavel Emelyanov /* Release all fragment data. */ 3046ce3b4dcSEric Dumazet fqdir = q->fqdir; 3056ce3b4dcSEric Dumazet f = fqdir->f; 30677adfd3aSEric Dumazet sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason); 307d433673eSJesper Dangaard Brouer sum = sum_truesize + f->qsize; 3081e4b8287SPavel Emelyanov 309648700f7SEric Dumazet call_rcu(&q->rcu, inet_frag_destroy_rcu); 3105719b296SFlorian Westphal 3116ce3b4dcSEric Dumazet sub_frag_mem_limit(fqdir, sum); 3121e4b8287SPavel Emelyanov } 3131e4b8287SPavel Emelyanov EXPORT_SYMBOL(inet_frag_destroy); 3148e7999c4SPavel Emelyanov 3156ce3b4dcSEric Dumazet static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, 316f926e236SNikolay Aleksandrov struct inet_frags *f, 317f926e236SNikolay Aleksandrov void *arg) 318e521db9dSPavel Emelyanov { 319e521db9dSPavel Emelyanov struct inet_frag_queue *q; 320e521db9dSPavel Emelyanov 321d4ad4d22SNikolay Aleksandrov q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); 32251456b29SIan Morris if (!q) 323e521db9dSPavel Emelyanov return NULL; 324e521db9dSPavel Emelyanov 3256ce3b4dcSEric Dumazet q->fqdir = fqdir; 326c6fda282SPavel Emelyanov f->constructor(q, arg); 3276ce3b4dcSEric Dumazet add_frag_mem_limit(fqdir, f->qsize); 328d433673eSJesper Dangaard Brouer 32978802011SKees Cook timer_setup(&q->timer, f->frag_expire, 0); 330e521db9dSPavel Emelyanov spin_lock_init(&q->lock); 331648700f7SEric Dumazet refcount_set(&q->refcnt, 3); 332e521db9dSPavel Emelyanov 333e521db9dSPavel Emelyanov return q; 334e521db9dSPavel Emelyanov } 335c6fda282SPavel Emelyanov 3366ce3b4dcSEric Dumazet static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, 3370d5b9311SEric Dumazet void *arg, 3380d5b9311SEric Dumazet struct inet_frag_queue **prev) 339c6fda282SPavel Emelyanov { 3406ce3b4dcSEric Dumazet struct inet_frags *f = fqdir->f; 341c6fda282SPavel Emelyanov struct inet_frag_queue *q; 342c6fda282SPavel Emelyanov 3436ce3b4dcSEric Dumazet q = inet_frag_alloc(fqdir, f, arg); 3440d5b9311SEric Dumazet if (!q) { 3450d5b9311SEric Dumazet *prev = ERR_PTR(-ENOMEM); 346c6fda282SPavel Emelyanov return NULL; 3470d5b9311SEric Dumazet } 3486ce3b4dcSEric Dumazet mod_timer(&q->timer, jiffies + fqdir->timeout); 349648700f7SEric Dumazet 3506ce3b4dcSEric Dumazet *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, 3510d5b9311SEric Dumazet &q->node, f->rhash_params); 3520d5b9311SEric Dumazet if (*prev) { 353648700f7SEric Dumazet q->flags |= INET_FRAG_COMPLETE; 354648700f7SEric Dumazet inet_frag_kill(q); 355648700f7SEric Dumazet inet_frag_destroy(q); 356648700f7SEric Dumazet return NULL; 357c6fda282SPavel Emelyanov } 358abd6523dSPavel Emelyanov return q; 359abd6523dSPavel Emelyanov } 360648700f7SEric Dumazet 361648700f7SEric Dumazet /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ 3626ce3b4dcSEric Dumazet struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) 363648700f7SEric Dumazet { 36491341fa0SEric Dumazet /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ 36591341fa0SEric Dumazet long high_thresh = READ_ONCE(fqdir->high_thresh); 3660d5b9311SEric Dumazet struct inet_frag_queue *fq = NULL, *prev; 367648700f7SEric Dumazet 36891341fa0SEric Dumazet if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) 36956e2c94fSEric Dumazet return NULL; 37056e2c94fSEric Dumazet 371648700f7SEric Dumazet rcu_read_lock(); 372648700f7SEric Dumazet 3736ce3b4dcSEric Dumazet prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); 3740d5b9311SEric Dumazet if (!prev) 3756ce3b4dcSEric Dumazet fq = inet_frag_create(fqdir, key, &prev); 376c7148c03SPavel Machek if (!IS_ERR_OR_NULL(prev)) { 3770d5b9311SEric Dumazet fq = prev; 378648700f7SEric Dumazet if (!refcount_inc_not_zero(&fq->refcnt)) 379648700f7SEric Dumazet fq = NULL; 380abd6523dSPavel Emelyanov } 381648700f7SEric Dumazet rcu_read_unlock(); 3820d5b9311SEric Dumazet return fq; 383abd6523dSPavel Emelyanov } 384abd6523dSPavel Emelyanov EXPORT_SYMBOL(inet_frag_find); 385c23f35d1SPeter Oskolkov 386c23f35d1SPeter Oskolkov int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, 387c23f35d1SPeter Oskolkov int offset, int end) 388c23f35d1SPeter Oskolkov { 389c23f35d1SPeter Oskolkov struct sk_buff *last = q->fragments_tail; 390c23f35d1SPeter Oskolkov 391c23f35d1SPeter Oskolkov /* RFC5722, Section 4, amended by Errata ID : 3089 392c23f35d1SPeter Oskolkov * When reassembling an IPv6 datagram, if 393c23f35d1SPeter Oskolkov * one or more its constituent fragments is determined to be an 394c23f35d1SPeter Oskolkov * overlapping fragment, the entire datagram (and any constituent 395c23f35d1SPeter Oskolkov * fragments) MUST be silently discarded. 396c23f35d1SPeter Oskolkov * 397c23f35d1SPeter Oskolkov * Duplicates, however, should be ignored (i.e. skb dropped, but the 398c23f35d1SPeter Oskolkov * queue/fragments kept for later reassembly). 399c23f35d1SPeter Oskolkov */ 400c23f35d1SPeter Oskolkov if (!last) 401c23f35d1SPeter Oskolkov fragrun_create(q, skb); /* First fragment. */ 40218685451SFlorian Westphal else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) { 403c23f35d1SPeter Oskolkov /* This is the common case: skb goes to the end. */ 404c23f35d1SPeter Oskolkov /* Detect and discard overlaps. */ 40518685451SFlorian Westphal if (offset < FRAG_CB(last)->ip_defrag_offset + last->len) 406c23f35d1SPeter Oskolkov return IPFRAG_OVERLAP; 40718685451SFlorian Westphal if (offset == FRAG_CB(last)->ip_defrag_offset + last->len) 408c23f35d1SPeter Oskolkov fragrun_append_to_last(q, skb); 409c23f35d1SPeter Oskolkov else 410c23f35d1SPeter Oskolkov fragrun_create(q, skb); 411c23f35d1SPeter Oskolkov } else { 412c23f35d1SPeter Oskolkov /* Binary search. Note that skb can become the first fragment, 413c23f35d1SPeter Oskolkov * but not the last (covered above). 414c23f35d1SPeter Oskolkov */ 415c23f35d1SPeter Oskolkov struct rb_node **rbn, *parent; 416c23f35d1SPeter Oskolkov 417c23f35d1SPeter Oskolkov rbn = &q->rb_fragments.rb_node; 418c23f35d1SPeter Oskolkov do { 419c23f35d1SPeter Oskolkov struct sk_buff *curr; 420c23f35d1SPeter Oskolkov int curr_run_end; 421c23f35d1SPeter Oskolkov 422c23f35d1SPeter Oskolkov parent = *rbn; 423c23f35d1SPeter Oskolkov curr = rb_to_skb(parent); 42418685451SFlorian Westphal curr_run_end = FRAG_CB(curr)->ip_defrag_offset + 425c23f35d1SPeter Oskolkov FRAG_CB(curr)->frag_run_len; 42618685451SFlorian Westphal if (end <= FRAG_CB(curr)->ip_defrag_offset) 427c23f35d1SPeter Oskolkov rbn = &parent->rb_left; 428c23f35d1SPeter Oskolkov else if (offset >= curr_run_end) 429c23f35d1SPeter Oskolkov rbn = &parent->rb_right; 43018685451SFlorian Westphal else if (offset >= FRAG_CB(curr)->ip_defrag_offset && 431c23f35d1SPeter Oskolkov end <= curr_run_end) 432c23f35d1SPeter Oskolkov return IPFRAG_DUP; 433c23f35d1SPeter Oskolkov else 434c23f35d1SPeter Oskolkov return IPFRAG_OVERLAP; 435c23f35d1SPeter Oskolkov } while (*rbn); 436c23f35d1SPeter Oskolkov /* Here we have parent properly set, and rbn pointing to 437c23f35d1SPeter Oskolkov * one of its NULL left/right children. Insert skb. 438c23f35d1SPeter Oskolkov */ 439c23f35d1SPeter Oskolkov fragcb_clear(skb); 440c23f35d1SPeter Oskolkov rb_link_node(&skb->rbnode, parent, rbn); 441c23f35d1SPeter Oskolkov rb_insert_color(&skb->rbnode, &q->rb_fragments); 442c23f35d1SPeter Oskolkov } 443c23f35d1SPeter Oskolkov 44418685451SFlorian Westphal FRAG_CB(skb)->ip_defrag_offset = offset; 445c23f35d1SPeter Oskolkov 446c23f35d1SPeter Oskolkov return IPFRAG_OK; 447c23f35d1SPeter Oskolkov } 448c23f35d1SPeter Oskolkov EXPORT_SYMBOL(inet_frag_queue_insert); 449c23f35d1SPeter Oskolkov 450c23f35d1SPeter Oskolkov void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, 451c23f35d1SPeter Oskolkov struct sk_buff *parent) 452c23f35d1SPeter Oskolkov { 453c23f35d1SPeter Oskolkov struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); 45418685451SFlorian Westphal void (*destructor)(struct sk_buff *); 45518685451SFlorian Westphal unsigned int orig_truesize = 0; 45618685451SFlorian Westphal struct sk_buff **nextp = NULL; 45718685451SFlorian Westphal struct sock *sk = skb->sk; 458c23f35d1SPeter Oskolkov int delta; 459c23f35d1SPeter Oskolkov 46018685451SFlorian Westphal if (sk && is_skb_wmem(skb)) { 46118685451SFlorian Westphal /* TX: skb->sk might have been passed as argument to 46218685451SFlorian Westphal * dst->output and must remain valid until tx completes. 46318685451SFlorian Westphal * 46418685451SFlorian Westphal * Move sk to reassembled skb and fix up wmem accounting. 46518685451SFlorian Westphal */ 46618685451SFlorian Westphal orig_truesize = skb->truesize; 46718685451SFlorian Westphal destructor = skb->destructor; 46818685451SFlorian Westphal } 46918685451SFlorian Westphal 470c23f35d1SPeter Oskolkov if (head != skb) { 471c23f35d1SPeter Oskolkov fp = skb_clone(skb, GFP_ATOMIC); 47218685451SFlorian Westphal if (!fp) { 47318685451SFlorian Westphal head = skb; 47418685451SFlorian Westphal goto out_restore_sk; 47518685451SFlorian Westphal } 476c23f35d1SPeter Oskolkov FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; 477c23f35d1SPeter Oskolkov if (RB_EMPTY_NODE(&skb->rbnode)) 478c23f35d1SPeter Oskolkov FRAG_CB(parent)->next_frag = fp; 479c23f35d1SPeter Oskolkov else 480c23f35d1SPeter Oskolkov rb_replace_node(&skb->rbnode, &fp->rbnode, 481c23f35d1SPeter Oskolkov &q->rb_fragments); 482c23f35d1SPeter Oskolkov if (q->fragments_tail == skb) 483c23f35d1SPeter Oskolkov q->fragments_tail = fp; 48418685451SFlorian Westphal 48518685451SFlorian Westphal if (orig_truesize) { 48618685451SFlorian Westphal /* prevent skb_morph from releasing sk */ 48718685451SFlorian Westphal skb->sk = NULL; 48818685451SFlorian Westphal skb->destructor = NULL; 48918685451SFlorian Westphal } 490c23f35d1SPeter Oskolkov skb_morph(skb, head); 491c23f35d1SPeter Oskolkov FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; 492c23f35d1SPeter Oskolkov rb_replace_node(&head->rbnode, &skb->rbnode, 493c23f35d1SPeter Oskolkov &q->rb_fragments); 494c23f35d1SPeter Oskolkov consume_skb(head); 495c23f35d1SPeter Oskolkov head = skb; 496c23f35d1SPeter Oskolkov } 49718685451SFlorian Westphal WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0); 498c23f35d1SPeter Oskolkov 499c23f35d1SPeter Oskolkov delta = -head->truesize; 500c23f35d1SPeter Oskolkov 501c23f35d1SPeter Oskolkov /* Head of list must not be cloned. */ 502c23f35d1SPeter Oskolkov if (skb_unclone(head, GFP_ATOMIC)) 50318685451SFlorian Westphal goto out_restore_sk; 504c23f35d1SPeter Oskolkov 505c23f35d1SPeter Oskolkov delta += head->truesize; 506c23f35d1SPeter Oskolkov if (delta) 5076ce3b4dcSEric Dumazet add_frag_mem_limit(q->fqdir, delta); 508c23f35d1SPeter Oskolkov 509c23f35d1SPeter Oskolkov /* If the first fragment is fragmented itself, we split 510c23f35d1SPeter Oskolkov * it to two chunks: the first with data and paged part 511c23f35d1SPeter Oskolkov * and the second, holding only fragments. 512c23f35d1SPeter Oskolkov */ 513c23f35d1SPeter Oskolkov if (skb_has_frag_list(head)) { 514c23f35d1SPeter Oskolkov struct sk_buff *clone; 515c23f35d1SPeter Oskolkov int i, plen = 0; 516c23f35d1SPeter Oskolkov 517c23f35d1SPeter Oskolkov clone = alloc_skb(0, GFP_ATOMIC); 518c23f35d1SPeter Oskolkov if (!clone) 51918685451SFlorian Westphal goto out_restore_sk; 520c23f35d1SPeter Oskolkov skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; 521c23f35d1SPeter Oskolkov skb_frag_list_init(head); 522c23f35d1SPeter Oskolkov for (i = 0; i < skb_shinfo(head)->nr_frags; i++) 523c23f35d1SPeter Oskolkov plen += skb_frag_size(&skb_shinfo(head)->frags[i]); 524c23f35d1SPeter Oskolkov clone->data_len = head->data_len - plen; 525c23f35d1SPeter Oskolkov clone->len = clone->data_len; 526c23f35d1SPeter Oskolkov head->truesize += clone->truesize; 527c23f35d1SPeter Oskolkov clone->csum = 0; 528c23f35d1SPeter Oskolkov clone->ip_summed = head->ip_summed; 5296ce3b4dcSEric Dumazet add_frag_mem_limit(q->fqdir, clone->truesize); 530c23f35d1SPeter Oskolkov skb_shinfo(head)->frag_list = clone; 531c23f35d1SPeter Oskolkov nextp = &clone->next; 532c23f35d1SPeter Oskolkov } else { 533c23f35d1SPeter Oskolkov nextp = &skb_shinfo(head)->frag_list; 534c23f35d1SPeter Oskolkov } 535c23f35d1SPeter Oskolkov 53618685451SFlorian Westphal out_restore_sk: 53718685451SFlorian Westphal if (orig_truesize) { 53818685451SFlorian Westphal int ts_delta = head->truesize - orig_truesize; 53918685451SFlorian Westphal 54018685451SFlorian Westphal /* if this reassembled skb is fragmented later, 54118685451SFlorian Westphal * fraglist skbs will get skb->sk assigned from head->sk, 54218685451SFlorian Westphal * and each frag skb will be released via sock_wfree. 54318685451SFlorian Westphal * 54418685451SFlorian Westphal * Update sk_wmem_alloc. 54518685451SFlorian Westphal */ 54618685451SFlorian Westphal head->sk = sk; 54718685451SFlorian Westphal head->destructor = destructor; 54818685451SFlorian Westphal refcount_add(ts_delta, &sk->sk_wmem_alloc); 54918685451SFlorian Westphal } 55018685451SFlorian Westphal 551c23f35d1SPeter Oskolkov return nextp; 552c23f35d1SPeter Oskolkov } 553c23f35d1SPeter Oskolkov EXPORT_SYMBOL(inet_frag_reasm_prepare); 554c23f35d1SPeter Oskolkov 555c23f35d1SPeter Oskolkov void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, 556891584f4SGuillaume Nault void *reasm_data, bool try_coalesce) 557c23f35d1SPeter Oskolkov { 55818685451SFlorian Westphal struct sock *sk = is_skb_wmem(head) ? head->sk : NULL; 55918685451SFlorian Westphal const unsigned int head_truesize = head->truesize; 5602e47eeceSYu Zhe struct sk_buff **nextp = reasm_data; 561c23f35d1SPeter Oskolkov struct rb_node *rbn; 562c23f35d1SPeter Oskolkov struct sk_buff *fp; 563891584f4SGuillaume Nault int sum_truesize; 564c23f35d1SPeter Oskolkov 565c23f35d1SPeter Oskolkov skb_push(head, head->data - skb_network_header(head)); 566c23f35d1SPeter Oskolkov 567c23f35d1SPeter Oskolkov /* Traverse the tree in order, to build frag_list. */ 568c23f35d1SPeter Oskolkov fp = FRAG_CB(head)->next_frag; 569c23f35d1SPeter Oskolkov rbn = rb_next(&head->rbnode); 570c23f35d1SPeter Oskolkov rb_erase(&head->rbnode, &q->rb_fragments); 571891584f4SGuillaume Nault 572891584f4SGuillaume Nault sum_truesize = head->truesize; 573c23f35d1SPeter Oskolkov while (rbn || fp) { 574c23f35d1SPeter Oskolkov /* fp points to the next sk_buff in the current run; 575c23f35d1SPeter Oskolkov * rbn points to the next run. 576c23f35d1SPeter Oskolkov */ 577c23f35d1SPeter Oskolkov /* Go through the current run. */ 578c23f35d1SPeter Oskolkov while (fp) { 579891584f4SGuillaume Nault struct sk_buff *next_frag = FRAG_CB(fp)->next_frag; 580891584f4SGuillaume Nault bool stolen; 581891584f4SGuillaume Nault int delta; 582891584f4SGuillaume Nault 583891584f4SGuillaume Nault sum_truesize += fp->truesize; 584c23f35d1SPeter Oskolkov if (head->ip_summed != fp->ip_summed) 585c23f35d1SPeter Oskolkov head->ip_summed = CHECKSUM_NONE; 586c23f35d1SPeter Oskolkov else if (head->ip_summed == CHECKSUM_COMPLETE) 587c23f35d1SPeter Oskolkov head->csum = csum_add(head->csum, fp->csum); 588891584f4SGuillaume Nault 589891584f4SGuillaume Nault if (try_coalesce && skb_try_coalesce(head, fp, &stolen, 590891584f4SGuillaume Nault &delta)) { 591891584f4SGuillaume Nault kfree_skb_partial(fp, stolen); 592891584f4SGuillaume Nault } else { 593891584f4SGuillaume Nault fp->prev = NULL; 594891584f4SGuillaume Nault memset(&fp->rbnode, 0, sizeof(fp->rbnode)); 595891584f4SGuillaume Nault fp->sk = NULL; 596891584f4SGuillaume Nault 597891584f4SGuillaume Nault head->data_len += fp->len; 598891584f4SGuillaume Nault head->len += fp->len; 599c23f35d1SPeter Oskolkov head->truesize += fp->truesize; 600891584f4SGuillaume Nault 601891584f4SGuillaume Nault *nextp = fp; 602891584f4SGuillaume Nault nextp = &fp->next; 603891584f4SGuillaume Nault } 604891584f4SGuillaume Nault 605891584f4SGuillaume Nault fp = next_frag; 606c23f35d1SPeter Oskolkov } 607c23f35d1SPeter Oskolkov /* Move to the next run. */ 608c23f35d1SPeter Oskolkov if (rbn) { 609c23f35d1SPeter Oskolkov struct rb_node *rbnext = rb_next(rbn); 610c23f35d1SPeter Oskolkov 611c23f35d1SPeter Oskolkov fp = rb_to_skb(rbn); 612c23f35d1SPeter Oskolkov rb_erase(rbn, &q->rb_fragments); 613c23f35d1SPeter Oskolkov rbn = rbnext; 614c23f35d1SPeter Oskolkov } 615c23f35d1SPeter Oskolkov } 616891584f4SGuillaume Nault sub_frag_mem_limit(q->fqdir, sum_truesize); 617c23f35d1SPeter Oskolkov 618c23f35d1SPeter Oskolkov *nextp = NULL; 619c23f35d1SPeter Oskolkov skb_mark_not_on_list(head); 620c23f35d1SPeter Oskolkov head->prev = NULL; 621c23f35d1SPeter Oskolkov head->tstamp = q->stamp; 6228672406eSMartin KaFai Lau head->mono_delivery_time = q->mono_delivery_time; 62318685451SFlorian Westphal 62418685451SFlorian Westphal if (sk) 62518685451SFlorian Westphal refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); 626c23f35d1SPeter Oskolkov } 627c23f35d1SPeter Oskolkov EXPORT_SYMBOL(inet_frag_reasm_finish); 628c23f35d1SPeter Oskolkov 629c23f35d1SPeter Oskolkov struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) 630c23f35d1SPeter Oskolkov { 631d8cf757fSPeter Oskolkov struct sk_buff *head, *skb; 632c23f35d1SPeter Oskolkov 633c23f35d1SPeter Oskolkov head = skb_rb_first(&q->rb_fragments); 634c23f35d1SPeter Oskolkov if (!head) 635c23f35d1SPeter Oskolkov return NULL; 636c23f35d1SPeter Oskolkov skb = FRAG_CB(head)->next_frag; 637c23f35d1SPeter Oskolkov if (skb) 638c23f35d1SPeter Oskolkov rb_replace_node(&head->rbnode, &skb->rbnode, 639c23f35d1SPeter Oskolkov &q->rb_fragments); 640c23f35d1SPeter Oskolkov else 641c23f35d1SPeter Oskolkov rb_erase(&head->rbnode, &q->rb_fragments); 642c23f35d1SPeter Oskolkov memset(&head->rbnode, 0, sizeof(head->rbnode)); 643c23f35d1SPeter Oskolkov barrier(); 644d8cf757fSPeter Oskolkov 645c23f35d1SPeter Oskolkov if (head == q->fragments_tail) 646c23f35d1SPeter Oskolkov q->fragments_tail = NULL; 647c23f35d1SPeter Oskolkov 6486ce3b4dcSEric Dumazet sub_frag_mem_limit(q->fqdir, head->truesize); 649c23f35d1SPeter Oskolkov 650c23f35d1SPeter Oskolkov return head; 651c23f35d1SPeter Oskolkov } 652c23f35d1SPeter Oskolkov EXPORT_SYMBOL(inet_frag_pull_head); 653