// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * inet fragments management
 *
 * Authors:	Pavel Emelyanov <xemul@openvz.org>
 *		Started as consolidation of ipv4/ip_fragment.c,
 *		ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

#include "../core/sock_destructor.h"

/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
	union {
		struct inet_skb_parm	h4;
		struct inet6_skb_parm	h6;
	};
	struct sk_buff		*next_frag;
	int			frag_run_len;
	int			ip_defrag_offset;
};

#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))

static void fragcb_clear(struct sk_buff *skb)
{
	RB_CLEAR_NODE(&skb->rbnode);
	FRAG_CB(skb)->next_frag = NULL;
	FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
				   struct sk_buff *skb)
{
	fragcb_clear(skb);

	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
	FRAG_CB(q->fragments_tail)->next_frag = skb;
	q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
	fragcb_clear(skb);

	if (q->last_run_head)
		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
			     &q->last_run_head->rbnode.rb_right);
	else
		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
	rb_insert_color(&skb->rbnode, &q->rb_fragments);

	q->fragments_tail = skb;
	q->last_run_head = skb;
}

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *	   0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
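
/* Illustrative sketch (hypothetical helper, not used by any in-tree
 * caller): how a reassembler is expected to consume the table above.
 * Each received fragment contributes one IPFRAG_ECN_* bit; the
 * accumulated mask indexes ip_frag_ecn_table once, at reassembly time.
 */
static inline int ip_frag_ecn_fold(u8 ecn_bits, u8 *tos)
{
	u8 ecn = ip_frag_ecn_table[ecn_bits & 0xf];

	if (ecn == 0xff)	/* invalid mix of codepoints: drop datagram */
		return -EINVAL;

	*tos |= ecn;		/* 0, or INET_ECN_CE if any fragment had CE */
	return 0;
}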

int inet_frags_init(struct inet_frags *f)
{
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	refcount_set(&f->refcnt, 1);
	init_completion(&f->completion);
	return 0;
}
EXPORT_SYMBOL(inet_frags_init);

void inet_frags_fini(struct inet_frags *f)
{
	if (refcount_dec_and_test(&f->refcnt))
		complete(&f->completion);

	wait_for_completion(&f->completion);

	kmem_cache_destroy(f->frags_cachep);
	f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);
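
/* Usage sketch (hypothetical names, illustrative only): a reassembly
 * protocol fills in one struct inet_frags and registers it once,
 * typically from its __init routine, and unregisters it on teardown.
 * inet_frags_fini() blocks until every fqdir created against this
 * inet_frags has been freed (see the refcount/completion pair above).
 *
 *	static struct inet_frags example_frags = {
 *		.qsize			= sizeof(struct example_frag_queue),
 *		.constructor		= example_frag_key_init,
 *		.destructor		= example_frag_key_free,
 *		.frag_expire		= example_frag_expire,
 *		.frags_cache_name	= "example_frag_queues",
 *		.rhash_params		= example_rhash_params,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return inet_frags_init(&example_frags);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		inet_frags_fini(&example_frags);
 *	}
 */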

/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
	struct inet_frag_queue *fq = ptr;
	int count;

	count = del_timer_sync(&fq->timer) ? 1 : 0;

	spin_lock_bh(&fq->lock);
	fq->flags |= INET_FRAG_DROP;
	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		count++;
	} else if (fq->flags & INET_FRAG_HASH_DEAD) {
		count++;
	}
	spin_unlock_bh(&fq->lock);

	if (refcount_sub_and_test(count, &fq->refcnt))
		inet_frag_destroy(fq);
}

static LLIST_HEAD(fqdir_free_list);

static void fqdir_free_fn(struct work_struct *work)
{
	struct llist_node *kill_list;
	struct fqdir *fqdir, *tmp;
	struct inet_frags *f;

	/* Atomically snapshot the list of fqdirs to free */
	kill_list = llist_del_all(&fqdir_free_list);

	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
	 * have completed, since they need to dereference fqdir.
	 * Would it not be nice to have kfree_rcu_barrier() ? :)
	 */
	rcu_barrier();

	llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) {
		f = fqdir->f;
		if (refcount_dec_and_test(&f->refcnt))
			complete(&f->completion);

		kfree(fqdir);
	}
}

static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn);

static void fqdir_work_fn(struct work_struct *work)
{
	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);

	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);

	if (llist_add(&fqdir->free_list, &fqdir_free_list))
		queue_delayed_work(system_wq, &fqdir_free_work, HZ);
}

int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
{
	struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
	int res;

	if (!fqdir)
		return -ENOMEM;
	fqdir->f = f;
	fqdir->net = net;
	res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
	if (res < 0) {
		kfree(fqdir);
		return res;
	}
	refcount_inc(&f->refcnt);
	*fqdirp = fqdir;
	return 0;
}
EXPORT_SYMBOL(fqdir_init);
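
/* Usage sketch (hypothetical pernet hooks, illustrative only): each
 * namespace gets its own fqdir.  The pernet init hook creates it and
 * sets the memory/timeout limits; teardown is typically two-staged,
 * fqdir_pre_exit() first (stop admitting new queues), then fqdir_exit()
 * once packet processing has stopped.
 *
 *	static int __net_init example_frags_init_net(struct net *net)
 *	{
 *		int res = fqdir_init(&net->example_fqdir, &example_frags, net);
 *
 *		if (res < 0)
 *			return res;
 *		net->example_fqdir->high_thresh	= 4 * 1024 * 1024;
 *		net->example_fqdir->timeout	= 30 * HZ;
 *		return 0;
 *	}
 *
 *	static void __net_exit example_frags_pre_exit_net(struct net *net)
 *	{
 *		fqdir_pre_exit(net->example_fqdir);
 *	}
 *
 *	static void __net_exit example_frags_exit_net(struct net *net)
 *	{
 *		fqdir_exit(net->example_fqdir);
 *	}
 */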

static struct workqueue_struct *inet_frag_wq;

static int __init inet_frag_wq_init(void)
{
	inet_frag_wq = create_workqueue("inet_frag_wq");
	if (!inet_frag_wq)
		panic("Could not create inet frag workq");
	return 0;
}

pure_initcall(inet_frag_wq_init);

void fqdir_exit(struct fqdir *fqdir)
{
	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
	queue_work(inet_frag_wq, &fqdir->destroy_work);
}
EXPORT_SYMBOL(fqdir_exit);

void inet_frag_kill(struct inet_frag_queue *fq)
{
	if (del_timer(&fq->timer))
		refcount_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		struct fqdir *fqdir = fq->fqdir;

		fq->flags |= INET_FRAG_COMPLETE;
		rcu_read_lock();
		/* The RCU read lock provides a memory barrier
		 * guaranteeing that if fqdir->dead is false then
		 * the hash table destruction will not start until
		 * after we unlock. Paired with fqdir_pre_exit().
		 */
		if (!READ_ONCE(fqdir->dead)) {
			rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
					       fqdir->f->rhash_params);
			refcount_dec(&fq->refcnt);
		} else {
			fq->flags |= INET_FRAG_HASH_DEAD;
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(inet_frag_kill);

static void inet_frag_destroy_rcu(struct rcu_head *head)
{
	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
						 rcu);
	struct inet_frags *f = q->fqdir->f;

	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}

unsigned int inet_frag_rbtree_purge(struct rb_root *root,
				    enum skb_drop_reason reason)
{
	struct rb_node *p = rb_first(root);
	unsigned int sum = 0;

	while (p) {
		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

		p = rb_next(p);
		rb_erase(&skb->rbnode, root);
		while (skb) {
			struct sk_buff *next = FRAG_CB(skb)->next_frag;

			sum += skb->truesize;
			kfree_skb_reason(skb, reason);
			skb = next;
		}
	}
	return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);

void inet_frag_destroy(struct inet_frag_queue *q)
{
	unsigned int sum, sum_truesize = 0;
	enum skb_drop_reason reason;
	struct inet_frags *f;
	struct fqdir *fqdir;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	reason = (q->flags & INET_FRAG_DROP) ?
			SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
			SKB_CONSUMED;
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fqdir = q->fqdir;
	f = fqdir->f;
	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason);
	sum = sum_truesize + f->qsize;

	call_rcu(&q->rcu, inet_frag_destroy_rcu);

	sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	q->fqdir = fqdir;
	f->constructor(q, arg);
	add_frag_mem_limit(fqdir, f->qsize);

	timer_setup(&q->timer, f->frag_expire, 0);
	spin_lock_init(&q->lock);
	refcount_set(&q->refcnt, 3);

	return q;
}

static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
						void *arg,
						struct inet_frag_queue **prev)
{
	struct inet_frags *f = fqdir->f;
	struct inet_frag_queue *q;

	q = inet_frag_alloc(fqdir, f, arg);
	if (!q) {
		*prev = ERR_PTR(-ENOMEM);
		return NULL;
	}
	mod_timer(&q->timer, jiffies + fqdir->timeout);

	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
						 &q->node, f->rhash_params);
	if (*prev) {
		q->flags |= INET_FRAG_COMPLETE;
		inet_frag_kill(q);
		inet_frag_destroy(q);
		return NULL;
	}
	return q;
}

/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
	/* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
	long high_thresh = READ_ONCE(fqdir->high_thresh);
	struct inet_frag_queue *fq = NULL, *prev;

	if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
		return NULL;

	rcu_read_lock();

	prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
	if (!prev)
		fq = inet_frag_create(fqdir, key, &prev);
	if (!IS_ERR_OR_NULL(prev)) {
		fq = prev;
		if (!refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;
	}
	rcu_read_unlock();
	return fq;
}
EXPORT_SYMBOL(inet_frag_find);
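
/* Usage sketch (hypothetical key type and fqdir field, illustrative
 * only): a protocol looks up or creates its queue by passing a key that
 * matches its rhash_params, then container_of()'s the embedded
 * inet_frag_queue back to its own type.  The caller owns one reference
 * and must drop it with inet_frag_put() when done.
 *
 *	struct example_frag_key key = { .id = id, .saddr = saddr, .daddr = daddr };
 *	struct inet_frag_queue *q;
 *
 *	q = inet_frag_find(net->example_fqdir, &key);
 *	if (!q)
 *		goto drop;	// over high_thresh, or allocation failed
 *	...
 *	inet_frag_put(q);
 */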

int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
			   int offset, int end)
{
	struct sk_buff *last = q->fragments_tail;

	/* RFC5722, Section 4, amended by Errata ID : 3089
	 * When reassembling an IPv6 datagram, if
	 * one or more of its constituent fragments is determined to be an
	 * overlapping fragment, the entire datagram (and any constituent
	 * fragments) MUST be silently discarded.
	 *
	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
	 * queue/fragments kept for later reassembly).
	 */
	if (!last)
		fragrun_create(q, skb);  /* First fragment. */
	else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) {
		/* This is the common case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < FRAG_CB(last)->ip_defrag_offset + last->len)
			return IPFRAG_OVERLAP;
		if (offset == FRAG_CB(last)->ip_defrag_offset + last->len)
			fragrun_append_to_last(q, skb);
		else
			fragrun_create(q, skb);
	} else {
		/* Binary search. Note that skb can become the first fragment,
		 * but not the last (covered above).
		 */
		struct rb_node **rbn, *parent;

		rbn = &q->rb_fragments.rb_node;
		do {
			struct sk_buff *curr;
			int curr_run_end;

			parent = *rbn;
			curr = rb_to_skb(parent);
			curr_run_end = FRAG_CB(curr)->ip_defrag_offset +
				       FRAG_CB(curr)->frag_run_len;
			if (end <= FRAG_CB(curr)->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= curr_run_end)
				rbn = &parent->rb_right;
			else if (offset >= FRAG_CB(curr)->ip_defrag_offset &&
				 end <= curr_run_end)
				return IPFRAG_DUP;
			else
				return IPFRAG_OVERLAP;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb.
		 */
		fragcb_clear(skb);
		rb_link_node(&skb->rbnode, parent, rbn);
		rb_insert_color(&skb->rbnode, &q->rb_fragments);
	}

	FRAG_CB(skb)->ip_defrag_offset = offset;

	return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);
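
/* Usage sketch (illustrative labels only): how a caller is expected to
 * act on the three insertion results, matching the RFC 5722 note above.
 *
 *	err = inet_frag_queue_insert(q, skb, offset, end);
 *	switch (err) {
 *	case IPFRAG_OK:
 *		break;			// fragment queued, try to reassemble
 *	case IPFRAG_DUP:
 *		kfree_skb(skb);		// drop this skb, keep the queue
 *		goto out;
 *	case IPFRAG_OVERLAP:
 *		inet_frag_kill(q);	// discard the whole datagram
 *		goto discard;
 *	}
 */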

void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
			      struct sk_buff *parent)
{
	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
	void (*destructor)(struct sk_buff *);
	unsigned int orig_truesize = 0;
	struct sk_buff **nextp = NULL;
	struct sock *sk = skb->sk;
	int delta;

	if (sk && is_skb_wmem(skb)) {
		/* TX: skb->sk might have been passed as argument to
		 * dst->output and must remain valid until tx completes.
		 *
		 * Move sk to reassembled skb and fix up wmem accounting.
		 */
		orig_truesize = skb->truesize;
		destructor = skb->destructor;
	}

	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
		if (!fp) {
			head = skb;
			goto out_restore_sk;
		}
		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
		if (RB_EMPTY_NODE(&skb->rbnode))
			FRAG_CB(parent)->next_frag = fp;
		else
			rb_replace_node(&skb->rbnode, &fp->rbnode,
					&q->rb_fragments);
		if (q->fragments_tail == skb)
			q->fragments_tail = fp;

		if (orig_truesize) {
			/* prevent skb_morph from releasing sk */
			skb->sk = NULL;
			skb->destructor = NULL;
		}
		skb_morph(skb, head);
		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
		consume_skb(head);
		head = skb;
	}
	WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0);

	delta = -head->truesize;

	/* Head of list must not be cloned. */
	if (skb_unclone(head, GFP_ATOMIC))
		goto out_restore_sk;

	delta += head->truesize;
	if (delta)
		add_frag_mem_limit(q->fqdir, delta);

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments.
	 */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
			goto out_restore_sk;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->data_len = head->data_len - plen;
		clone->len = clone->data_len;
		head->truesize += clone->truesize;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(q->fqdir, clone->truesize);
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
	}

out_restore_sk:
	if (orig_truesize) {
		int ts_delta = head->truesize - orig_truesize;

		/* if this reassembled skb is fragmented later,
		 * fraglist skbs will get skb->sk assigned from head->sk,
		 * and each frag skb will be released via sock_wfree.
		 *
		 * Update sk_wmem_alloc.
		 */
		head->sk = sk;
		head->destructor = destructor;
		refcount_add(ts_delta, &sk->sk_wmem_alloc);
	}

	return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
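
/* Usage sketch (surrounding variable names are illustrative): the two
 * reassembly helpers are meant to be used back to back once all
 * fragments of the datagram have arrived.
 *
 *	void *reasm_data = inet_frag_reasm_prepare(q, skb, prev_tail);
 *
 *	if (!reasm_data)
 *		goto out_fail;		// memory pressure; keep or expire queue
 *	// protocol-specific header fixups on skb go here
 *	inet_frag_reasm_finish(q, skb, reasm_data, true);
 */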

void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
			    void *reasm_data, bool try_coalesce)
{
	struct sock *sk = is_skb_wmem(head) ? head->sk : NULL;
	const unsigned int head_truesize = head->truesize;
	struct sk_buff **nextp = reasm_data;
	struct rb_node *rbn;
	struct sk_buff *fp;
	int sum_truesize;

	skb_push(head, head->data - skb_network_header(head));

	/* Traverse the tree in order, to build frag_list. */
	fp = FRAG_CB(head)->next_frag;
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &q->rb_fragments);

	sum_truesize = head->truesize;
	while (rbn || fp) {
		/* fp points to the next sk_buff in the current run;
		 * rbn points to the next run.
		 */
		/* Go through the current run. */
		while (fp) {
			struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
			bool stolen;
			int delta;

			sum_truesize += fp->truesize;
			if (head->ip_summed != fp->ip_summed)
				head->ip_summed = CHECKSUM_NONE;
			else if (head->ip_summed == CHECKSUM_COMPLETE)
				head->csum = csum_add(head->csum, fp->csum);

			if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
							     &delta)) {
				kfree_skb_partial(fp, stolen);
			} else {
				fp->prev = NULL;
				memset(&fp->rbnode, 0, sizeof(fp->rbnode));
				fp->sk = NULL;

				head->data_len += fp->len;
				head->len += fp->len;
				head->truesize += fp->truesize;

				*nextp = fp;
				nextp = &fp->next;
			}

			fp = next_frag;
		}
		/* Move to the next run. */
		if (rbn) {
			struct rb_node *rbnext = rb_next(rbn);

			fp = rb_to_skb(rbn);
			rb_erase(rbn, &q->rb_fragments);
			rbn = rbnext;
		}
	}
	sub_frag_mem_limit(q->fqdir, sum_truesize);

	*nextp = NULL;
	skb_mark_not_on_list(head);
	head->prev = NULL;
	head->tstamp = q->stamp;
	head->mono_delivery_time = q->mono_delivery_time;

	if (sk)
		refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(inet_frag_reasm_finish);

struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
	struct sk_buff *head, *skb;

	head = skb_rb_first(&q->rb_fragments);
	if (!head)
		return NULL;
	skb = FRAG_CB(head)->next_frag;
	if (skb)
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
	else
		rb_erase(&head->rbnode, &q->rb_fragments);
	memset(&head->rbnode, 0, sizeof(head->rbnode));
	barrier();

	if (head == q->fragments_tail)
		q->fragments_tail = NULL;

	sub_frag_mem_limit(q->fqdir, head->truesize);

	return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);
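
/* Usage sketch (illustrative outline only): inet_frag_pull_head() is
 * typically called from a protocol's frag_expire timer to detach the
 * first fragment, so an ICMP "reassembly time exceeded" error can be
 * built from it while the rest of the queue is killed.
 *
 *	head = inet_frag_pull_head(q);
 *	if (!head)
 *		goto out;
 *	head->dev = dev;	// protocol restores the receiving device
 *	// send the ICMP error built from "head", then free it
 *	kfree_skb(head);
 */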