1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * inet fragments management 4 * 5 * Authors: Pavel Emelyanov <xemul@openvz.org> 6 * Started as consolidation of ipv4/ip_fragment.c, 7 * ipv6/reassembly. and ipv6 nf conntrack reassembly 8 */ 9 10 #include <linux/list.h> 11 #include <linux/spinlock.h> 12 #include <linux/module.h> 13 #include <linux/timer.h> 14 #include <linux/mm.h> 15 #include <linux/random.h> 16 #include <linux/skbuff.h> 17 #include <linux/rtnetlink.h> 18 #include <linux/slab.h> 19 #include <linux/rhashtable.h> 20 21 #include <net/sock.h> 22 #include <net/inet_frag.h> 23 #include <net/inet_ecn.h> 24 #include <net/ip.h> 25 #include <net/ipv6.h> 26 27 /* Use skb->cb to track consecutive/adjacent fragments coming at 28 * the end of the queue. Nodes in the rb-tree queue will 29 * contain "runs" of one or more adjacent fragments. 30 * 31 * Invariants: 32 * - next_frag is NULL at the tail of a "run"; 33 * - the head of a "run" has the sum of all fragment lengths in frag_run_len. 34 */ 35 struct ipfrag_skb_cb { 36 union { 37 struct inet_skb_parm h4; 38 struct inet6_skb_parm h6; 39 }; 40 struct sk_buff *next_frag; 41 int frag_run_len; 42 }; 43 44 #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) 45 46 static void fragcb_clear(struct sk_buff *skb) 47 { 48 RB_CLEAR_NODE(&skb->rbnode); 49 FRAG_CB(skb)->next_frag = NULL; 50 FRAG_CB(skb)->frag_run_len = skb->len; 51 } 52 53 /* Append skb to the last "run". */ 54 static void fragrun_append_to_last(struct inet_frag_queue *q, 55 struct sk_buff *skb) 56 { 57 fragcb_clear(skb); 58 59 FRAG_CB(q->last_run_head)->frag_run_len += skb->len; 60 FRAG_CB(q->fragments_tail)->next_frag = skb; 61 q->fragments_tail = skb; 62 } 63 64 /* Create a new "run" with the skb. */ 65 static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) 66 { 67 BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); 68 fragcb_clear(skb); 69 70 if (q->last_run_head) 71 rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, 72 &q->last_run_head->rbnode.rb_right); 73 else 74 rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); 75 rb_insert_color(&skb->rbnode, &q->rb_fragments); 76 77 q->fragments_tail = skb; 78 q->last_run_head = skb; 79 } 80 81 /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements 82 * Value : 0xff if frame should be dropped. 83 * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field 84 */ 85 const u8 ip_frag_ecn_table[16] = { 86 /* at least one fragment had CE, and others ECT_0 or ECT_1 */ 87 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, 88 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, 89 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, 90 91 /* invalid combinations : drop frame */ 92 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, 93 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, 94 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, 95 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, 96 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, 97 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, 98 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, 99 }; 100 EXPORT_SYMBOL(ip_frag_ecn_table); 101 102 int inet_frags_init(struct inet_frags *f) 103 { 104 f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, 105 NULL); 106 if (!f->frags_cachep) 107 return -ENOMEM; 108 109 refcount_set(&f->refcnt, 1); 110 init_completion(&f->completion); 111 return 0; 112 } 113 EXPORT_SYMBOL(inet_frags_init); 114 115 void inet_frags_fini(struct inet_frags *f) 116 { 117 if (refcount_dec_and_test(&f->refcnt)) 118 complete(&f->completion); 119 120 wait_for_completion(&f->completion); 121 122 kmem_cache_destroy(f->frags_cachep); 123 f->frags_cachep = NULL; 124 } 125 EXPORT_SYMBOL(inet_frags_fini); 126 127 /* called from rhashtable_free_and_destroy() at netns_frags dismantle */ 128 static void inet_frags_free_cb(void *ptr, void *arg) 129 { 130 struct inet_frag_queue *fq = ptr; 131 int count; 132 133 count = del_timer_sync(&fq->timer) ? 1 : 0; 134 135 spin_lock_bh(&fq->lock); 136 fq->flags |= INET_FRAG_DROP; 137 if (!(fq->flags & INET_FRAG_COMPLETE)) { 138 fq->flags |= INET_FRAG_COMPLETE; 139 count++; 140 } else if (fq->flags & INET_FRAG_HASH_DEAD) { 141 count++; 142 } 143 spin_unlock_bh(&fq->lock); 144 145 if (refcount_sub_and_test(count, &fq->refcnt)) 146 inet_frag_destroy(fq); 147 } 148 149 static LLIST_HEAD(fqdir_free_list); 150 151 static void fqdir_free_fn(struct work_struct *work) 152 { 153 struct llist_node *kill_list; 154 struct fqdir *fqdir, *tmp; 155 struct inet_frags *f; 156 157 /* Atomically snapshot the list of fqdirs to free */ 158 kill_list = llist_del_all(&fqdir_free_list); 159 160 /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) 161 * have completed, since they need to dereference fqdir. 162 * Would it not be nice to have kfree_rcu_barrier() ? :) 163 */ 164 rcu_barrier(); 165 166 llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) { 167 f = fqdir->f; 168 if (refcount_dec_and_test(&f->refcnt)) 169 complete(&f->completion); 170 171 kfree(fqdir); 172 } 173 } 174 175 static DECLARE_WORK(fqdir_free_work, fqdir_free_fn); 176 177 static void fqdir_work_fn(struct work_struct *work) 178 { 179 struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); 180 181 rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); 182 183 if (llist_add(&fqdir->free_list, &fqdir_free_list)) 184 queue_work(system_wq, &fqdir_free_work); 185 } 186 187 int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) 188 { 189 struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); 190 int res; 191 192 if (!fqdir) 193 return -ENOMEM; 194 fqdir->f = f; 195 fqdir->net = net; 196 res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); 197 if (res < 0) { 198 kfree(fqdir); 199 return res; 200 } 201 refcount_inc(&f->refcnt); 202 *fqdirp = fqdir; 203 return 0; 204 } 205 EXPORT_SYMBOL(fqdir_init); 206 207 static struct workqueue_struct *inet_frag_wq; 208 209 static int __init inet_frag_wq_init(void) 210 { 211 inet_frag_wq = create_workqueue("inet_frag_wq"); 212 if (!inet_frag_wq) 213 panic("Could not create inet frag workq"); 214 return 0; 215 } 216 217 pure_initcall(inet_frag_wq_init); 218 219 void fqdir_exit(struct fqdir *fqdir) 220 { 221 INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); 222 queue_work(inet_frag_wq, &fqdir->destroy_work); 223 } 224 EXPORT_SYMBOL(fqdir_exit); 225 226 void inet_frag_kill(struct inet_frag_queue *fq) 227 { 228 if (del_timer(&fq->timer)) 229 refcount_dec(&fq->refcnt); 230 231 if (!(fq->flags & INET_FRAG_COMPLETE)) { 232 struct fqdir *fqdir = fq->fqdir; 233 234 fq->flags |= INET_FRAG_COMPLETE; 235 rcu_read_lock(); 236 /* The RCU read lock provides a memory barrier 237 * guaranteeing that if fqdir->dead is false then 238 * the hash table destruction will not start until 239 * after we unlock. Paired with fqdir_pre_exit(). 240 */ 241 if (!READ_ONCE(fqdir->dead)) { 242 rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, 243 fqdir->f->rhash_params); 244 refcount_dec(&fq->refcnt); 245 } else { 246 fq->flags |= INET_FRAG_HASH_DEAD; 247 } 248 rcu_read_unlock(); 249 } 250 } 251 EXPORT_SYMBOL(inet_frag_kill); 252 253 static void inet_frag_destroy_rcu(struct rcu_head *head) 254 { 255 struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, 256 rcu); 257 struct inet_frags *f = q->fqdir->f; 258 259 if (f->destructor) 260 f->destructor(q); 261 kmem_cache_free(f->frags_cachep, q); 262 } 263 264 unsigned int inet_frag_rbtree_purge(struct rb_root *root, 265 enum skb_drop_reason reason) 266 { 267 struct rb_node *p = rb_first(root); 268 unsigned int sum = 0; 269 270 while (p) { 271 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 272 273 p = rb_next(p); 274 rb_erase(&skb->rbnode, root); 275 while (skb) { 276 struct sk_buff *next = FRAG_CB(skb)->next_frag; 277 278 sum += skb->truesize; 279 kfree_skb_reason(skb, reason); 280 skb = next; 281 } 282 } 283 return sum; 284 } 285 EXPORT_SYMBOL(inet_frag_rbtree_purge); 286 287 void inet_frag_destroy(struct inet_frag_queue *q) 288 { 289 unsigned int sum, sum_truesize = 0; 290 enum skb_drop_reason reason; 291 struct inet_frags *f; 292 struct fqdir *fqdir; 293 294 WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); 295 reason = (q->flags & INET_FRAG_DROP) ? 296 SKB_DROP_REASON_FRAG_REASM_TIMEOUT : 297 SKB_CONSUMED; 298 WARN_ON(del_timer(&q->timer) != 0); 299 300 /* Release all fragment data. */ 301 fqdir = q->fqdir; 302 f = fqdir->f; 303 sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason); 304 sum = sum_truesize + f->qsize; 305 306 call_rcu(&q->rcu, inet_frag_destroy_rcu); 307 308 sub_frag_mem_limit(fqdir, sum); 309 } 310 EXPORT_SYMBOL(inet_frag_destroy); 311 312 static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, 313 struct inet_frags *f, 314 void *arg) 315 { 316 struct inet_frag_queue *q; 317 318 q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); 319 if (!q) 320 return NULL; 321 322 q->fqdir = fqdir; 323 f->constructor(q, arg); 324 add_frag_mem_limit(fqdir, f->qsize); 325 326 timer_setup(&q->timer, f->frag_expire, 0); 327 spin_lock_init(&q->lock); 328 refcount_set(&q->refcnt, 3); 329 330 return q; 331 } 332 333 static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, 334 void *arg, 335 struct inet_frag_queue **prev) 336 { 337 struct inet_frags *f = fqdir->f; 338 struct inet_frag_queue *q; 339 340 q = inet_frag_alloc(fqdir, f, arg); 341 if (!q) { 342 *prev = ERR_PTR(-ENOMEM); 343 return NULL; 344 } 345 mod_timer(&q->timer, jiffies + fqdir->timeout); 346 347 *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, 348 &q->node, f->rhash_params); 349 if (*prev) { 350 q->flags |= INET_FRAG_COMPLETE; 351 inet_frag_kill(q); 352 inet_frag_destroy(q); 353 return NULL; 354 } 355 return q; 356 } 357 358 /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ 359 struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) 360 { 361 /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ 362 long high_thresh = READ_ONCE(fqdir->high_thresh); 363 struct inet_frag_queue *fq = NULL, *prev; 364 365 if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) 366 return NULL; 367 368 rcu_read_lock(); 369 370 prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); 371 if (!prev) 372 fq = inet_frag_create(fqdir, key, &prev); 373 if (!IS_ERR_OR_NULL(prev)) { 374 fq = prev; 375 if (!refcount_inc_not_zero(&fq->refcnt)) 376 fq = NULL; 377 } 378 rcu_read_unlock(); 379 return fq; 380 } 381 EXPORT_SYMBOL(inet_frag_find); 382 383 int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, 384 int offset, int end) 385 { 386 struct sk_buff *last = q->fragments_tail; 387 388 /* RFC5722, Section 4, amended by Errata ID : 3089 389 * When reassembling an IPv6 datagram, if 390 * one or more its constituent fragments is determined to be an 391 * overlapping fragment, the entire datagram (and any constituent 392 * fragments) MUST be silently discarded. 393 * 394 * Duplicates, however, should be ignored (i.e. skb dropped, but the 395 * queue/fragments kept for later reassembly). 396 */ 397 if (!last) 398 fragrun_create(q, skb); /* First fragment. */ 399 else if (last->ip_defrag_offset + last->len < end) { 400 /* This is the common case: skb goes to the end. */ 401 /* Detect and discard overlaps. */ 402 if (offset < last->ip_defrag_offset + last->len) 403 return IPFRAG_OVERLAP; 404 if (offset == last->ip_defrag_offset + last->len) 405 fragrun_append_to_last(q, skb); 406 else 407 fragrun_create(q, skb); 408 } else { 409 /* Binary search. Note that skb can become the first fragment, 410 * but not the last (covered above). 411 */ 412 struct rb_node **rbn, *parent; 413 414 rbn = &q->rb_fragments.rb_node; 415 do { 416 struct sk_buff *curr; 417 int curr_run_end; 418 419 parent = *rbn; 420 curr = rb_to_skb(parent); 421 curr_run_end = curr->ip_defrag_offset + 422 FRAG_CB(curr)->frag_run_len; 423 if (end <= curr->ip_defrag_offset) 424 rbn = &parent->rb_left; 425 else if (offset >= curr_run_end) 426 rbn = &parent->rb_right; 427 else if (offset >= curr->ip_defrag_offset && 428 end <= curr_run_end) 429 return IPFRAG_DUP; 430 else 431 return IPFRAG_OVERLAP; 432 } while (*rbn); 433 /* Here we have parent properly set, and rbn pointing to 434 * one of its NULL left/right children. Insert skb. 435 */ 436 fragcb_clear(skb); 437 rb_link_node(&skb->rbnode, parent, rbn); 438 rb_insert_color(&skb->rbnode, &q->rb_fragments); 439 } 440 441 skb->ip_defrag_offset = offset; 442 443 return IPFRAG_OK; 444 } 445 EXPORT_SYMBOL(inet_frag_queue_insert); 446 447 void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, 448 struct sk_buff *parent) 449 { 450 struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); 451 struct sk_buff **nextp; 452 int delta; 453 454 if (head != skb) { 455 fp = skb_clone(skb, GFP_ATOMIC); 456 if (!fp) 457 return NULL; 458 FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; 459 if (RB_EMPTY_NODE(&skb->rbnode)) 460 FRAG_CB(parent)->next_frag = fp; 461 else 462 rb_replace_node(&skb->rbnode, &fp->rbnode, 463 &q->rb_fragments); 464 if (q->fragments_tail == skb) 465 q->fragments_tail = fp; 466 skb_morph(skb, head); 467 FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; 468 rb_replace_node(&head->rbnode, &skb->rbnode, 469 &q->rb_fragments); 470 consume_skb(head); 471 head = skb; 472 } 473 WARN_ON(head->ip_defrag_offset != 0); 474 475 delta = -head->truesize; 476 477 /* Head of list must not be cloned. */ 478 if (skb_unclone(head, GFP_ATOMIC)) 479 return NULL; 480 481 delta += head->truesize; 482 if (delta) 483 add_frag_mem_limit(q->fqdir, delta); 484 485 /* If the first fragment is fragmented itself, we split 486 * it to two chunks: the first with data and paged part 487 * and the second, holding only fragments. 488 */ 489 if (skb_has_frag_list(head)) { 490 struct sk_buff *clone; 491 int i, plen = 0; 492 493 clone = alloc_skb(0, GFP_ATOMIC); 494 if (!clone) 495 return NULL; 496 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; 497 skb_frag_list_init(head); 498 for (i = 0; i < skb_shinfo(head)->nr_frags; i++) 499 plen += skb_frag_size(&skb_shinfo(head)->frags[i]); 500 clone->data_len = head->data_len - plen; 501 clone->len = clone->data_len; 502 head->truesize += clone->truesize; 503 clone->csum = 0; 504 clone->ip_summed = head->ip_summed; 505 add_frag_mem_limit(q->fqdir, clone->truesize); 506 skb_shinfo(head)->frag_list = clone; 507 nextp = &clone->next; 508 } else { 509 nextp = &skb_shinfo(head)->frag_list; 510 } 511 512 return nextp; 513 } 514 EXPORT_SYMBOL(inet_frag_reasm_prepare); 515 516 void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, 517 void *reasm_data, bool try_coalesce) 518 { 519 struct sk_buff **nextp = reasm_data; 520 struct rb_node *rbn; 521 struct sk_buff *fp; 522 int sum_truesize; 523 524 skb_push(head, head->data - skb_network_header(head)); 525 526 /* Traverse the tree in order, to build frag_list. */ 527 fp = FRAG_CB(head)->next_frag; 528 rbn = rb_next(&head->rbnode); 529 rb_erase(&head->rbnode, &q->rb_fragments); 530 531 sum_truesize = head->truesize; 532 while (rbn || fp) { 533 /* fp points to the next sk_buff in the current run; 534 * rbn points to the next run. 535 */ 536 /* Go through the current run. */ 537 while (fp) { 538 struct sk_buff *next_frag = FRAG_CB(fp)->next_frag; 539 bool stolen; 540 int delta; 541 542 sum_truesize += fp->truesize; 543 if (head->ip_summed != fp->ip_summed) 544 head->ip_summed = CHECKSUM_NONE; 545 else if (head->ip_summed == CHECKSUM_COMPLETE) 546 head->csum = csum_add(head->csum, fp->csum); 547 548 if (try_coalesce && skb_try_coalesce(head, fp, &stolen, 549 &delta)) { 550 kfree_skb_partial(fp, stolen); 551 } else { 552 fp->prev = NULL; 553 memset(&fp->rbnode, 0, sizeof(fp->rbnode)); 554 fp->sk = NULL; 555 556 head->data_len += fp->len; 557 head->len += fp->len; 558 head->truesize += fp->truesize; 559 560 *nextp = fp; 561 nextp = &fp->next; 562 } 563 564 fp = next_frag; 565 } 566 /* Move to the next run. */ 567 if (rbn) { 568 struct rb_node *rbnext = rb_next(rbn); 569 570 fp = rb_to_skb(rbn); 571 rb_erase(rbn, &q->rb_fragments); 572 rbn = rbnext; 573 } 574 } 575 sub_frag_mem_limit(q->fqdir, sum_truesize); 576 577 *nextp = NULL; 578 skb_mark_not_on_list(head); 579 head->prev = NULL; 580 head->tstamp = q->stamp; 581 head->mono_delivery_time = q->mono_delivery_time; 582 } 583 EXPORT_SYMBOL(inet_frag_reasm_finish); 584 585 struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) 586 { 587 struct sk_buff *head, *skb; 588 589 head = skb_rb_first(&q->rb_fragments); 590 if (!head) 591 return NULL; 592 skb = FRAG_CB(head)->next_frag; 593 if (skb) 594 rb_replace_node(&head->rbnode, &skb->rbnode, 595 &q->rb_fragments); 596 else 597 rb_erase(&head->rbnode, &q->rb_fragments); 598 memset(&head->rbnode, 0, sizeof(head->rbnode)); 599 barrier(); 600 601 if (head == q->fragments_tail) 602 q->fragments_tail = NULL; 603 604 sub_frag_mem_limit(q->fqdir, head->truesize); 605 606 return head; 607 } 608 EXPORT_SYMBOL(inet_frag_pull_head); 609