1 /* 2 * inet fragments management 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Authors: Pavel Emelyanov <xemul@openvz.org> 10 * Started as consolidation of ipv4/ip_fragment.c, 11 * ipv6/reassembly. and ipv6 nf conntrack reassembly 12 */ 13 14 #include <linux/list.h> 15 #include <linux/spinlock.h> 16 #include <linux/module.h> 17 #include <linux/timer.h> 18 #include <linux/mm.h> 19 #include <linux/random.h> 20 #include <linux/skbuff.h> 21 #include <linux/rtnetlink.h> 22 #include <linux/slab.h> 23 #include <linux/rhashtable.h> 24 25 #include <net/sock.h> 26 #include <net/inet_frag.h> 27 #include <net/inet_ecn.h> 28 #include <net/ip.h> 29 #include <net/ipv6.h> 30 31 /* Use skb->cb to track consecutive/adjacent fragments coming at 32 * the end of the queue. Nodes in the rb-tree queue will 33 * contain "runs" of one or more adjacent fragments. 34 * 35 * Invariants: 36 * - next_frag is NULL at the tail of a "run"; 37 * - the head of a "run" has the sum of all fragment lengths in frag_run_len. 38 */ 39 struct ipfrag_skb_cb { 40 union { 41 struct inet_skb_parm h4; 42 struct inet6_skb_parm h6; 43 }; 44 struct sk_buff *next_frag; 45 int frag_run_len; 46 }; 47 48 #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) 49 50 static void fragcb_clear(struct sk_buff *skb) 51 { 52 RB_CLEAR_NODE(&skb->rbnode); 53 FRAG_CB(skb)->next_frag = NULL; 54 FRAG_CB(skb)->frag_run_len = skb->len; 55 } 56 57 /* Append skb to the last "run". */ 58 static void fragrun_append_to_last(struct inet_frag_queue *q, 59 struct sk_buff *skb) 60 { 61 fragcb_clear(skb); 62 63 FRAG_CB(q->last_run_head)->frag_run_len += skb->len; 64 FRAG_CB(q->fragments_tail)->next_frag = skb; 65 q->fragments_tail = skb; 66 } 67 68 /* Create a new "run" with the skb. */ 69 static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) 70 { 71 BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); 72 fragcb_clear(skb); 73 74 if (q->last_run_head) 75 rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, 76 &q->last_run_head->rbnode.rb_right); 77 else 78 rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); 79 rb_insert_color(&skb->rbnode, &q->rb_fragments); 80 81 q->fragments_tail = skb; 82 q->last_run_head = skb; 83 } 84 85 /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements 86 * Value : 0xff if frame should be dropped. 87 * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field 88 */ 89 const u8 ip_frag_ecn_table[16] = { 90 /* at least one fragment had CE, and others ECT_0 or ECT_1 */ 91 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, 92 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, 93 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, 94 95 /* invalid combinations : drop frame */ 96 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, 97 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, 98 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, 99 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, 100 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, 101 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, 102 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, 103 }; 104 EXPORT_SYMBOL(ip_frag_ecn_table); 105 106 int inet_frags_init(struct inet_frags *f) 107 { 108 f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, 109 NULL); 110 if (!f->frags_cachep) 111 return -ENOMEM; 112 113 return 0; 114 } 115 EXPORT_SYMBOL(inet_frags_init); 116 117 void inet_frags_fini(struct inet_frags *f) 118 { 119 /* We must wait that all inet_frag_destroy_rcu() have completed. */ 120 rcu_barrier(); 121 122 kmem_cache_destroy(f->frags_cachep); 123 f->frags_cachep = NULL; 124 } 125 EXPORT_SYMBOL(inet_frags_fini); 126 127 static void inet_frags_free_cb(void *ptr, void *arg) 128 { 129 struct inet_frag_queue *fq = ptr; 130 131 /* If we can not cancel the timer, it means this frag_queue 132 * is already disappearing, we have nothing to do. 133 * Otherwise, we own a refcount until the end of this function. 134 */ 135 if (!del_timer(&fq->timer)) 136 return; 137 138 spin_lock_bh(&fq->lock); 139 if (!(fq->flags & INET_FRAG_COMPLETE)) { 140 fq->flags |= INET_FRAG_COMPLETE; 141 refcount_dec(&fq->refcnt); 142 } 143 spin_unlock_bh(&fq->lock); 144 145 inet_frag_put(fq); 146 } 147 148 void inet_frags_exit_net(struct netns_frags *nf) 149 { 150 nf->high_thresh = 0; /* prevent creation of new frags */ 151 152 rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); 153 } 154 EXPORT_SYMBOL(inet_frags_exit_net); 155 156 void inet_frag_kill(struct inet_frag_queue *fq) 157 { 158 if (del_timer(&fq->timer)) 159 refcount_dec(&fq->refcnt); 160 161 if (!(fq->flags & INET_FRAG_COMPLETE)) { 162 struct netns_frags *nf = fq->net; 163 164 fq->flags |= INET_FRAG_COMPLETE; 165 rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); 166 refcount_dec(&fq->refcnt); 167 } 168 } 169 EXPORT_SYMBOL(inet_frag_kill); 170 171 static void inet_frag_destroy_rcu(struct rcu_head *head) 172 { 173 struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, 174 rcu); 175 struct inet_frags *f = q->net->f; 176 177 if (f->destructor) 178 f->destructor(q); 179 kmem_cache_free(f->frags_cachep, q); 180 } 181 182 unsigned int inet_frag_rbtree_purge(struct rb_root *root) 183 { 184 struct rb_node *p = rb_first(root); 185 unsigned int sum = 0; 186 187 while (p) { 188 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 189 190 p = rb_next(p); 191 rb_erase(&skb->rbnode, root); 192 while (skb) { 193 struct sk_buff *next = FRAG_CB(skb)->next_frag; 194 195 sum += skb->truesize; 196 kfree_skb(skb); 197 skb = next; 198 } 199 } 200 return sum; 201 } 202 EXPORT_SYMBOL(inet_frag_rbtree_purge); 203 204 void inet_frag_destroy(struct inet_frag_queue *q) 205 { 206 struct sk_buff *fp; 207 struct netns_frags *nf; 208 unsigned int sum, sum_truesize = 0; 209 struct inet_frags *f; 210 211 WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); 212 WARN_ON(del_timer(&q->timer) != 0); 213 214 /* Release all fragment data. */ 215 fp = q->fragments; 216 nf = q->net; 217 f = nf->f; 218 if (fp) { 219 do { 220 struct sk_buff *xp = fp->next; 221 222 sum_truesize += fp->truesize; 223 kfree_skb(fp); 224 fp = xp; 225 } while (fp); 226 } else { 227 sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); 228 } 229 sum = sum_truesize + f->qsize; 230 231 call_rcu(&q->rcu, inet_frag_destroy_rcu); 232 233 sub_frag_mem_limit(nf, sum); 234 } 235 EXPORT_SYMBOL(inet_frag_destroy); 236 237 static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, 238 struct inet_frags *f, 239 void *arg) 240 { 241 struct inet_frag_queue *q; 242 243 q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); 244 if (!q) 245 return NULL; 246 247 q->net = nf; 248 f->constructor(q, arg); 249 add_frag_mem_limit(nf, f->qsize); 250 251 timer_setup(&q->timer, f->frag_expire, 0); 252 spin_lock_init(&q->lock); 253 refcount_set(&q->refcnt, 3); 254 255 return q; 256 } 257 258 static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, 259 void *arg, 260 struct inet_frag_queue **prev) 261 { 262 struct inet_frags *f = nf->f; 263 struct inet_frag_queue *q; 264 265 q = inet_frag_alloc(nf, f, arg); 266 if (!q) { 267 *prev = ERR_PTR(-ENOMEM); 268 return NULL; 269 } 270 mod_timer(&q->timer, jiffies + nf->timeout); 271 272 *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key, 273 &q->node, f->rhash_params); 274 if (*prev) { 275 q->flags |= INET_FRAG_COMPLETE; 276 inet_frag_kill(q); 277 inet_frag_destroy(q); 278 return NULL; 279 } 280 return q; 281 } 282 283 /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ 284 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) 285 { 286 struct inet_frag_queue *fq = NULL, *prev; 287 288 if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) 289 return NULL; 290 291 rcu_read_lock(); 292 293 prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); 294 if (!prev) 295 fq = inet_frag_create(nf, key, &prev); 296 if (prev && !IS_ERR(prev)) { 297 fq = prev; 298 if (!refcount_inc_not_zero(&fq->refcnt)) 299 fq = NULL; 300 } 301 rcu_read_unlock(); 302 return fq; 303 } 304 EXPORT_SYMBOL(inet_frag_find); 305 306 int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, 307 int offset, int end) 308 { 309 struct sk_buff *last = q->fragments_tail; 310 311 /* RFC5722, Section 4, amended by Errata ID : 3089 312 * When reassembling an IPv6 datagram, if 313 * one or more its constituent fragments is determined to be an 314 * overlapping fragment, the entire datagram (and any constituent 315 * fragments) MUST be silently discarded. 316 * 317 * Duplicates, however, should be ignored (i.e. skb dropped, but the 318 * queue/fragments kept for later reassembly). 319 */ 320 if (!last) 321 fragrun_create(q, skb); /* First fragment. */ 322 else if (last->ip_defrag_offset + last->len < end) { 323 /* This is the common case: skb goes to the end. */ 324 /* Detect and discard overlaps. */ 325 if (offset < last->ip_defrag_offset + last->len) 326 return IPFRAG_OVERLAP; 327 if (offset == last->ip_defrag_offset + last->len) 328 fragrun_append_to_last(q, skb); 329 else 330 fragrun_create(q, skb); 331 } else { 332 /* Binary search. Note that skb can become the first fragment, 333 * but not the last (covered above). 334 */ 335 struct rb_node **rbn, *parent; 336 337 rbn = &q->rb_fragments.rb_node; 338 do { 339 struct sk_buff *curr; 340 int curr_run_end; 341 342 parent = *rbn; 343 curr = rb_to_skb(parent); 344 curr_run_end = curr->ip_defrag_offset + 345 FRAG_CB(curr)->frag_run_len; 346 if (end <= curr->ip_defrag_offset) 347 rbn = &parent->rb_left; 348 else if (offset >= curr_run_end) 349 rbn = &parent->rb_right; 350 else if (offset >= curr->ip_defrag_offset && 351 end <= curr_run_end) 352 return IPFRAG_DUP; 353 else 354 return IPFRAG_OVERLAP; 355 } while (*rbn); 356 /* Here we have parent properly set, and rbn pointing to 357 * one of its NULL left/right children. Insert skb. 358 */ 359 fragcb_clear(skb); 360 rb_link_node(&skb->rbnode, parent, rbn); 361 rb_insert_color(&skb->rbnode, &q->rb_fragments); 362 } 363 364 skb->ip_defrag_offset = offset; 365 366 return IPFRAG_OK; 367 } 368 EXPORT_SYMBOL(inet_frag_queue_insert); 369 370 void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, 371 struct sk_buff *parent) 372 { 373 struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); 374 struct sk_buff **nextp; 375 int delta; 376 377 if (head != skb) { 378 fp = skb_clone(skb, GFP_ATOMIC); 379 if (!fp) 380 return NULL; 381 FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; 382 if (RB_EMPTY_NODE(&skb->rbnode)) 383 FRAG_CB(parent)->next_frag = fp; 384 else 385 rb_replace_node(&skb->rbnode, &fp->rbnode, 386 &q->rb_fragments); 387 if (q->fragments_tail == skb) 388 q->fragments_tail = fp; 389 skb_morph(skb, head); 390 FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; 391 rb_replace_node(&head->rbnode, &skb->rbnode, 392 &q->rb_fragments); 393 consume_skb(head); 394 head = skb; 395 } 396 WARN_ON(head->ip_defrag_offset != 0); 397 398 delta = -head->truesize; 399 400 /* Head of list must not be cloned. */ 401 if (skb_unclone(head, GFP_ATOMIC)) 402 return NULL; 403 404 delta += head->truesize; 405 if (delta) 406 add_frag_mem_limit(q->net, delta); 407 408 /* If the first fragment is fragmented itself, we split 409 * it to two chunks: the first with data and paged part 410 * and the second, holding only fragments. 411 */ 412 if (skb_has_frag_list(head)) { 413 struct sk_buff *clone; 414 int i, plen = 0; 415 416 clone = alloc_skb(0, GFP_ATOMIC); 417 if (!clone) 418 return NULL; 419 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; 420 skb_frag_list_init(head); 421 for (i = 0; i < skb_shinfo(head)->nr_frags; i++) 422 plen += skb_frag_size(&skb_shinfo(head)->frags[i]); 423 clone->data_len = head->data_len - plen; 424 clone->len = clone->data_len; 425 head->truesize += clone->truesize; 426 clone->csum = 0; 427 clone->ip_summed = head->ip_summed; 428 add_frag_mem_limit(q->net, clone->truesize); 429 skb_shinfo(head)->frag_list = clone; 430 nextp = &clone->next; 431 } else { 432 nextp = &skb_shinfo(head)->frag_list; 433 } 434 435 return nextp; 436 } 437 EXPORT_SYMBOL(inet_frag_reasm_prepare); 438 439 void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, 440 void *reasm_data) 441 { 442 struct sk_buff **nextp = (struct sk_buff **)reasm_data; 443 struct rb_node *rbn; 444 struct sk_buff *fp; 445 446 skb_push(head, head->data - skb_network_header(head)); 447 448 /* Traverse the tree in order, to build frag_list. */ 449 fp = FRAG_CB(head)->next_frag; 450 rbn = rb_next(&head->rbnode); 451 rb_erase(&head->rbnode, &q->rb_fragments); 452 while (rbn || fp) { 453 /* fp points to the next sk_buff in the current run; 454 * rbn points to the next run. 455 */ 456 /* Go through the current run. */ 457 while (fp) { 458 *nextp = fp; 459 nextp = &fp->next; 460 fp->prev = NULL; 461 memset(&fp->rbnode, 0, sizeof(fp->rbnode)); 462 fp->sk = NULL; 463 head->data_len += fp->len; 464 head->len += fp->len; 465 if (head->ip_summed != fp->ip_summed) 466 head->ip_summed = CHECKSUM_NONE; 467 else if (head->ip_summed == CHECKSUM_COMPLETE) 468 head->csum = csum_add(head->csum, fp->csum); 469 head->truesize += fp->truesize; 470 fp = FRAG_CB(fp)->next_frag; 471 } 472 /* Move to the next run. */ 473 if (rbn) { 474 struct rb_node *rbnext = rb_next(rbn); 475 476 fp = rb_to_skb(rbn); 477 rb_erase(rbn, &q->rb_fragments); 478 rbn = rbnext; 479 } 480 } 481 sub_frag_mem_limit(q->net, head->truesize); 482 483 *nextp = NULL; 484 skb_mark_not_on_list(head); 485 head->prev = NULL; 486 head->tstamp = q->stamp; 487 } 488 EXPORT_SYMBOL(inet_frag_reasm_finish); 489 490 struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) 491 { 492 struct sk_buff *head; 493 494 if (q->fragments) { 495 head = q->fragments; 496 q->fragments = head->next; 497 } else { 498 struct sk_buff *skb; 499 500 head = skb_rb_first(&q->rb_fragments); 501 if (!head) 502 return NULL; 503 skb = FRAG_CB(head)->next_frag; 504 if (skb) 505 rb_replace_node(&head->rbnode, &skb->rbnode, 506 &q->rb_fragments); 507 else 508 rb_erase(&head->rbnode, &q->rb_fragments); 509 memset(&head->rbnode, 0, sizeof(head->rbnode)); 510 barrier(); 511 } 512 if (head == q->fragments_tail) 513 q->fragments_tail = NULL; 514 515 sub_frag_mem_limit(q->net, head->truesize); 516 517 return head; 518 } 519 EXPORT_SYMBOL(inet_frag_pull_head); 520