1 /* 2 * net/sched/cls_flow.c Generic flow classifier 3 * 4 * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net> 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 2 9 * of the License, or (at your option) any later version. 10 */ 11 12 #include <linux/kernel.h> 13 #include <linux/init.h> 14 #include <linux/list.h> 15 #include <linux/jhash.h> 16 #include <linux/random.h> 17 #include <linux/pkt_cls.h> 18 #include <linux/skbuff.h> 19 #include <linux/in.h> 20 #include <linux/ip.h> 21 #include <linux/ipv6.h> 22 #include <linux/if_vlan.h> 23 #include <linux/slab.h> 24 #include <linux/module.h> 25 26 #include <net/pkt_cls.h> 27 #include <net/ip.h> 28 #include <net/route.h> 29 #include <net/flow_keys.h> 30 31 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 32 #include <net/netfilter/nf_conntrack.h> 33 #endif 34 35 struct flow_head { 36 struct list_head filters; 37 }; 38 39 struct flow_filter { 40 struct list_head list; 41 struct tcf_exts exts; 42 struct tcf_ematch_tree ematches; 43 struct timer_list perturb_timer; 44 u32 perturb_period; 45 u32 handle; 46 47 u32 nkeys; 48 u32 keymask; 49 u32 mode; 50 u32 mask; 51 u32 xor; 52 u32 rshift; 53 u32 addend; 54 u32 divisor; 55 u32 baseclass; 56 u32 hashrnd; 57 }; 58 59 static const struct tcf_ext_map flow_ext_map = { 60 .action = TCA_FLOW_ACT, 61 .police = TCA_FLOW_POLICE, 62 }; 63 64 static inline u32 addr_fold(void *addr) 65 { 66 unsigned long a = (unsigned long)addr; 67 68 return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0); 69 } 70 71 static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) 72 { 73 if (flow->src) 74 return ntohl(flow->src); 75 return addr_fold(skb->sk); 76 } 77 78 static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) 79 { 80 if (flow->dst) 81 return ntohl(flow->dst); 82 return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; 83 } 84 85 static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) 86 { 87 return flow->ip_proto; 88 } 89 90 static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) 91 { 92 if (flow->ports) 93 return ntohs(flow->port16[0]); 94 95 return addr_fold(skb->sk); 96 } 97 98 static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) 99 { 100 if (flow->ports) 101 return ntohs(flow->port16[1]); 102 103 return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; 104 } 105 106 static u32 flow_get_iif(const struct sk_buff *skb) 107 { 108 return skb->skb_iif; 109 } 110 111 static u32 flow_get_priority(const struct sk_buff *skb) 112 { 113 return skb->priority; 114 } 115 116 static u32 flow_get_mark(const struct sk_buff *skb) 117 { 118 return skb->mark; 119 } 120 121 static u32 flow_get_nfct(const struct sk_buff *skb) 122 { 123 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 124 return addr_fold(skb->nfct); 125 #else 126 return 0; 127 #endif 128 } 129 130 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 131 #define CTTUPLE(skb, member) \ 132 ({ \ 133 enum ip_conntrack_info ctinfo; \ 134 const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \ 135 if (ct == NULL) \ 136 goto fallback; \ 137 ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \ 138 }) 139 #else 140 #define CTTUPLE(skb, member) \ 141 ({ \ 142 goto fallback; \ 143 0; \ 144 }) 145 #endif 146 147 static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) 148 { 149 switch (skb->protocol) { 150 case htons(ETH_P_IP): 151 return ntohl(CTTUPLE(skb, src.u3.ip)); 152 case htons(ETH_P_IPV6): 153 return ntohl(CTTUPLE(skb, src.u3.ip6[3])); 154 } 155 fallback: 156 return flow_get_src(skb, flow); 157 } 158 159 static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) 160 { 161 switch (skb->protocol) { 162 case htons(ETH_P_IP): 163 return ntohl(CTTUPLE(skb, dst.u3.ip)); 164 case htons(ETH_P_IPV6): 165 return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); 166 } 167 fallback: 168 return flow_get_dst(skb, flow); 169 } 170 171 static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) 172 { 173 return ntohs(CTTUPLE(skb, src.u.all)); 174 fallback: 175 return flow_get_proto_src(skb, flow); 176 } 177 178 static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) 179 { 180 return ntohs(CTTUPLE(skb, dst.u.all)); 181 fallback: 182 return flow_get_proto_dst(skb, flow); 183 } 184 185 static u32 flow_get_rtclassid(const struct sk_buff *skb) 186 { 187 #ifdef CONFIG_IP_ROUTE_CLASSID 188 if (skb_dst(skb)) 189 return skb_dst(skb)->tclassid; 190 #endif 191 return 0; 192 } 193 194 static u32 flow_get_skuid(const struct sk_buff *skb) 195 { 196 if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { 197 kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid; 198 return from_kuid(&init_user_ns, skuid); 199 } 200 return 0; 201 } 202 203 static u32 flow_get_skgid(const struct sk_buff *skb) 204 { 205 if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { 206 kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid; 207 return from_kgid(&init_user_ns, skgid); 208 } 209 return 0; 210 } 211 212 static u32 flow_get_vlan_tag(const struct sk_buff *skb) 213 { 214 u16 uninitialized_var(tag); 215 216 if (vlan_get_tag(skb, &tag) < 0) 217 return 0; 218 return tag & VLAN_VID_MASK; 219 } 220 221 static u32 flow_get_rxhash(struct sk_buff *skb) 222 { 223 return skb_get_rxhash(skb); 224 } 225 226 static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) 227 { 228 switch (key) { 229 case FLOW_KEY_SRC: 230 return flow_get_src(skb, flow); 231 case FLOW_KEY_DST: 232 return flow_get_dst(skb, flow); 233 case FLOW_KEY_PROTO: 234 return flow_get_proto(skb, flow); 235 case FLOW_KEY_PROTO_SRC: 236 return flow_get_proto_src(skb, flow); 237 case FLOW_KEY_PROTO_DST: 238 return flow_get_proto_dst(skb, flow); 239 case FLOW_KEY_IIF: 240 return flow_get_iif(skb); 241 case FLOW_KEY_PRIORITY: 242 return flow_get_priority(skb); 243 case FLOW_KEY_MARK: 244 return flow_get_mark(skb); 245 case FLOW_KEY_NFCT: 246 return flow_get_nfct(skb); 247 case FLOW_KEY_NFCT_SRC: 248 return flow_get_nfct_src(skb, flow); 249 case FLOW_KEY_NFCT_DST: 250 return flow_get_nfct_dst(skb, flow); 251 case FLOW_KEY_NFCT_PROTO_SRC: 252 return flow_get_nfct_proto_src(skb, flow); 253 case FLOW_KEY_NFCT_PROTO_DST: 254 return flow_get_nfct_proto_dst(skb, flow); 255 case FLOW_KEY_RTCLASSID: 256 return flow_get_rtclassid(skb); 257 case FLOW_KEY_SKUID: 258 return flow_get_skuid(skb); 259 case FLOW_KEY_SKGID: 260 return flow_get_skgid(skb); 261 case FLOW_KEY_VLAN_TAG: 262 return flow_get_vlan_tag(skb); 263 case FLOW_KEY_RXHASH: 264 return flow_get_rxhash(skb); 265 default: 266 WARN_ON(1); 267 return 0; 268 } 269 } 270 271 #define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \ 272 (1 << FLOW_KEY_DST) | \ 273 (1 << FLOW_KEY_PROTO) | \ 274 (1 << FLOW_KEY_PROTO_SRC) | \ 275 (1 << FLOW_KEY_PROTO_DST) | \ 276 (1 << FLOW_KEY_NFCT_SRC) | \ 277 (1 << FLOW_KEY_NFCT_DST) | \ 278 (1 << FLOW_KEY_NFCT_PROTO_SRC) | \ 279 (1 << FLOW_KEY_NFCT_PROTO_DST)) 280 281 static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, 282 struct tcf_result *res) 283 { 284 struct flow_head *head = tp->root; 285 struct flow_filter *f; 286 u32 keymask; 287 u32 classid; 288 unsigned int n, key; 289 int r; 290 291 list_for_each_entry(f, &head->filters, list) { 292 u32 keys[FLOW_KEY_MAX + 1]; 293 struct flow_keys flow_keys; 294 295 if (!tcf_em_tree_match(skb, &f->ematches, NULL)) 296 continue; 297 298 keymask = f->keymask; 299 if (keymask & FLOW_KEYS_NEEDED) 300 skb_flow_dissect(skb, &flow_keys); 301 302 for (n = 0; n < f->nkeys; n++) { 303 key = ffs(keymask) - 1; 304 keymask &= ~(1 << key); 305 keys[n] = flow_key_get(skb, key, &flow_keys); 306 } 307 308 if (f->mode == FLOW_MODE_HASH) 309 classid = jhash2(keys, f->nkeys, f->hashrnd); 310 else { 311 classid = keys[0]; 312 classid = (classid & f->mask) ^ f->xor; 313 classid = (classid >> f->rshift) + f->addend; 314 } 315 316 if (f->divisor) 317 classid %= f->divisor; 318 319 res->class = 0; 320 res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid); 321 322 r = tcf_exts_exec(skb, &f->exts, res); 323 if (r < 0) 324 continue; 325 return r; 326 } 327 return -1; 328 } 329 330 static void flow_perturbation(unsigned long arg) 331 { 332 struct flow_filter *f = (struct flow_filter *)arg; 333 334 get_random_bytes(&f->hashrnd, 4); 335 if (f->perturb_period) 336 mod_timer(&f->perturb_timer, jiffies + f->perturb_period); 337 } 338 339 static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { 340 [TCA_FLOW_KEYS] = { .type = NLA_U32 }, 341 [TCA_FLOW_MODE] = { .type = NLA_U32 }, 342 [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, 343 [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, 344 [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, 345 [TCA_FLOW_MASK] = { .type = NLA_U32 }, 346 [TCA_FLOW_XOR] = { .type = NLA_U32 }, 347 [TCA_FLOW_DIVISOR] = { .type = NLA_U32 }, 348 [TCA_FLOW_ACT] = { .type = NLA_NESTED }, 349 [TCA_FLOW_POLICE] = { .type = NLA_NESTED }, 350 [TCA_FLOW_EMATCHES] = { .type = NLA_NESTED }, 351 [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, 352 }; 353 354 static int flow_change(struct sk_buff *in_skb, 355 struct tcf_proto *tp, unsigned long base, 356 u32 handle, struct nlattr **tca, 357 unsigned long *arg) 358 { 359 struct flow_head *head = tp->root; 360 struct flow_filter *f; 361 struct nlattr *opt = tca[TCA_OPTIONS]; 362 struct nlattr *tb[TCA_FLOW_MAX + 1]; 363 struct tcf_exts e; 364 struct tcf_ematch_tree t; 365 unsigned int nkeys = 0; 366 unsigned int perturb_period = 0; 367 u32 baseclass = 0; 368 u32 keymask = 0; 369 u32 mode; 370 int err; 371 372 if (opt == NULL) 373 return -EINVAL; 374 375 err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy); 376 if (err < 0) 377 return err; 378 379 if (tb[TCA_FLOW_BASECLASS]) { 380 baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]); 381 if (TC_H_MIN(baseclass) == 0) 382 return -EINVAL; 383 } 384 385 if (tb[TCA_FLOW_KEYS]) { 386 keymask = nla_get_u32(tb[TCA_FLOW_KEYS]); 387 388 nkeys = hweight32(keymask); 389 if (nkeys == 0) 390 return -EINVAL; 391 392 if (fls(keymask) - 1 > FLOW_KEY_MAX) 393 return -EOPNOTSUPP; 394 395 if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && 396 sk_user_ns(NETLINK_CB(in_skb).ssk) != &init_user_ns) 397 return -EOPNOTSUPP; 398 } 399 400 err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map); 401 if (err < 0) 402 return err; 403 404 err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t); 405 if (err < 0) 406 goto err1; 407 408 f = (struct flow_filter *)*arg; 409 if (f != NULL) { 410 err = -EINVAL; 411 if (f->handle != handle && handle) 412 goto err2; 413 414 mode = f->mode; 415 if (tb[TCA_FLOW_MODE]) 416 mode = nla_get_u32(tb[TCA_FLOW_MODE]); 417 if (mode != FLOW_MODE_HASH && nkeys > 1) 418 goto err2; 419 420 if (mode == FLOW_MODE_HASH) 421 perturb_period = f->perturb_period; 422 if (tb[TCA_FLOW_PERTURB]) { 423 if (mode != FLOW_MODE_HASH) 424 goto err2; 425 perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; 426 } 427 } else { 428 err = -EINVAL; 429 if (!handle) 430 goto err2; 431 if (!tb[TCA_FLOW_KEYS]) 432 goto err2; 433 434 mode = FLOW_MODE_MAP; 435 if (tb[TCA_FLOW_MODE]) 436 mode = nla_get_u32(tb[TCA_FLOW_MODE]); 437 if (mode != FLOW_MODE_HASH && nkeys > 1) 438 goto err2; 439 440 if (tb[TCA_FLOW_PERTURB]) { 441 if (mode != FLOW_MODE_HASH) 442 goto err2; 443 perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; 444 } 445 446 if (TC_H_MAJ(baseclass) == 0) 447 baseclass = TC_H_MAKE(tp->q->handle, baseclass); 448 if (TC_H_MIN(baseclass) == 0) 449 baseclass = TC_H_MAKE(baseclass, 1); 450 451 err = -ENOBUFS; 452 f = kzalloc(sizeof(*f), GFP_KERNEL); 453 if (f == NULL) 454 goto err2; 455 456 f->handle = handle; 457 f->mask = ~0U; 458 459 get_random_bytes(&f->hashrnd, 4); 460 f->perturb_timer.function = flow_perturbation; 461 f->perturb_timer.data = (unsigned long)f; 462 init_timer_deferrable(&f->perturb_timer); 463 } 464 465 tcf_exts_change(tp, &f->exts, &e); 466 tcf_em_tree_change(tp, &f->ematches, &t); 467 468 tcf_tree_lock(tp); 469 470 if (tb[TCA_FLOW_KEYS]) { 471 f->keymask = keymask; 472 f->nkeys = nkeys; 473 } 474 475 f->mode = mode; 476 477 if (tb[TCA_FLOW_MASK]) 478 f->mask = nla_get_u32(tb[TCA_FLOW_MASK]); 479 if (tb[TCA_FLOW_XOR]) 480 f->xor = nla_get_u32(tb[TCA_FLOW_XOR]); 481 if (tb[TCA_FLOW_RSHIFT]) 482 f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); 483 if (tb[TCA_FLOW_ADDEND]) 484 f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); 485 486 if (tb[TCA_FLOW_DIVISOR]) 487 f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); 488 if (baseclass) 489 f->baseclass = baseclass; 490 491 f->perturb_period = perturb_period; 492 del_timer(&f->perturb_timer); 493 if (perturb_period) 494 mod_timer(&f->perturb_timer, jiffies + perturb_period); 495 496 if (*arg == 0) 497 list_add_tail(&f->list, &head->filters); 498 499 tcf_tree_unlock(tp); 500 501 *arg = (unsigned long)f; 502 return 0; 503 504 err2: 505 tcf_em_tree_destroy(tp, &t); 506 err1: 507 tcf_exts_destroy(tp, &e); 508 return err; 509 } 510 511 static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f) 512 { 513 del_timer_sync(&f->perturb_timer); 514 tcf_exts_destroy(tp, &f->exts); 515 tcf_em_tree_destroy(tp, &f->ematches); 516 kfree(f); 517 } 518 519 static int flow_delete(struct tcf_proto *tp, unsigned long arg) 520 { 521 struct flow_filter *f = (struct flow_filter *)arg; 522 523 tcf_tree_lock(tp); 524 list_del(&f->list); 525 tcf_tree_unlock(tp); 526 flow_destroy_filter(tp, f); 527 return 0; 528 } 529 530 static int flow_init(struct tcf_proto *tp) 531 { 532 struct flow_head *head; 533 534 head = kzalloc(sizeof(*head), GFP_KERNEL); 535 if (head == NULL) 536 return -ENOBUFS; 537 INIT_LIST_HEAD(&head->filters); 538 tp->root = head; 539 return 0; 540 } 541 542 static void flow_destroy(struct tcf_proto *tp) 543 { 544 struct flow_head *head = tp->root; 545 struct flow_filter *f, *next; 546 547 list_for_each_entry_safe(f, next, &head->filters, list) { 548 list_del(&f->list); 549 flow_destroy_filter(tp, f); 550 } 551 kfree(head); 552 } 553 554 static unsigned long flow_get(struct tcf_proto *tp, u32 handle) 555 { 556 struct flow_head *head = tp->root; 557 struct flow_filter *f; 558 559 list_for_each_entry(f, &head->filters, list) 560 if (f->handle == handle) 561 return (unsigned long)f; 562 return 0; 563 } 564 565 static void flow_put(struct tcf_proto *tp, unsigned long f) 566 { 567 } 568 569 static int flow_dump(struct tcf_proto *tp, unsigned long fh, 570 struct sk_buff *skb, struct tcmsg *t) 571 { 572 struct flow_filter *f = (struct flow_filter *)fh; 573 struct nlattr *nest; 574 575 if (f == NULL) 576 return skb->len; 577 578 t->tcm_handle = f->handle; 579 580 nest = nla_nest_start(skb, TCA_OPTIONS); 581 if (nest == NULL) 582 goto nla_put_failure; 583 584 if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) || 585 nla_put_u32(skb, TCA_FLOW_MODE, f->mode)) 586 goto nla_put_failure; 587 588 if (f->mask != ~0 || f->xor != 0) { 589 if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) || 590 nla_put_u32(skb, TCA_FLOW_XOR, f->xor)) 591 goto nla_put_failure; 592 } 593 if (f->rshift && 594 nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift)) 595 goto nla_put_failure; 596 if (f->addend && 597 nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend)) 598 goto nla_put_failure; 599 600 if (f->divisor && 601 nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor)) 602 goto nla_put_failure; 603 if (f->baseclass && 604 nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass)) 605 goto nla_put_failure; 606 607 if (f->perturb_period && 608 nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ)) 609 goto nla_put_failure; 610 611 if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) 612 goto nla_put_failure; 613 #ifdef CONFIG_NET_EMATCH 614 if (f->ematches.hdr.nmatches && 615 tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0) 616 goto nla_put_failure; 617 #endif 618 nla_nest_end(skb, nest); 619 620 if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0) 621 goto nla_put_failure; 622 623 return skb->len; 624 625 nla_put_failure: 626 nlmsg_trim(skb, nest); 627 return -1; 628 } 629 630 static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg) 631 { 632 struct flow_head *head = tp->root; 633 struct flow_filter *f; 634 635 list_for_each_entry(f, &head->filters, list) { 636 if (arg->count < arg->skip) 637 goto skip; 638 if (arg->fn(tp, (unsigned long)f, arg) < 0) { 639 arg->stop = 1; 640 break; 641 } 642 skip: 643 arg->count++; 644 } 645 } 646 647 static struct tcf_proto_ops cls_flow_ops __read_mostly = { 648 .kind = "flow", 649 .classify = flow_classify, 650 .init = flow_init, 651 .destroy = flow_destroy, 652 .change = flow_change, 653 .delete = flow_delete, 654 .get = flow_get, 655 .put = flow_put, 656 .dump = flow_dump, 657 .walk = flow_walk, 658 .owner = THIS_MODULE, 659 }; 660 661 static int __init cls_flow_init(void) 662 { 663 return register_tcf_proto_ops(&cls_flow_ops); 664 } 665 666 static void __exit cls_flow_exit(void) 667 { 668 unregister_tcf_proto_ops(&cls_flow_ops); 669 } 670 671 module_init(cls_flow_init); 672 module_exit(cls_flow_exit); 673 674 MODULE_LICENSE("GPL"); 675 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); 676 MODULE_DESCRIPTION("TC flow classifier"); 677