// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2015 Nicira, Inc.
 */

#include <linux/module.h>
#include <linux/openvswitch.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/static_key.h>
#include <net/ip.h>
#include <net/genetlink.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_count.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/ipv6_frag.h>

#if IS_ENABLED(CONFIG_NF_NAT)
#include <net/netfilter/nf_nat.h>
#endif

#include "datapath.h"
#include "conntrack.h"
#include "flow.h"
#include "flow_netlink.h"

struct ovs_ct_len_tbl {
	int maxlen;
	int minlen;
};

/* Metadata mark for masked write to conntrack mark */
struct md_mark {
	u32 value;
	u32 mask;
};

/* Metadata label for masked write to conntrack label. */
struct md_labels {
	struct ovs_key_ct_labels value;
	struct ovs_key_ct_labels mask;
};

enum ovs_ct_nat {
	OVS_CT_NAT = 1 << 0,     /* NAT for committed connections only. */
	OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
	OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
};

/* Conntrack action context for execution. */
struct ovs_conntrack_info {
	struct nf_conntrack_helper *helper;
	struct nf_conntrack_zone zone;
	struct nf_conn *ct;
	u8 commit : 1;
	u8 nat : 3;                 /* enum ovs_ct_nat */
	u8 force : 1;
	u8 have_eventmask : 1;
	u16 family;
	u32 eventmask;              /* Mask of 1 << IPCT_*. */
	struct md_mark mark;
	struct md_labels labels;
	char timeout[CTNL_TIMEOUT_NAME_MAX];
	struct nf_ct_timeout *nf_ct_timeout;
#if IS_ENABLED(CONFIG_NF_NAT)
	struct nf_nat_range2 range;  /* Only present for SRC NAT and DST NAT. */
#endif
};

#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
#define OVS_CT_LIMIT_UNLIMITED	0
#define OVS_CT_LIMIT_DEFAULT	OVS_CT_LIMIT_UNLIMITED
#define CT_LIMIT_HASH_BUCKETS	512
static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled);

struct ovs_ct_limit {
	/* Elements in ovs_ct_limit_info->limits hash table */
	struct hlist_node hlist_node;
	struct rcu_head rcu;
	u16 zone;
	u32 limit;
};

struct ovs_ct_limit_info {
	u32 default_limit;
	struct hlist_head *limits;
	struct nf_conncount_data *data;
};

static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = {
	[OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, },
};
#endif

static bool labels_nonzero(const struct ovs_key_ct_labels *labels);

static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);

static u16 key_to_nfproto(const struct sw_flow_key *key)
{
	switch (ntohs(key->eth.type)) {
	case ETH_P_IP:
		return NFPROTO_IPV4;
	case ETH_P_IPV6:
		return NFPROTO_IPV6;
	default:
		return NFPROTO_UNSPEC;
	}
}
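/* Illustrative examples (added commentary, not from the original source):
 * ovs_ct_get_state() below combines OVS_CS_F_* flags from the ctinfo, e.g.
 *   IP_CT_NEW               -> OVS_CS_F_TRACKED | OVS_CS_F_NEW
 *   IP_CT_ESTABLISHED       -> OVS_CS_F_TRACKED | OVS_CS_F_ESTABLISHED
 *   IP_CT_ESTABLISHED_REPLY -> OVS_CS_F_TRACKED | OVS_CS_F_ESTABLISHED |
 *                              OVS_CS_F_REPLY_DIR
 */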
/* Map SKB connection state into the values used by flow definition. */
static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
{
	u8 ct_state = OVS_CS_F_TRACKED;

	switch (ctinfo) {
	case IP_CT_ESTABLISHED_REPLY:
	case IP_CT_RELATED_REPLY:
		ct_state |= OVS_CS_F_REPLY_DIR;
		break;
	default:
		break;
	}

	switch (ctinfo) {
	case IP_CT_ESTABLISHED:
	case IP_CT_ESTABLISHED_REPLY:
		ct_state |= OVS_CS_F_ESTABLISHED;
		break;
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		ct_state |= OVS_CS_F_RELATED;
		break;
	case IP_CT_NEW:
		ct_state |= OVS_CS_F_NEW;
		break;
	default:
		break;
	}

	return ct_state;
}

static u32 ovs_ct_get_mark(const struct nf_conn *ct)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
	return ct ? ct->mark : 0;
#else
	return 0;
#endif
}

/* Guard against conntrack labels max size shrinking below 128 bits. */
#if NF_CT_LABELS_MAX_SIZE < 16
#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes
#endif

static void ovs_ct_get_labels(const struct nf_conn *ct,
			      struct ovs_key_ct_labels *labels)
{
	struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;

	if (cl)
		memcpy(labels, cl->bits, OVS_CT_LABELS_LEN);
	else
		memset(labels, 0, OVS_CT_LABELS_LEN);
}

static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key,
					const struct nf_conntrack_tuple *orig,
					u8 icmp_proto)
{
	key->ct_orig_proto = orig->dst.protonum;
	if (orig->dst.protonum == icmp_proto) {
		key->ct.orig_tp.src = htons(orig->dst.u.icmp.type);
		key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code);
	} else {
		key->ct.orig_tp.src = orig->src.u.all;
		key->ct.orig_tp.dst = orig->dst.u.all;
	}
}

static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
				const struct nf_conntrack_zone *zone,
				const struct nf_conn *ct)
{
	key->ct_state = state;
	key->ct_zone = zone->id;
	key->ct.mark = ovs_ct_get_mark(ct);
	ovs_ct_get_labels(ct, &key->ct.labels);

	if (ct) {
		const struct nf_conntrack_tuple *orig;

		/* Use the master if we have one. */
		if (ct->master)
			ct = ct->master;
		orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;

		/* IP version must match with the master connection. */
		if (key->eth.type == htons(ETH_P_IP) &&
		    nf_ct_l3num(ct) == NFPROTO_IPV4) {
			key->ipv4.ct_orig.src = orig->src.u3.ip;
			key->ipv4.ct_orig.dst = orig->dst.u3.ip;
			__ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP);
			return;
		} else if (key->eth.type == htons(ETH_P_IPV6) &&
			   !sw_flow_key_is_nd(key) &&
			   nf_ct_l3num(ct) == NFPROTO_IPV6) {
			key->ipv6.ct_orig.src = orig->src.u3.in6;
			key->ipv6.ct_orig.dst = orig->dst.u3.in6;
			__ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP);
			return;
		}
	}
	/* Clear 'ct_orig_proto' to mark the non-existence of conntrack
	 * original direction key fields.
	 */
	key->ct_orig_proto = 0;
}

/* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has
 * previously sent the packet to conntrack via the ct action. If
 * 'keep_nat_flags' is true, the existing NAT flags are retained, else they
 * are initialized from the connection status.
 */
static void ovs_ct_update_key(const struct sk_buff *skb,
			      const struct ovs_conntrack_info *info,
			      struct sw_flow_key *key, bool post_ct,
			      bool keep_nat_flags)
{
	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	u8 state = 0;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct) {
		state = ovs_ct_get_state(ctinfo);
		/* All unconfirmed entries are NEW connections. */
		if (!nf_ct_is_confirmed(ct))
			state |= OVS_CS_F_NEW;
		/* OVS persists the related flag for the duration of the
		 * connection.
		 */
		if (ct->master)
			state |= OVS_CS_F_RELATED;
		if (keep_nat_flags) {
			state |= key->ct_state & OVS_CS_F_NAT_MASK;
		} else {
			if (ct->status & IPS_SRC_NAT)
				state |= OVS_CS_F_SRC_NAT;
			if (ct->status & IPS_DST_NAT)
				state |= OVS_CS_F_DST_NAT;
		}
		zone = nf_ct_zone(ct);
	} else if (post_ct) {
		state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
		if (info)
			zone = &info->zone;
	}
	__ovs_ct_update_key(key, state, zone, ct);
}

/* This is called to initialize CT key fields possibly coming in from the
 * local stack.
 */
void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
{
	ovs_ct_update_key(skb, NULL, key, false, false);
}

#define IN6_ADDR_INITIALIZER(ADDR) \
	{ (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
	  (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }

int ovs_ct_put_key(const struct sw_flow_key *swkey,
		   const struct sw_flow_key *output, struct sk_buff *skb)
{
	if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state))
		return -EMSGSIZE;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
	    nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone))
		return -EMSGSIZE;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
	    nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark))
		return -EMSGSIZE;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
	    nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels),
		    &output->ct.labels))
		return -EMSGSIZE;

	if (swkey->ct_orig_proto) {
		if (swkey->eth.type == htons(ETH_P_IP)) {
			struct ovs_key_ct_tuple_ipv4 orig = {
				output->ipv4.ct_orig.src,
				output->ipv4.ct_orig.dst,
				output->ct.orig_tp.src,
				output->ct.orig_tp.dst,
				output->ct_orig_proto,
			};
			if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
				    sizeof(orig), &orig))
				return -EMSGSIZE;
		} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
			struct ovs_key_ct_tuple_ipv6 orig = {
				IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
				IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
				output->ct.orig_tp.src,
				output->ct.orig_tp.dst,
				output->ct_orig_proto,
			};
			if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
				    sizeof(orig), &orig))
				return -EMSGSIZE;
		}
	}

	return 0;
}

static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
			   u32 ct_mark, u32 mask)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
	u32 new_mark;

	new_mark = ct_mark | (ct->mark & ~(mask));
	if (ct->mark != new_mark) {
		ct->mark = new_mark;
		if (nf_ct_is_confirmed(ct))
			nf_conntrack_event_cache(IPCT_MARK, ct);
		key->ct.mark = new_mark;
	}

	return 0;
#else
	return -ENOTSUPP;
#endif
}
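/* A worked example (added commentary, not from the original source) of the
 * masked write used by ovs_ct_set_mark() above and the label update loop
 * below: with old mark 0x1234, value 0x0005 and mask 0x00ff,
 *   new_mark = 0x0005 | (0x1234 & ~0x00ff) = 0x1205,
 * i.e. only the bits selected by the mask are overwritten.
 */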
static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
{
	struct nf_conn_labels *cl;

	cl = nf_ct_labels_find(ct);
	if (!cl) {
		nf_ct_labels_ext_add(ct);
		cl = nf_ct_labels_find(ct);
	}

	return cl;
}

/* Initialize labels for a new, yet to be committed conntrack entry. Note
 * that since the new connection is not yet confirmed, and thus no-one else
 * has access to its labels, we simply write them over.
 */
static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
			      const struct ovs_key_ct_labels *labels,
			      const struct ovs_key_ct_labels *mask)
{
	struct nf_conn_labels *cl, *master_cl;
	bool have_mask = labels_nonzero(mask);

	/* Inherit master's labels to the related connection? */
	master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL;

	if (!master_cl && !have_mask)
		return 0;   /* Nothing to do. */

	cl = ovs_ct_get_conn_labels(ct);
	if (!cl)
		return -ENOSPC;

	/* Inherit the master's labels, if any. */
	if (master_cl)
		*cl = *master_cl;

	if (have_mask) {
		u32 *dst = (u32 *)cl->bits;
		int i;

		for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
			dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
				 (labels->ct_labels_32[i]
				  & mask->ct_labels_32[i]);
	}

	/* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
	 * IPCT_LABEL bit is set in the event cache.
	 */
	nf_conntrack_event_cache(IPCT_LABEL, ct);

	memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);

	return 0;
}

static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
			     const struct ovs_key_ct_labels *labels,
			     const struct ovs_key_ct_labels *mask)
{
	struct nf_conn_labels *cl;
	int err;

	cl = ovs_ct_get_conn_labels(ct);
	if (!cl)
		return -ENOSPC;

	err = nf_connlabels_replace(ct, labels->ct_labels_32,
				    mask->ct_labels_32,
				    OVS_CT_LABELS_LEN_32);
	if (err)
		return err;

	memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);

	return 0;
}

/* 'skb' should already be pulled to nh_ofs. */
static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
{
	const struct nf_conntrack_helper *helper;
	const struct nf_conn_help *help;
	enum ip_conntrack_info ctinfo;
	unsigned int protoff;
	struct nf_conn *ct;
	int err;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
		return NF_ACCEPT;

	help = nfct_help(ct);
	if (!help)
		return NF_ACCEPT;

	helper = rcu_dereference(help->helper);
	if (!helper)
		return NF_ACCEPT;

	switch (proto) {
	case NFPROTO_IPV4:
		protoff = ip_hdrlen(skb);
		break;
	case NFPROTO_IPV6: {
		u8 nexthdr = ipv6_hdr(skb)->nexthdr;
		__be16 frag_off;
		int ofs;

		ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
				       &frag_off);
		if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
			pr_debug("proto header not found\n");
			return NF_ACCEPT;
		}
		protoff = ofs;
		break;
	}
	default:
		WARN_ONCE(1, "helper invoked on non-IP family!");
		return NF_DROP;
	}

	err = helper->help(skb, protoff, ct, ctinfo);
	if (err != NF_ACCEPT)
		return err;

	/* Adjust seqs after helper. This is needed due to some helpers (e.g.,
	 * FTP with NAT) adjusting the TCP payload size when mangling IP
	 * addresses and/or port numbers in the text-based control connection.
	 */
	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
	    !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
		return NF_DROP;
	return NF_ACCEPT;
}

/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
 * value if 'skb' is freed.
 */
static int handle_fragments(struct net *net, struct sw_flow_key *key,
			    u16 zone, struct sk_buff *skb)
{
	struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
	int err;

	if (key->eth.type == htons(ETH_P_IP)) {
		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;

		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
		err = ip_defrag(net, skb, user);
		if (err)
			return err;

		ovs_cb.mru = IPCB(skb)->frag_max_size;
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
	} else if (key->eth.type == htons(ETH_P_IPV6)) {
		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;

		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
		err = nf_ct_frag6_gather(net, skb, user);
		if (err) {
			if (err != -EINPROGRESS)
				kfree_skb(skb);
			return err;
		}

		key->ip.proto = ipv6_hdr(skb)->nexthdr;
		ovs_cb.mru = IP6CB(skb)->frag_max_size;
#endif
	} else {
		kfree_skb(skb);
		return -EPFNOSUPPORT;
	}

	key->ip.frag = OVS_FRAG_TYPE_NONE;
	skb_clear_hash(skb);
	skb->ignore_df = 1;
	*OVS_CB(skb) = ovs_cb;

	return 0;
}

static struct nf_conntrack_expect *
ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
		   u16 proto, const struct sk_buff *skb)
{
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_expect *exp;

	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net,
			       &tuple))
		return NULL;

	exp = __nf_ct_expect_find(net, zone, &tuple);
	if (exp) {
		struct nf_conntrack_tuple_hash *h;

		/* Delete existing conntrack entry, if it clashes with the
		 * expectation. This can happen since conntrack ALGs do not
		 * check for clashes between (new) expectations and existing
		 * conntrack entries. nf_conntrack_in() will check the
		 * expectations only if a conntrack entry can not be found,
		 * which can lead to OVS finding the expectation (here) in the
		 * init direction, but which will not be removed by the
		 * nf_conntrack_in() call, if a matching conntrack entry is
		 * found instead. In this case all init direction packets
		 * would be reported as new related packets, while reply
		 * direction packets would be reported as un-related
		 * established packets.
		 */
		h = nf_conntrack_find_get(net, zone, &tuple);
		if (h) {
			struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

			nf_ct_delete(ct, 0, 0);
			nf_conntrack_put(&ct->ct_general);
		}
	}

	return exp;
}

/* This replicates logic from nf_conntrack_core.c that is not exported. */
static enum ip_conntrack_info
ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
{
	const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
		return IP_CT_ESTABLISHED_REPLY;
	/* Once we've had two way comms, always ESTABLISHED. */
	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		return IP_CT_ESTABLISHED;
	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
		return IP_CT_RELATED;
	return IP_CT_NEW;
}
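/* A hedged illustration (addresses invented for this comment) of the tuple
 * inversion done by ovs_ct_find_existing() below: with SNAT mapping
 * 10.0.0.1 -> 1.1.1.1, a post-NAT original-direction packet carries the
 * tuple 1.1.1.1:1234 -> 2.2.2.2:80. That no longer matches the conntrack
 * ORIGINAL tuple, but its inverse, 2.2.2.2:80 -> 1.1.1.1:1234, matches the
 * REPLY tuple, so the lookup still finds the connection.
 */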
/* Find an existing connection which this packet belongs to without
 * re-attributing statistics or modifying the connection state. This allows
 * an skb->_nfct lost due to an upcall to be recovered during actions
 * execution.
 *
 * Must be called with rcu_read_lock.
 *
 * On success, populates skb->_nfct and returns the connection. Returns NULL
 * if there is no existing entry.
 */
static struct nf_conn *
ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
		     u8 l3num, struct sk_buff *skb, bool natted)
{
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num,
			       net, &tuple)) {
		pr_debug("ovs_ct_find_existing: Can't get tuple\n");
		return NULL;
	}

	/* Must invert the tuple if skb has been transformed by NAT. */
	if (natted) {
		struct nf_conntrack_tuple inverse;

		if (!nf_ct_invert_tuple(&inverse, &tuple)) {
			pr_debug("ovs_ct_find_existing: Inversion failed!\n");
			return NULL;
		}
		tuple = inverse;
	}

	/* look for tuple match */
	h = nf_conntrack_find_get(net, zone, &tuple);
	if (!h)
		return NULL;   /* Not found. */

	ct = nf_ct_tuplehash_to_ctrack(h);

	/* Inverted packet tuple matches the reverse direction conntrack tuple,
	 * select the other tuplehash to get the right 'ctinfo' bits for this
	 * packet.
	 */
	if (natted)
		h = &ct->tuplehash[!h->tuple.dst.dir];

	nf_ct_set(skb, ct, ovs_ct_get_info(h));
	return ct;
}

static
struct nf_conn *ovs_ct_executed(struct net *net,
				const struct sw_flow_key *key,
				const struct ovs_conntrack_info *info,
				struct sk_buff *skb,
				bool *ct_executed)
{
	struct nf_conn *ct = NULL;

	/* If no ct, check if we have evidence that an existing conntrack entry
	 * might be found for this skb. This happens when we lose a skb->_nfct
	 * due to an upcall, or if the direction is being forced. If the
	 * connection was not confirmed, it is not cached and needs to be run
	 * through conntrack again.
	 */
	*ct_executed = (key->ct_state & OVS_CS_F_TRACKED) &&
		       !(key->ct_state & OVS_CS_F_INVALID) &&
		       (key->ct_zone == info->zone.id);

	if (*ct_executed || (!key->ct_state && info->force)) {
		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
					  !!(key->ct_state &
					     OVS_CS_F_NAT_MASK));
	}

	return ct;
}

/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
static bool skb_nfct_cached(struct net *net,
			    const struct sw_flow_key *key,
			    const struct ovs_conntrack_info *info,
			    struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	bool ct_executed = true;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		ct = ovs_ct_executed(net, key, info, skb, &ct_executed);

	if (ct)
		nf_ct_get(skb, &ctinfo);
	else
		return false;

	if (!net_eq(net, read_pnet(&ct->ct_net)))
		return false;
	if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct)))
		return false;
	if (info->helper) {
		struct nf_conn_help *help;

		help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
		if (help && rcu_access_pointer(help->helper) != info->helper)
			return false;
	}
	if (info->nf_ct_timeout) {
		struct nf_conn_timeout *timeout_ext;

		timeout_ext = nf_ct_timeout_find(ct);
		if (!timeout_ext || info->nf_ct_timeout !=
		    rcu_dereference(timeout_ext->timeout))
			return false;
	}
	/* Force conntrack entry direction to the current packet? */
	if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
		/* Delete the conntrack entry if confirmed, else just release
		 * the reference.
		 */
		if (nf_ct_is_confirmed(ct))
			nf_ct_delete(ct, 0, 0);

		nf_conntrack_put(&ct->ct_general);
		nf_ct_set(skb, NULL, 0);
		return false;
	}

	return ct_executed;
}

#if IS_ENABLED(CONFIG_NF_NAT)
/* Modelled after nf_nat_ipv[46]_fn().
 * range is only used for new, uninitialized NAT state.
 * Returns either NF_ACCEPT or NF_DROP.
 */
static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
			      enum ip_conntrack_info ctinfo,
			      const struct nf_nat_range2 *range,
			      enum nf_nat_manip_type maniptype)
{
	int hooknum, nh_off, err = NF_ACCEPT;

	nh_off = skb_network_offset(skb);
	skb_pull_rcsum(skb, nh_off);

	/* See HOOK2MANIP(). */
	if (maniptype == NF_NAT_MANIP_SRC)
		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
	else
		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		if (IS_ENABLED(CONFIG_NF_NAT) &&
		    skb->protocol == htons(ETH_P_IP) &&
		    ip_hdr(skb)->protocol == IPPROTO_ICMP) {
			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
							   hooknum))
				err = NF_DROP;
			goto push;
		} else if (IS_ENABLED(CONFIG_IPV6) &&
			   skb->protocol == htons(ETH_P_IPV6)) {
			__be16 frag_off;
			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
			int hdrlen = ipv6_skip_exthdr(skb,
						      sizeof(struct ipv6hdr),
						      &nexthdr, &frag_off);

			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
				if (!nf_nat_icmpv6_reply_translation(skb, ct,
								     ctinfo,
								     hooknum,
								     hdrlen))
					err = NF_DROP;
				goto push;
			}
		}
		/* Non-ICMP, fall thru to initialize if needed. */
		/* fall through */
	case IP_CT_NEW:
		/* Seen it before? This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, maniptype)) {
			/* Initialize according to the NAT action. */
			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
				/* Action is set up to establish a new
				 * mapping.
				 */
				? nf_nat_setup_info(ct, range, maniptype)
				: nf_nat_alloc_null_binding(ct, hooknum);
			if (err != NF_ACCEPT)
				goto push;
		}
		break;
	case IP_CT_ESTABLISHED:
	case IP_CT_ESTABLISHED_REPLY:
		break;

	default:
		err = NF_DROP;
		goto push;
	}

	err = nf_nat_packet(ct, ctinfo, hooknum, skb);
push:
	skb_push(skb, nh_off);
	skb_postpush_rcsum(skb, skb->data, nh_off);

	return err;
}

static void ovs_nat_update_key(struct sw_flow_key *key,
			       const struct sk_buff *skb,
			       enum nf_nat_manip_type maniptype)
{
	if (maniptype == NF_NAT_MANIP_SRC) {
		__be16 src;

		key->ct_state |= OVS_CS_F_SRC_NAT;
		if (key->eth.type == htons(ETH_P_IP))
			key->ipv4.addr.src = ip_hdr(skb)->saddr;
		else if (key->eth.type == htons(ETH_P_IPV6))
			memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
			       sizeof(key->ipv6.addr.src));
		else
			return;

		if (key->ip.proto == IPPROTO_UDP)
			src = udp_hdr(skb)->source;
		else if (key->ip.proto == IPPROTO_TCP)
			src = tcp_hdr(skb)->source;
		else if (key->ip.proto == IPPROTO_SCTP)
			src = sctp_hdr(skb)->source;
		else
			return;

		key->tp.src = src;
	} else {
		__be16 dst;

		key->ct_state |= OVS_CS_F_DST_NAT;
		if (key->eth.type == htons(ETH_P_IP))
			key->ipv4.addr.dst = ip_hdr(skb)->daddr;
		else if (key->eth.type == htons(ETH_P_IPV6))
			memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
			       sizeof(key->ipv6.addr.dst));
		else
			return;

		if (key->ip.proto == IPPROTO_UDP)
			dst = udp_hdr(skb)->dest;
		else if (key->ip.proto == IPPROTO_TCP)
			dst = tcp_hdr(skb)->dest;
		else if (key->ip.proto == IPPROTO_SCTP)
			dst = sctp_hdr(skb)->dest;
		else
			return;

		key->tp.dst = dst;
	}
}
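/* A hedged example (addresses invented for this comment) of the maniptype
 * selection in ovs_ct_nat() below: if a connection was committed with
 * source NAT 10.0.0.1 -> 1.1.1.1, original-direction packets keep getting
 * NF_NAT_MANIP_SRC, while reply-direction packets (addressed to 1.1.1.1)
 * get NF_NAT_MANIP_DST so the mapping is reversed on the way back.
 */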
/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
		      const struct ovs_conntrack_info *info,
		      struct sk_buff *skb, struct nf_conn *ct,
		      enum ip_conntrack_info ctinfo)
{
	enum nf_nat_manip_type maniptype;
	int err;

	/* Add NAT extension if not confirmed yet. */
	if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
		return NF_ACCEPT;   /* Can't NAT. */

	/* Determine NAT type.
	 * Check if the NAT type can be deduced from the tracked connection.
	 * Make sure new expected connections (IP_CT_RELATED) are NATted only
	 * when committing.
	 */
	if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
	    ct->status & IPS_NAT_MASK &&
	    (ctinfo != IP_CT_RELATED || info->commit)) {
		/* NAT an established or related connection like before. */
		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
			/* This is the REPLY direction for a connection
			 * for which NAT was applied in the forward
			 * direction. Do the reverse NAT.
			 */
			maniptype = ct->status & IPS_SRC_NAT
				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
		else
			maniptype = ct->status & IPS_SRC_NAT
				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
	} else if (info->nat & OVS_CT_SRC_NAT) {
		maniptype = NF_NAT_MANIP_SRC;
	} else if (info->nat & OVS_CT_DST_NAT) {
		maniptype = NF_NAT_MANIP_DST;
	} else {
		return NF_ACCEPT; /* Connection is not NATed. */
	}
	err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);

	/* Mark NAT done if successful and update the flow key. */
	if (err == NF_ACCEPT)
		ovs_nat_update_key(key, skb, maniptype);

	return err;
}
#else /* !CONFIG_NF_NAT */
static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
		      const struct ovs_conntrack_info *info,
		      struct sk_buff *skb, struct nf_conn *ct,
		      enum ip_conntrack_info ctinfo)
{
	return NF_ACCEPT;
}
#endif

/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
 * not done already. Update key with new CT state after passing the packet
 * through conntrack.
 * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be
 * set to NULL and 0 will be returned.
 */
static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
			   const struct ovs_conntrack_info *info,
			   struct sk_buff *skb)
{
	/* If we are recirculating packets to match on conntrack fields and
	 * committing with a separate conntrack action, then we don't need to
	 * actually run the packet through conntrack twice unless it's for a
	 * different zone.
	 */
	bool cached = skb_nfct_cached(net, key, info, skb);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	if (!cached) {
		struct nf_hook_state state = {
			.hook = NF_INET_PRE_ROUTING,
			.pf = info->family,
			.net = net,
		};
		struct nf_conn *tmpl = info->ct;
		int err;

		/* Associate skb with specified zone. */
		if (tmpl) {
			if (skb_nfct(skb))
				nf_conntrack_put(skb_nfct(skb));
			nf_conntrack_get(&tmpl->ct_general);
			nf_ct_set(skb, tmpl, IP_CT_NEW);
		}

		err = nf_conntrack_in(skb, &state);
		if (err != NF_ACCEPT)
			return -ENOENT;

		/* Clear CT state NAT flags to mark that we have not yet done
		 * NAT after the nf_conntrack_in() call. We can actually clear
		 * the whole state, as it will be re-initialized below.
		 */
		key->ct_state = 0;

		/* Update the key, but keep the NAT flags. */
		ovs_ct_update_key(skb, info, key, true, true);
	}

	ct = nf_ct_get(skb, &ctinfo);
	if (ct) {
		/* Packets starting a new connection must be NATted before the
		 * helper, so that the helper knows about the NAT. We enforce
		 * this by delaying both NAT and helper calls for unconfirmed
		 * connections until the committing CT action. For later
		 * packets NAT and Helper may be called in either order.
		 *
		 * NAT will be done only if the CT action has NAT, and only
		 * once per packet (per zone), as guarded by the NAT bits in
		 * the key->ct_state.
		 */
		if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) &&
		    (nf_ct_is_confirmed(ct) || info->commit) &&
		    ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
			return -EINVAL;
		}

		/* Userspace may decide to perform a ct lookup without a helper
		 * specified followed by a (recirculate and) commit with one.
		 * Therefore, for unconfirmed connections which we will commit,
		 * we need to attach the helper here.
		 */
		if (!nf_ct_is_confirmed(ct) && info->commit &&
		    info->helper && !nfct_help(ct)) {
			int err = __nf_ct_try_assign_helper(ct, info->ct,
							    GFP_ATOMIC);
			if (err)
				return err;

			/* helper installed, add seqadj if NAT is required */
			if (info->nat && !nfct_seqadj(ct)) {
				if (!nfct_seqadj_ext_add(ct))
					return -EINVAL;
			}
		}

		/* Call the helper only if:
		 * - nf_conntrack_in() was executed above ("!cached") for a
		 *   confirmed connection, or
		 * - When committing an unconfirmed connection.
		 */
		if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
		    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
			return -EINVAL;
		}
	}

	return 0;
}

/* Lookup connection and read fields into key. */
static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
			 const struct ovs_conntrack_info *info,
			 struct sk_buff *skb)
{
	struct nf_conntrack_expect *exp;

	/* If we pass an expected packet through nf_conntrack_in() the
	 * expectation is typically removed, but the packet could still be
	 * lost in upcall processing. To prevent this from happening we
	 * perform an explicit expectation lookup. Expected connections are
	 * always new, and will be passed through conntrack only when they are
	 * committed, as it is OK to remove the expectation at that time.
	 */
	exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
	if (exp) {
		u8 state;

		/* NOTE: New connections are NATted and Helped only when
		 * committed, so we are not calling into NAT here.
		 */
		state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
		__ovs_ct_update_key(key, state, &info->zone, exp->master);
	} else {
		struct nf_conn *ct;
		int err;

		err = __ovs_ct_lookup(net, key, info, skb);
		if (err)
			return err;

		ct = (struct nf_conn *)skb_nfct(skb);
		if (ct)
			nf_ct_deliver_cached_events(ct);
	}

	return 0;
}

static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
{
	size_t i;

	for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
		if (labels->ct_labels_32[i])
			return true;

	return false;
}

#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
static struct hlist_head *ct_limit_hash_bucket(
	const struct ovs_ct_limit_info *info, u16 zone)
{
	return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)];
}

/* Call with ovs_mutex */
static void ct_limit_set(const struct ovs_ct_limit_info *info,
			 struct ovs_ct_limit *new_ct_limit)
{
	struct ovs_ct_limit *ct_limit;
	struct hlist_head *head;

	head = ct_limit_hash_bucket(info, new_ct_limit->zone);
	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
		if (ct_limit->zone == new_ct_limit->zone) {
			hlist_replace_rcu(&ct_limit->hlist_node,
					  &new_ct_limit->hlist_node);
			kfree_rcu(ct_limit, rcu);
			return;
		}
	}

	hlist_add_head_rcu(&new_ct_limit->hlist_node, head);
}

/* Call with ovs_mutex */
static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone)
{
	struct ovs_ct_limit *ct_limit;
	struct hlist_head *head;
	struct hlist_node *n;

	head = ct_limit_hash_bucket(info, zone);
	hlist_for_each_entry_safe(ct_limit, n, head, hlist_node) {
		if (ct_limit->zone == zone) {
			hlist_del_rcu(&ct_limit->hlist_node);
			kfree_rcu(ct_limit, rcu);
			return;
		}
	}
}

/* Call with RCU read lock */
static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone)
{
	struct ovs_ct_limit *ct_limit;
	struct hlist_head *head;

	head = ct_limit_hash_bucket(info, zone);
	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
		if (ct_limit->zone == zone)
			return ct_limit->limit;
	}

	return info->default_limit;
}
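/* A worked example (added commentary, not from the original source) for
 * ct_limit_hash_bucket() above: with CT_LIMIT_HASH_BUCKETS = 512 the bucket
 * index is zone & 511, so zone 1000 hashes to bucket 488, and zones 1000
 * and 1512 share a bucket; the per-bucket list walk disambiguates them.
 */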
static int ovs_ct_check_limit(struct net *net,
			      const struct ovs_conntrack_info *info,
			      const struct nf_conntrack_tuple *tuple)
{
	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
	const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
	u32 per_zone_limit, connections;
	u32 conncount_key;

	conncount_key = info->zone.id;

	per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id);
	if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED)
		return 0;

	connections = nf_conncount_count(net, ct_limit_info->data,
					 &conncount_key, tuple, &info->zone);
	if (connections > per_zone_limit)
		return -ENOMEM;

	return 0;
}
#endif

/* Lookup connection and confirm if unconfirmed. */
static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
			 const struct ovs_conntrack_info *info,
			 struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	int err;

	err = __ovs_ct_lookup(net, key, info, skb);
	if (err)
		return err;

	/* The connection could be invalid, in which case this is a no-op. */
	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return 0;

#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
	if (static_branch_unlikely(&ovs_ct_limit_enabled)) {
		if (!nf_ct_is_confirmed(ct)) {
			err = ovs_ct_check_limit(net, info,
				&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
			if (err) {
				net_warn_ratelimited("openvswitch: zone: %u "
					"exceeds conntrack limit\n",
					info->zone.id);
				return err;
			}
		}
	}
#endif

	/* Set the conntrack event mask if given. NEW and DELETE events have
	 * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener
	 * typically would receive many kinds of updates. Setting the event
	 * mask allows those events to be filtered. The set event mask will
	 * remain in effect for the lifetime of the connection unless changed
	 * by a further CT action with both the commit flag and the eventmask
	 * option.
	 */
	if (info->have_eventmask) {
		struct nf_conntrack_ecache *cache = nf_ct_ecache_find(ct);

		if (cache)
			cache->ctmask = info->eventmask;
	}

	/* Apply changes before confirming the connection so that the initial
	 * conntrack NEW netlink event carries the values given in the CT
	 * action.
	 */
	if (info->mark.mask) {
		err = ovs_ct_set_mark(ct, key, info->mark.value,
				      info->mark.mask);
		if (err)
			return err;
	}
	if (!nf_ct_is_confirmed(ct)) {
		err = ovs_ct_init_labels(ct, key, &info->labels.value,
					 &info->labels.mask);
		if (err)
			return err;
	} else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
		   labels_nonzero(&info->labels.mask)) {
		err = ovs_ct_set_labels(ct, key, &info->labels.value,
					&info->labels.mask);
		if (err)
			return err;
	}
	/* This will take care of sending queued events even if the connection
	 * is already confirmed.
	 */
	if (nf_conntrack_confirm(skb) != NF_ACCEPT)
		return -EINVAL;

	return 0;
}
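/* A hedged worked example (numbers invented for this comment) for
 * ovs_skb_network_trim() below: a minimal UDP/IPv4 packet has tot_len 28
 * (20-byte IP header + 8-byte UDP header), but may arrive in an skb of
 * network-layer length 46 because the Ethernet sender padded the frame to
 * the minimum payload size; trimming to 28 removes the 18 trailing padding
 * bytes so checksum helpers see only the IP datagram.
 */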
/* Trim the skb to the length specified by the IP/IPv6 header,
 * removing any trailing lower-layer padding. This prepares the skb
 * for higher-layer processing that assumes skb->len excludes padding
 * (such as nf_ip_checksum). The caller needs to pull the skb to the
 * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
 */
static int ovs_skb_network_trim(struct sk_buff *skb)
{
	unsigned int len;
	int err;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
		len = ntohs(ip_hdr(skb)->tot_len);
		break;
	case htons(ETH_P_IPV6):
		len = sizeof(struct ipv6hdr)
			+ ntohs(ipv6_hdr(skb)->payload_len);
		break;
	default:
		len = skb->len;
	}

	err = pskb_trim_rcsum(skb, len);
	if (err)
		kfree_skb(skb);

	return err;
}

/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
 * value if 'skb' is freed.
 */
int ovs_ct_execute(struct net *net, struct sk_buff *skb,
		   struct sw_flow_key *key,
		   const struct ovs_conntrack_info *info)
{
	int nh_ofs;
	int err;

	/* The conntrack module expects to be working at L3. */
	nh_ofs = skb_network_offset(skb);
	skb_pull_rcsum(skb, nh_ofs);

	err = ovs_skb_network_trim(skb);
	if (err)
		return err;

	if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
		err = handle_fragments(net, key, info->zone.id, skb);
		if (err)
			return err;
	}

	if (info->commit)
		err = ovs_ct_commit(net, key, info, skb);
	else
		err = ovs_ct_lookup(net, key, info, skb);

	skb_push(skb, nh_ofs);
	skb_postpush_rcsum(skb, skb->data, nh_ofs);
	if (err)
		kfree_skb(skb);
	return err;
}

int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
{
	if (skb_nfct(skb)) {
		nf_conntrack_put(skb_nfct(skb));
		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
		ovs_ct_fill_key(skb, key);
	}

	return 0;
}

static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
			     const struct sw_flow_key *key, bool log)
{
	struct nf_conntrack_helper *helper;
	struct nf_conn_help *help;
	int ret = 0;

	helper = nf_conntrack_helper_try_module_get(name, info->family,
						    key->ip.proto);
	if (!helper) {
		OVS_NLERR(log, "Unknown helper \"%s\"", name);
		return -EINVAL;
	}

	help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL);
	if (!help) {
		nf_conntrack_helper_put(helper);
		return -ENOMEM;
	}

#if IS_ENABLED(CONFIG_NF_NAT)
	if (info->nat) {
		ret = nf_nat_helper_try_module_get(name, info->family,
						   key->ip.proto);
		if (ret) {
			nf_conntrack_helper_put(helper);
			OVS_NLERR(log, "Failed to load \"%s\" NAT helper, error: %d",
				  name, ret);
			return ret;
		}
	}
#endif
	rcu_assign_pointer(help->helper, helper);
	info->helper = helper;
	return ret;
}
#if IS_ENABLED(CONFIG_NF_NAT)
static int parse_nat(const struct nlattr *attr,
		     struct ovs_conntrack_info *info, bool log)
{
	struct nlattr *a;
	int rem;
	bool have_ip_max = false;
	bool have_proto_max = false;
	bool ip_vers = (info->family == NFPROTO_IPV6);

	nla_for_each_nested(a, attr, rem) {
		static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
			[OVS_NAT_ATTR_SRC] = {0, 0},
			[OVS_NAT_ATTR_DST] = {0, 0},
			[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
						 sizeof(struct in6_addr)},
			[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
						 sizeof(struct in6_addr)},
			[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
			[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
			[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
			[OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
			[OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
		};
		int type = nla_type(a);

		if (type > OVS_NAT_ATTR_MAX) {
			OVS_NLERR(log, "Unknown NAT attribute (type=%d, max=%d)",
				  type, OVS_NAT_ATTR_MAX);
			return -EINVAL;
		}

		if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
			OVS_NLERR(log, "NAT attribute type %d has unexpected length (%d != %d)",
				  type, nla_len(a),
				  ovs_nat_attr_lens[type][ip_vers]);
			return -EINVAL;
		}

		switch (type) {
		case OVS_NAT_ATTR_SRC:
		case OVS_NAT_ATTR_DST:
			if (info->nat) {
				OVS_NLERR(log, "Only one type of NAT may be specified");
				return -ERANGE;
			}
			info->nat |= OVS_CT_NAT;
			info->nat |= ((type == OVS_NAT_ATTR_SRC)
					? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
			break;

		case OVS_NAT_ATTR_IP_MIN:
			nla_memcpy(&info->range.min_addr, a,
				   sizeof(info->range.min_addr));
			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
			break;

		case OVS_NAT_ATTR_IP_MAX:
			have_ip_max = true;
			nla_memcpy(&info->range.max_addr, a,
				   sizeof(info->range.max_addr));
			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
			break;

		case OVS_NAT_ATTR_PROTO_MIN:
			info->range.min_proto.all = htons(nla_get_u16(a));
			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
			break;

		case OVS_NAT_ATTR_PROTO_MAX:
			have_proto_max = true;
			info->range.max_proto.all = htons(nla_get_u16(a));
			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
			break;

		case OVS_NAT_ATTR_PERSISTENT:
			info->range.flags |= NF_NAT_RANGE_PERSISTENT;
			break;

		case OVS_NAT_ATTR_PROTO_HASH:
			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
			break;

		case OVS_NAT_ATTR_PROTO_RANDOM:
			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
			break;

		default:
			OVS_NLERR(log, "Unknown nat attribute (%d)", type);
			return -EINVAL;
		}
	}

	if (rem > 0) {
		OVS_NLERR(log, "NAT attribute has %d unknown bytes", rem);
		return -EINVAL;
	}
	if (!info->nat) {
		/* Do not allow flags if no type is given. */
		if (info->range.flags) {
			OVS_NLERR(log,
				  "NAT flags may be given only when NAT range (SRC or DST) is also specified."
				  );
			return -EINVAL;
		}
		info->nat = OVS_CT_NAT;   /* NAT existing connections. */
	} else if (!info->commit) {
		OVS_NLERR(log,
			  "NAT attributes may be specified only when CT COMMIT flag is also specified."
			  );
		return -EINVAL;
	}
	/* Allow missing IP_MAX. */
	if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
		memcpy(&info->range.max_addr, &info->range.min_addr,
		       sizeof(info->range.max_addr));
	}
	/* Allow missing PROTO_MAX. */
	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
	    !have_proto_max) {
		info->range.max_proto.all = info->range.min_proto.all;
	}
	return 0;
}
#endif
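/* A hedged illustration (values invented for this comment) of the nested
 * attribute layout consumed by parse_nat() above: a userspace action such
 * as ct(commit,nat(src=10.0.0.1-10.0.0.9:4000-5000)) would arrive roughly
 * as:
 *   OVS_CT_ATTR_COMMIT,
 *   OVS_CT_ATTR_NAT {
 *     OVS_NAT_ATTR_SRC,
 *     OVS_NAT_ATTR_IP_MIN = 10.0.0.1, OVS_NAT_ATTR_IP_MAX = 10.0.0.9,
 *     OVS_NAT_ATTR_PROTO_MIN = 4000,  OVS_NAT_ATTR_PROTO_MAX = 5000,
 *   }
 */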
static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
	[OVS_CT_ATTR_COMMIT]		= { .minlen = 0, .maxlen = 0 },
	[OVS_CT_ATTR_FORCE_COMMIT]	= { .minlen = 0, .maxlen = 0 },
	[OVS_CT_ATTR_ZONE]		= { .minlen = sizeof(u16),
					    .maxlen = sizeof(u16) },
	[OVS_CT_ATTR_MARK]		= { .minlen = sizeof(struct md_mark),
					    .maxlen = sizeof(struct md_mark) },
	[OVS_CT_ATTR_LABELS]		= { .minlen = sizeof(struct md_labels),
					    .maxlen = sizeof(struct md_labels) },
	[OVS_CT_ATTR_HELPER]		= { .minlen = 1,
					    .maxlen = NF_CT_HELPER_NAME_LEN },
#if IS_ENABLED(CONFIG_NF_NAT)
	/* NAT length is checked when parsing the nested attributes. */
	[OVS_CT_ATTR_NAT]		= { .minlen = 0, .maxlen = INT_MAX },
#endif
	[OVS_CT_ATTR_EVENTMASK]		= { .minlen = sizeof(u32),
					    .maxlen = sizeof(u32) },
	[OVS_CT_ATTR_TIMEOUT]		= { .minlen = 1,
					    .maxlen = CTNL_TIMEOUT_NAME_MAX },
};

static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
		    const char **helper, bool log)
{
	struct nlattr *a;
	int rem;

	nla_for_each_nested(a, attr, rem) {
		int type = nla_type(a);
		int maxlen;
		int minlen;

		if (type > OVS_CT_ATTR_MAX) {
			OVS_NLERR(log,
				  "Unknown conntrack attr (type=%d, max=%d)",
				  type, OVS_CT_ATTR_MAX);
			return -EINVAL;
		}

		maxlen = ovs_ct_attr_lens[type].maxlen;
		minlen = ovs_ct_attr_lens[type].minlen;
		if (nla_len(a) < minlen || nla_len(a) > maxlen) {
			OVS_NLERR(log,
				  "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)",
				  type, nla_len(a), maxlen);
			return -EINVAL;
		}

		switch (type) {
		case OVS_CT_ATTR_FORCE_COMMIT:
			info->force = true;
			/* fall through. */
		case OVS_CT_ATTR_COMMIT:
			info->commit = true;
			break;
#ifdef CONFIG_NF_CONNTRACK_ZONES
		case OVS_CT_ATTR_ZONE:
			info->zone.id = nla_get_u16(a);
			break;
#endif
#ifdef CONFIG_NF_CONNTRACK_MARK
		case OVS_CT_ATTR_MARK: {
			struct md_mark *mark = nla_data(a);

			if (!mark->mask) {
				OVS_NLERR(log, "ct_mark mask cannot be 0");
				return -EINVAL;
			}
			info->mark = *mark;
			break;
		}
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
		case OVS_CT_ATTR_LABELS: {
			struct md_labels *labels = nla_data(a);

			if (!labels_nonzero(&labels->mask)) {
				OVS_NLERR(log, "ct_labels mask cannot be 0");
				return -EINVAL;
			}
			info->labels = *labels;
			break;
		}
#endif
		case OVS_CT_ATTR_HELPER:
			*helper = nla_data(a);
			if (!memchr(*helper, '\0', nla_len(a))) {
				OVS_NLERR(log, "Invalid conntrack helper");
				return -EINVAL;
			}
			break;
#if IS_ENABLED(CONFIG_NF_NAT)
		case OVS_CT_ATTR_NAT: {
			int err = parse_nat(a, info, log);

			if (err)
				return err;
			break;
		}
#endif
		case OVS_CT_ATTR_EVENTMASK:
			info->have_eventmask = true;
			info->eventmask = nla_get_u32(a);
			break;
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
		case OVS_CT_ATTR_TIMEOUT:
			memcpy(info->timeout, nla_data(a), nla_len(a));
			if (!memchr(info->timeout, '\0', nla_len(a))) {
				OVS_NLERR(log, "Invalid conntrack timeout");
				return -EINVAL;
			}
			break;
#endif

		default:
			OVS_NLERR(log, "Unknown conntrack attr (%d)",
				  type);
			return -EINVAL;
		}
	}

#ifdef CONFIG_NF_CONNTRACK_MARK
	if (!info->commit && info->mark.mask) {
		OVS_NLERR(log,
			  "Setting conntrack mark requires 'commit' flag.");
		return -EINVAL;
	}
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
	if (!info->commit && labels_nonzero(&info->labels.mask)) {
		OVS_NLERR(log,
			  "Setting conntrack labels requires 'commit' flag.");
		return -EINVAL;
	}
#endif
	if (rem > 0) {
		OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem);
		return -EINVAL;
	}

	return 0;
}

bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr)
{
	if (attr == OVS_KEY_ATTR_CT_STATE)
		return true;
	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
	    attr == OVS_KEY_ATTR_CT_ZONE)
		return true;
	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
	    attr == OVS_KEY_ATTR_CT_MARK)
		return true;
	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
	    attr == OVS_KEY_ATTR_CT_LABELS) {
		struct ovs_net *ovs_net = net_generic(net, ovs_net_id);

		return ovs_net->xt_label;
	}

	return false;
}

int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
		       const struct sw_flow_key *key,
		       struct sw_flow_actions **sfa, bool log)
{
	struct ovs_conntrack_info ct_info;
	const char *helper = NULL;
	u16 family;
	int err;

	family = key_to_nfproto(key);
	if (family == NFPROTO_UNSPEC) {
		OVS_NLERR(log, "ct family unspecified");
		return -EINVAL;
	}

	memset(&ct_info, 0, sizeof(ct_info));
	ct_info.family = family;

	nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID,
			NF_CT_DEFAULT_ZONE_DIR, 0);

	err = parse_ct(attr, &ct_info, &helper, log);
	if (err)
		return err;
	/* Set up template for tracking connections in specific zones. */
	ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL);
	if (!ct_info.ct) {
		OVS_NLERR(log, "Failed to allocate conntrack template");
		return -ENOMEM;
	}

	if (ct_info.timeout[0]) {
		if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto,
				      ct_info.timeout))
			pr_info_ratelimited("Failed to associate timeout "
					    "policy `%s'\n", ct_info.timeout);
		else
			ct_info.nf_ct_timeout = rcu_dereference(
				nf_ct_timeout_find(ct_info.ct)->timeout);

	}

	if (helper) {
		err = ovs_ct_add_helper(&ct_info, helper, key, log);
		if (err)
			goto err_free_ct;
	}

	err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info,
				 sizeof(ct_info), log);
	if (err)
		goto err_free_ct;

	__set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
	nf_conntrack_get(&ct_info.ct->ct_general);
	return 0;
err_free_ct:
	__ovs_ct_free_action(&ct_info);
	return err;
}

#if IS_ENABLED(CONFIG_NF_NAT)
static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
			       struct sk_buff *skb)
{
	struct nlattr *start;

	start = nla_nest_start_noflag(skb, OVS_CT_ATTR_NAT);
	if (!start)
		return false;

	if (info->nat & OVS_CT_SRC_NAT) {
		if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
			return false;
	} else if (info->nat & OVS_CT_DST_NAT) {
		if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
			return false;
	} else {
		goto out;
	}

	if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
		if (IS_ENABLED(CONFIG_NF_NAT) &&
		    info->family == NFPROTO_IPV4) {
			if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
					    info->range.min_addr.ip) ||
			    (info->range.max_addr.ip
			     != info->range.min_addr.ip &&
			     (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
					      info->range.max_addr.ip))))
				return false;
		} else if (IS_ENABLED(CONFIG_IPV6) &&
			   info->family == NFPROTO_IPV6) {
			if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
					     &info->range.min_addr.in6) ||
			    (memcmp(&info->range.max_addr.in6,
				    &info->range.min_addr.in6,
				    sizeof(info->range.max_addr.in6)) &&
			     (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
					       &info->range.max_addr.in6))))
				return false;
		} else {
			return false;
		}
	}
	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
	    (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
			 ntohs(info->range.min_proto.all)) ||
	     (info->range.max_proto.all != info->range.min_proto.all &&
	      nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
			  ntohs(info->range.max_proto.all)))))
		return false;

	if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
	    nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
		return false;
	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
		return false;
	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
		return false;
out:
	nla_nest_end(skb, start);

	return true;
}
#endif
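/* Note (added commentary, not from the original source): the attribute
 * writers above and below are the inverse of parse_ct()/parse_nat(); a
 * dumped flow must re-parse to the same ovs_conntrack_info, which is why
 * values parse_nat() defaults (e.g. IP_MAX equal to IP_MIN) are omitted.
 */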
int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
			  struct sk_buff *skb)
{
	struct nlattr *start;

	start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CT);
	if (!start)
		return -EMSGSIZE;

	if (ct_info->commit && nla_put_flag(skb, ct_info->force
					    ? OVS_CT_ATTR_FORCE_COMMIT
					    : OVS_CT_ATTR_COMMIT))
		return -EMSGSIZE;
	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
	    nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
		return -EMSGSIZE;
	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask &&
	    nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark),
		    &ct_info->mark))
		return -EMSGSIZE;
	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
	    labels_nonzero(&ct_info->labels.mask) &&
	    nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels),
		    &ct_info->labels))
		return -EMSGSIZE;
	if (ct_info->helper) {
		if (nla_put_string(skb, OVS_CT_ATTR_HELPER,
				   ct_info->helper->name))
			return -EMSGSIZE;
	}
	if (ct_info->have_eventmask &&
	    nla_put_u32(skb, OVS_CT_ATTR_EVENTMASK, ct_info->eventmask))
		return -EMSGSIZE;
	if (ct_info->timeout[0]) {
		if (nla_put_string(skb, OVS_CT_ATTR_TIMEOUT, ct_info->timeout))
			return -EMSGSIZE;
	}

#if IS_ENABLED(CONFIG_NF_NAT)
	if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
		return -EMSGSIZE;
#endif
	nla_nest_end(skb, start);

	return 0;
}

void ovs_ct_free_action(const struct nlattr *a)
{
	struct ovs_conntrack_info *ct_info = nla_data(a);

	__ovs_ct_free_action(ct_info);
}

static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
{
	if (ct_info->helper) {
#if IS_ENABLED(CONFIG_NF_NAT)
		if (ct_info->nat)
			nf_nat_helper_put(ct_info->helper);
#endif
		nf_conntrack_helper_put(ct_info->helper);
	}
	if (ct_info->ct) {
		if (ct_info->timeout[0])
			nf_ct_destroy_timeout(ct_info->ct);
		nf_ct_tmpl_free(ct_info->ct);
	}
}

#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
{
	int i, err;

	ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info),
					 GFP_KERNEL);
	if (!ovs_net->ct_limit_info)
		return -ENOMEM;

	ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT;
	ovs_net->ct_limit_info->limits =
		kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head),
			      GFP_KERNEL);
	if (!ovs_net->ct_limit_info->limits) {
		kfree(ovs_net->ct_limit_info);
		return -ENOMEM;
	}

	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
		INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);

	ovs_net->ct_limit_info->data =
		nf_conncount_init(net, NFPROTO_INET, sizeof(u32));

	if (IS_ERR(ovs_net->ct_limit_info->data)) {
		err = PTR_ERR(ovs_net->ct_limit_info->data);
		kfree(ovs_net->ct_limit_info->limits);
		kfree(ovs_net->ct_limit_info);
		pr_err("openvswitch: failed to init nf_conncount %d\n", err);
		return err;
	}
	return 0;
}

static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
{
	const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info;
	int i;

	nf_conncount_destroy(net, NFPROTO_INET, info->data);
	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
		struct hlist_head *head = &info->limits[i];
		struct ovs_ct_limit *ct_limit;

		hlist_for_each_entry_rcu(ct_limit, head, hlist_node)
			kfree_rcu(ct_limit, rcu);
	}
	kfree(ovs_net->ct_limit_info->limits);
	kfree(ovs_net->ct_limit_info);
}
static struct sk_buff *
ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd,
			     struct ovs_header **ovs_reply_header)
{
	struct ovs_header *ovs_header = info->userhdr;
	struct sk_buff *skb;

	skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!skb)
		return ERR_PTR(-ENOMEM);

	*ovs_reply_header = genlmsg_put(skb, info->snd_portid,
					info->snd_seq,
					&dp_ct_limit_genl_family, 0, cmd);

	if (!*ovs_reply_header) {
		nlmsg_free(skb);
		return ERR_PTR(-EMSGSIZE);
	}
	(*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex;

	return skb;
}

static bool check_zone_id(int zone_id, u16 *pzone)
{
	if (zone_id >= 0 && zone_id <= 65535) {
		*pzone = (u16)zone_id;
		return true;
	}
	return false;
}

static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit,
				       struct ovs_ct_limit_info *info)
{
	struct ovs_zone_limit *zone_limit;
	int rem;
	u16 zone;

	rem = NLA_ALIGN(nla_len(nla_zone_limit));
	zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);

	while (rem >= sizeof(*zone_limit)) {
		if (unlikely(zone_limit->zone_id ==
				OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
			ovs_lock();
			info->default_limit = zone_limit->limit;
			ovs_unlock();
		} else if (unlikely(!check_zone_id(
				zone_limit->zone_id, &zone))) {
			OVS_NLERR(true, "zone id is out of range");
		} else {
			struct ovs_ct_limit *ct_limit;

			ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL);
			if (!ct_limit)
				return -ENOMEM;

			ct_limit->zone = zone;
			ct_limit->limit = zone_limit->limit;

			ovs_lock();
			ct_limit_set(info, ct_limit);
			ovs_unlock();
		}
		rem -= NLA_ALIGN(sizeof(*zone_limit));
		zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
				NLA_ALIGN(sizeof(*zone_limit)));
	}

	if (rem)
		OVS_NLERR(true, "set zone limit has %d unknown bytes", rem);

	return 0;
}

static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
				       struct ovs_ct_limit_info *info)
{
	struct ovs_zone_limit *zone_limit;
	int rem;
	u16 zone;

	rem = NLA_ALIGN(nla_len(nla_zone_limit));
	zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);

	while (rem >= sizeof(*zone_limit)) {
		if (unlikely(zone_limit->zone_id ==
				OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
			ovs_lock();
			info->default_limit = OVS_CT_LIMIT_DEFAULT;
			ovs_unlock();
		} else if (unlikely(!check_zone_id(
				zone_limit->zone_id, &zone))) {
			OVS_NLERR(true, "zone id is out of range");
		} else {
			ovs_lock();
			ct_limit_del(info, zone);
			ovs_unlock();
		}
		rem -= NLA_ALIGN(sizeof(*zone_limit));
		zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
				NLA_ALIGN(sizeof(*zone_limit)));
	}

	if (rem)
		OVS_NLERR(true, "del zone limit has %d unknown bytes", rem);

	return 0;
}

static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
					  struct sk_buff *reply)
{
	struct ovs_zone_limit zone_limit;
	int err;

	zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE;
	zone_limit.limit = info->default_limit;
	err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
	if (err)
		return err;

	return 0;
}
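/* A hedged illustration (numbers invented for this comment) of the flat
 * struct ovs_zone_limit array walked by the set/del/get loops above and
 * below: a request carrying
 *   { .zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE, .limit = 100 },
 *   { .zone_id = 5,                           .limit = 10  }
 * sets the default per-zone limit to 100 and caps zone 5 at 10 connections.
 */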
static int __ovs_ct_limit_get_zone_limit(struct net *net,
					 struct nf_conncount_data *data,
					 u16 zone_id, u32 limit,
					 struct sk_buff *reply)
{
	struct nf_conntrack_zone ct_zone;
	struct ovs_zone_limit zone_limit;
	u32 conncount_key = zone_id;

	zone_limit.zone_id = zone_id;
	zone_limit.limit = limit;
	nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0);

	zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL,
					      &ct_zone);
	return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
}

static int ovs_ct_limit_get_zone_limit(struct net *net,
				       struct nlattr *nla_zone_limit,
				       struct ovs_ct_limit_info *info,
				       struct sk_buff *reply)
{
	struct ovs_zone_limit *zone_limit;
	int rem, err;
	u32 limit;
	u16 zone;

	rem = NLA_ALIGN(nla_len(nla_zone_limit));
	zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);

	while (rem >= sizeof(*zone_limit)) {
		if (unlikely(zone_limit->zone_id ==
				OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
			err = ovs_ct_limit_get_default_limit(info, reply);
			if (err)
				return err;
		} else if (unlikely(!check_zone_id(zone_limit->zone_id,
						   &zone))) {
			OVS_NLERR(true, "zone id is out of range");
		} else {
			rcu_read_lock();
			limit = ct_limit_get(info, zone);
			rcu_read_unlock();

			err = __ovs_ct_limit_get_zone_limit(
				net, info->data, zone, limit, reply);
			if (err)
				return err;
		}
		rem -= NLA_ALIGN(sizeof(*zone_limit));
		zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
				NLA_ALIGN(sizeof(*zone_limit)));
	}

	if (rem)
		OVS_NLERR(true, "get zone limit has %d unknown bytes", rem);

	return 0;
}

static int ovs_ct_limit_get_all_zone_limit(struct net *net,
					   struct ovs_ct_limit_info *info,
					   struct sk_buff *reply)
{
	struct ovs_ct_limit *ct_limit;
	struct hlist_head *head;
	int i, err = 0;

	err = ovs_ct_limit_get_default_limit(info, reply);
	if (err)
		return err;

	rcu_read_lock();
	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
		head = &info->limits[i];
		hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
			err = __ovs_ct_limit_get_zone_limit(net, info->data,
				ct_limit->zone, ct_limit->limit, reply);
			if (err)
				goto exit_err;
		}
	}

exit_err:
	rcu_read_unlock();
	return err;
}

static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
	struct nlattr **a = info->attrs;
	struct sk_buff *reply;
	struct ovs_header *ovs_reply_header;
	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
	int err;

	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET,
					     &ovs_reply_header);
	if (IS_ERR(reply))
		return PTR_ERR(reply);

	if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
		err = -EINVAL;
		goto exit_err;
	}

	err = ovs_ct_limit_set_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT],
					  ct_limit_info);
	if (err)
		goto exit_err;

	static_branch_enable(&ovs_ct_limit_enabled);

	genlmsg_end(reply, ovs_reply_header);
	return genlmsg_reply(reply, info);

exit_err:
	nlmsg_free(reply);
	return err;
}
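/* Deleting zone limits mirrors the set path, minus one detail: only
 * ovs_ct_limit_cmd_set() enables the ovs_ct_limit_enabled static branch,
 * so the per-packet limit check stays patched out until a limit is
 * configured for the first time.  Deleting limits afterwards does not
 * flip the branch back off.
 */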
static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
	struct nlattr **a = info->attrs;
	struct sk_buff *reply;
	struct ovs_header *ovs_reply_header;
	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
	int err;

	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL,
					     &ovs_reply_header);
	if (IS_ERR(reply))
		return PTR_ERR(reply);

	if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
		err = -EINVAL;
		goto exit_err;
	}

	err = ovs_ct_limit_del_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT],
					  ct_limit_info);
	if (err)
		goto exit_err;

	genlmsg_end(reply, ovs_reply_header);
	return genlmsg_reply(reply, info);

exit_err:
	nlmsg_free(reply);
	return err;
}

static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
	struct nlattr **a = info->attrs;
	struct nlattr *nla_reply;
	struct sk_buff *reply;
	struct ovs_header *ovs_reply_header;
	struct net *net = sock_net(skb->sk);
	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
	int err;

	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET,
					     &ovs_reply_header);
	if (IS_ERR(reply))
		return PTR_ERR(reply);

	nla_reply = nla_nest_start_noflag(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
	if (!nla_reply) {
		err = -EMSGSIZE;
		goto exit_err;
	}

	if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
		err = ovs_ct_limit_get_zone_limit(
			net, a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], ct_limit_info,
			reply);
		if (err)
			goto exit_err;
	} else {
		err = ovs_ct_limit_get_all_zone_limit(net, ct_limit_info,
						      reply);
		if (err)
			goto exit_err;
	}

	nla_nest_end(reply, nla_reply);
	genlmsg_end(reply, ovs_reply_header);
	return genlmsg_reply(reply, info);

exit_err:
	nlmsg_free(reply);
	return err;
}
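/* Generic netlink plumbing for the OVS_CT_LIMIT family.  SET and DEL are
 * gated by GENL_ADMIN_PERM (CAP_NET_ADMIN), while GET is open to
 * unprivileged users.  The family is registered per network namespace
 * (.netnsok) and allows concurrent handler execution (.parallel_ops);
 * the zone-limit state it manipulates lives in the per-netns struct
 * ovs_net.
 */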
static struct genl_ops ct_limit_genl_ops[] = {
	{ .cmd = OVS_CT_LIMIT_CMD_SET,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
					   * privilege. */
		.doit = ovs_ct_limit_cmd_set,
	},
	{ .cmd = OVS_CT_LIMIT_CMD_DEL,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
					   * privilege. */
		.doit = ovs_ct_limit_cmd_del,
	},
	{ .cmd = OVS_CT_LIMIT_CMD_GET,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = 0,		  /* OK for unprivileged users. */
		.doit = ovs_ct_limit_cmd_get,
	},
};

static const struct genl_multicast_group ovs_ct_limit_multicast_group = {
	.name = OVS_CT_LIMIT_MCGROUP,
};

struct genl_family dp_ct_limit_genl_family __ro_after_init = {
	.hdrsize = sizeof(struct ovs_header),
	.name = OVS_CT_LIMIT_FAMILY,
	.version = OVS_CT_LIMIT_VERSION,
	.maxattr = OVS_CT_LIMIT_ATTR_MAX,
	.policy = ct_limit_policy,
	.netnsok = true,
	.parallel_ops = true,
	.ops = ct_limit_genl_ops,
	.n_ops = ARRAY_SIZE(ct_limit_genl_ops),
	.mcgrps = &ovs_ct_limit_multicast_group,
	.n_mcgrps = 1,
	.module = THIS_MODULE,
};
#endif

int ovs_ct_init(struct net *net)
{
	unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);

	if (nf_connlabels_get(net, n_bits - 1)) {
		ovs_net->xt_label = false;
		OVS_NLERR(true, "Failed to set connlabel length");
	} else {
		ovs_net->xt_label = true;
	}

#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
	return ovs_ct_limit_init(net, ovs_net);
#else
	return 0;
#endif
}

void ovs_ct_exit(struct net *net)
{
	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);

#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
	ovs_ct_limit_exit(net, ovs_net);
#endif

	if (ovs_net->xt_label)
		nf_connlabels_put(net);
}