1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2015 Nicira, Inc. 4 */ 5 6 #include <linux/module.h> 7 #include <linux/openvswitch.h> 8 #include <linux/tcp.h> 9 #include <linux/udp.h> 10 #include <linux/sctp.h> 11 #include <linux/static_key.h> 12 #include <linux/string_helpers.h> 13 #include <net/ip.h> 14 #include <net/genetlink.h> 15 #include <net/netfilter/nf_conntrack_core.h> 16 #include <net/netfilter/nf_conntrack_count.h> 17 #include <net/netfilter/nf_conntrack_helper.h> 18 #include <net/netfilter/nf_conntrack_labels.h> 19 #include <net/netfilter/nf_conntrack_seqadj.h> 20 #include <net/netfilter/nf_conntrack_timeout.h> 21 #include <net/netfilter/nf_conntrack_zones.h> 22 #include <net/netfilter/ipv6/nf_defrag_ipv6.h> 23 #include <net/ipv6_frag.h> 24 25 #if IS_ENABLED(CONFIG_NF_NAT) 26 #include <net/netfilter/nf_nat.h> 27 #endif 28 29 #include <net/netfilter/nf_conntrack_act_ct.h> 30 31 #include "datapath.h" 32 #include "conntrack.h" 33 #include "flow.h" 34 #include "flow_netlink.h" 35 36 struct ovs_ct_len_tbl { 37 int maxlen; 38 int minlen; 39 }; 40 41 /* Metadata mark for masked write to conntrack mark */ 42 struct md_mark { 43 u32 value; 44 u32 mask; 45 }; 46 47 /* Metadata label for masked write to conntrack label. */ 48 struct md_labels { 49 struct ovs_key_ct_labels value; 50 struct ovs_key_ct_labels mask; 51 }; 52 53 enum ovs_ct_nat { 54 OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */ 55 OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */ 56 OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */ 57 }; 58 59 /* Conntrack action context for execution. */ 60 struct ovs_conntrack_info { 61 struct nf_conntrack_helper *helper; 62 struct nf_conntrack_zone zone; 63 struct nf_conn *ct; 64 u8 commit : 1; 65 u8 nat : 3; /* enum ovs_ct_nat */ 66 u8 force : 1; 67 u8 have_eventmask : 1; 68 u16 family; 69 u32 eventmask; /* Mask of 1 << IPCT_*. */ 70 struct md_mark mark; 71 struct md_labels labels; 72 char timeout[CTNL_TIMEOUT_NAME_MAX]; 73 struct nf_ct_timeout *nf_ct_timeout; 74 #if IS_ENABLED(CONFIG_NF_NAT) 75 struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */ 76 #endif 77 }; 78 79 #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) 80 #define OVS_CT_LIMIT_UNLIMITED 0 81 #define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED 82 #define CT_LIMIT_HASH_BUCKETS 512 83 static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled); 84 85 struct ovs_ct_limit { 86 /* Elements in ovs_ct_limit_info->limits hash table */ 87 struct hlist_node hlist_node; 88 struct rcu_head rcu; 89 u16 zone; 90 u32 limit; 91 }; 92 93 struct ovs_ct_limit_info { 94 u32 default_limit; 95 struct hlist_head *limits; 96 struct nf_conncount_data *data; 97 }; 98 99 static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = { 100 [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, }, 101 }; 102 #endif 103 104 static bool labels_nonzero(const struct ovs_key_ct_labels *labels); 105 106 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); 107 108 static u16 key_to_nfproto(const struct sw_flow_key *key) 109 { 110 switch (ntohs(key->eth.type)) { 111 case ETH_P_IP: 112 return NFPROTO_IPV4; 113 case ETH_P_IPV6: 114 return NFPROTO_IPV6; 115 default: 116 return NFPROTO_UNSPEC; 117 } 118 } 119 120 /* Map SKB connection state into the values used by flow definition. */ 121 static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) 122 { 123 u8 ct_state = OVS_CS_F_TRACKED; 124 125 switch (ctinfo) { 126 case IP_CT_ESTABLISHED_REPLY: 127 case IP_CT_RELATED_REPLY: 128 ct_state |= OVS_CS_F_REPLY_DIR; 129 break; 130 default: 131 break; 132 } 133 134 switch (ctinfo) { 135 case IP_CT_ESTABLISHED: 136 case IP_CT_ESTABLISHED_REPLY: 137 ct_state |= OVS_CS_F_ESTABLISHED; 138 break; 139 case IP_CT_RELATED: 140 case IP_CT_RELATED_REPLY: 141 ct_state |= OVS_CS_F_RELATED; 142 break; 143 case IP_CT_NEW: 144 ct_state |= OVS_CS_F_NEW; 145 break; 146 default: 147 break; 148 } 149 150 return ct_state; 151 } 152 153 static u32 ovs_ct_get_mark(const struct nf_conn *ct) 154 { 155 #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) 156 return ct ? READ_ONCE(ct->mark) : 0; 157 #else 158 return 0; 159 #endif 160 } 161 162 /* Guard against conntrack labels max size shrinking below 128 bits. */ 163 #if NF_CT_LABELS_MAX_SIZE < 16 164 #error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes 165 #endif 166 167 static void ovs_ct_get_labels(const struct nf_conn *ct, 168 struct ovs_key_ct_labels *labels) 169 { 170 struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; 171 172 if (cl) 173 memcpy(labels, cl->bits, OVS_CT_LABELS_LEN); 174 else 175 memset(labels, 0, OVS_CT_LABELS_LEN); 176 } 177 178 static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key, 179 const struct nf_conntrack_tuple *orig, 180 u8 icmp_proto) 181 { 182 key->ct_orig_proto = orig->dst.protonum; 183 if (orig->dst.protonum == icmp_proto) { 184 key->ct.orig_tp.src = htons(orig->dst.u.icmp.type); 185 key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code); 186 } else { 187 key->ct.orig_tp.src = orig->src.u.all; 188 key->ct.orig_tp.dst = orig->dst.u.all; 189 } 190 } 191 192 static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, 193 const struct nf_conntrack_zone *zone, 194 const struct nf_conn *ct) 195 { 196 key->ct_state = state; 197 key->ct_zone = zone->id; 198 key->ct.mark = ovs_ct_get_mark(ct); 199 ovs_ct_get_labels(ct, &key->ct.labels); 200 201 if (ct) { 202 const struct nf_conntrack_tuple *orig; 203 204 /* Use the master if we have one. */ 205 if (ct->master) 206 ct = ct->master; 207 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 208 209 /* IP version must match with the master connection. */ 210 if (key->eth.type == htons(ETH_P_IP) && 211 nf_ct_l3num(ct) == NFPROTO_IPV4) { 212 key->ipv4.ct_orig.src = orig->src.u3.ip; 213 key->ipv4.ct_orig.dst = orig->dst.u3.ip; 214 __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP); 215 return; 216 } else if (key->eth.type == htons(ETH_P_IPV6) && 217 !sw_flow_key_is_nd(key) && 218 nf_ct_l3num(ct) == NFPROTO_IPV6) { 219 key->ipv6.ct_orig.src = orig->src.u3.in6; 220 key->ipv6.ct_orig.dst = orig->dst.u3.in6; 221 __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP); 222 return; 223 } 224 } 225 /* Clear 'ct_orig_proto' to mark the non-existence of conntrack 226 * original direction key fields. 227 */ 228 key->ct_orig_proto = 0; 229 } 230 231 /* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has 232 * previously sent the packet to conntrack via the ct action. If 233 * 'keep_nat_flags' is true, the existing NAT flags retained, else they are 234 * initialized from the connection status. 235 */ 236 static void ovs_ct_update_key(const struct sk_buff *skb, 237 const struct ovs_conntrack_info *info, 238 struct sw_flow_key *key, bool post_ct, 239 bool keep_nat_flags) 240 { 241 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 242 enum ip_conntrack_info ctinfo; 243 struct nf_conn *ct; 244 u8 state = 0; 245 246 ct = nf_ct_get(skb, &ctinfo); 247 if (ct) { 248 state = ovs_ct_get_state(ctinfo); 249 /* All unconfirmed entries are NEW connections. */ 250 if (!nf_ct_is_confirmed(ct)) 251 state |= OVS_CS_F_NEW; 252 /* OVS persists the related flag for the duration of the 253 * connection. 254 */ 255 if (ct->master) 256 state |= OVS_CS_F_RELATED; 257 if (keep_nat_flags) { 258 state |= key->ct_state & OVS_CS_F_NAT_MASK; 259 } else { 260 if (ct->status & IPS_SRC_NAT) 261 state |= OVS_CS_F_SRC_NAT; 262 if (ct->status & IPS_DST_NAT) 263 state |= OVS_CS_F_DST_NAT; 264 } 265 zone = nf_ct_zone(ct); 266 } else if (post_ct) { 267 state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; 268 if (info) 269 zone = &info->zone; 270 } 271 __ovs_ct_update_key(key, state, zone, ct); 272 } 273 274 /* This is called to initialize CT key fields possibly coming in from the local 275 * stack. 276 */ 277 void ovs_ct_fill_key(const struct sk_buff *skb, 278 struct sw_flow_key *key, 279 bool post_ct) 280 { 281 ovs_ct_update_key(skb, NULL, key, post_ct, false); 282 } 283 284 int ovs_ct_put_key(const struct sw_flow_key *swkey, 285 const struct sw_flow_key *output, struct sk_buff *skb) 286 { 287 if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state)) 288 return -EMSGSIZE; 289 290 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && 291 nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone)) 292 return -EMSGSIZE; 293 294 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && 295 nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark)) 296 return -EMSGSIZE; 297 298 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && 299 nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels), 300 &output->ct.labels)) 301 return -EMSGSIZE; 302 303 if (swkey->ct_orig_proto) { 304 if (swkey->eth.type == htons(ETH_P_IP)) { 305 struct ovs_key_ct_tuple_ipv4 orig; 306 307 memset(&orig, 0, sizeof(orig)); 308 orig.ipv4_src = output->ipv4.ct_orig.src; 309 orig.ipv4_dst = output->ipv4.ct_orig.dst; 310 orig.src_port = output->ct.orig_tp.src; 311 orig.dst_port = output->ct.orig_tp.dst; 312 orig.ipv4_proto = output->ct_orig_proto; 313 314 if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, 315 sizeof(orig), &orig)) 316 return -EMSGSIZE; 317 } else if (swkey->eth.type == htons(ETH_P_IPV6)) { 318 struct ovs_key_ct_tuple_ipv6 orig; 319 320 memset(&orig, 0, sizeof(orig)); 321 memcpy(orig.ipv6_src, output->ipv6.ct_orig.src.s6_addr32, 322 sizeof(orig.ipv6_src)); 323 memcpy(orig.ipv6_dst, output->ipv6.ct_orig.dst.s6_addr32, 324 sizeof(orig.ipv6_dst)); 325 orig.src_port = output->ct.orig_tp.src; 326 orig.dst_port = output->ct.orig_tp.dst; 327 orig.ipv6_proto = output->ct_orig_proto; 328 329 if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, 330 sizeof(orig), &orig)) 331 return -EMSGSIZE; 332 } 333 } 334 335 return 0; 336 } 337 338 static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key, 339 u32 ct_mark, u32 mask) 340 { 341 #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) 342 u32 new_mark; 343 344 new_mark = ct_mark | (READ_ONCE(ct->mark) & ~(mask)); 345 if (READ_ONCE(ct->mark) != new_mark) { 346 WRITE_ONCE(ct->mark, new_mark); 347 if (nf_ct_is_confirmed(ct)) 348 nf_conntrack_event_cache(IPCT_MARK, ct); 349 key->ct.mark = new_mark; 350 } 351 352 return 0; 353 #else 354 return -ENOTSUPP; 355 #endif 356 } 357 358 static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct) 359 { 360 struct nf_conn_labels *cl; 361 362 cl = nf_ct_labels_find(ct); 363 if (!cl) { 364 nf_ct_labels_ext_add(ct); 365 cl = nf_ct_labels_find(ct); 366 } 367 368 return cl; 369 } 370 371 /* Initialize labels for a new, yet to be committed conntrack entry. Note that 372 * since the new connection is not yet confirmed, and thus no-one else has 373 * access to it's labels, we simply write them over. 374 */ 375 static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key, 376 const struct ovs_key_ct_labels *labels, 377 const struct ovs_key_ct_labels *mask) 378 { 379 struct nf_conn_labels *cl, *master_cl; 380 bool have_mask = labels_nonzero(mask); 381 382 /* Inherit master's labels to the related connection? */ 383 master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL; 384 385 if (!master_cl && !have_mask) 386 return 0; /* Nothing to do. */ 387 388 cl = ovs_ct_get_conn_labels(ct); 389 if (!cl) 390 return -ENOSPC; 391 392 /* Inherit the master's labels, if any. */ 393 if (master_cl) 394 *cl = *master_cl; 395 396 if (have_mask) { 397 u32 *dst = (u32 *)cl->bits; 398 int i; 399 400 for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) 401 dst[i] = (dst[i] & ~mask->ct_labels_32[i]) | 402 (labels->ct_labels_32[i] 403 & mask->ct_labels_32[i]); 404 } 405 406 /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the 407 * IPCT_LABEL bit is set in the event cache. 408 */ 409 nf_conntrack_event_cache(IPCT_LABEL, ct); 410 411 memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN); 412 413 return 0; 414 } 415 416 static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key, 417 const struct ovs_key_ct_labels *labels, 418 const struct ovs_key_ct_labels *mask) 419 { 420 struct nf_conn_labels *cl; 421 int err; 422 423 cl = ovs_ct_get_conn_labels(ct); 424 if (!cl) 425 return -ENOSPC; 426 427 err = nf_connlabels_replace(ct, labels->ct_labels_32, 428 mask->ct_labels_32, 429 OVS_CT_LABELS_LEN_32); 430 if (err) 431 return err; 432 433 memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN); 434 435 return 0; 436 } 437 438 static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key, 439 u16 zone, int family, struct sk_buff *skb) 440 { 441 struct ovs_skb_cb ovs_cb = *OVS_CB(skb); 442 int err; 443 444 err = nf_ct_handle_fragments(net, skb, zone, family, &key->ip.proto, &ovs_cb.mru); 445 if (err) 446 return err; 447 448 /* The key extracted from the fragment that completed this datagram 449 * likely didn't have an L4 header, so regenerate it. 450 */ 451 ovs_flow_key_update_l3l4(skb, key); 452 key->ip.frag = OVS_FRAG_TYPE_NONE; 453 *OVS_CB(skb) = ovs_cb; 454 455 return 0; 456 } 457 458 static struct nf_conntrack_expect * 459 ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, 460 u16 proto, const struct sk_buff *skb) 461 { 462 struct nf_conntrack_tuple tuple; 463 struct nf_conntrack_expect *exp; 464 465 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) 466 return NULL; 467 468 exp = __nf_ct_expect_find(net, zone, &tuple); 469 if (exp) { 470 struct nf_conntrack_tuple_hash *h; 471 472 /* Delete existing conntrack entry, if it clashes with the 473 * expectation. This can happen since conntrack ALGs do not 474 * check for clashes between (new) expectations and existing 475 * conntrack entries. nf_conntrack_in() will check the 476 * expectations only if a conntrack entry can not be found, 477 * which can lead to OVS finding the expectation (here) in the 478 * init direction, but which will not be removed by the 479 * nf_conntrack_in() call, if a matching conntrack entry is 480 * found instead. In this case all init direction packets 481 * would be reported as new related packets, while reply 482 * direction packets would be reported as un-related 483 * established packets. 484 */ 485 h = nf_conntrack_find_get(net, zone, &tuple); 486 if (h) { 487 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 488 489 nf_ct_delete(ct, 0, 0); 490 nf_ct_put(ct); 491 } 492 } 493 494 return exp; 495 } 496 497 /* This replicates logic from nf_conntrack_core.c that is not exported. */ 498 static enum ip_conntrack_info 499 ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) 500 { 501 const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 502 503 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) 504 return IP_CT_ESTABLISHED_REPLY; 505 /* Once we've had two way comms, always ESTABLISHED. */ 506 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 507 return IP_CT_ESTABLISHED; 508 if (test_bit(IPS_EXPECTED_BIT, &ct->status)) 509 return IP_CT_RELATED; 510 return IP_CT_NEW; 511 } 512 513 /* Find an existing connection which this packet belongs to without 514 * re-attributing statistics or modifying the connection state. This allows an 515 * skb->_nfct lost due to an upcall to be recovered during actions execution. 516 * 517 * Must be called with rcu_read_lock. 518 * 519 * On success, populates skb->_nfct and returns the connection. Returns NULL 520 * if there is no existing entry. 521 */ 522 static struct nf_conn * 523 ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, 524 u8 l3num, struct sk_buff *skb, bool natted) 525 { 526 struct nf_conntrack_tuple tuple; 527 struct nf_conntrack_tuple_hash *h; 528 struct nf_conn *ct; 529 530 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, 531 net, &tuple)) { 532 pr_debug("ovs_ct_find_existing: Can't get tuple\n"); 533 return NULL; 534 } 535 536 /* Must invert the tuple if skb has been transformed by NAT. */ 537 if (natted) { 538 struct nf_conntrack_tuple inverse; 539 540 if (!nf_ct_invert_tuple(&inverse, &tuple)) { 541 pr_debug("ovs_ct_find_existing: Inversion failed!\n"); 542 return NULL; 543 } 544 tuple = inverse; 545 } 546 547 /* look for tuple match */ 548 h = nf_conntrack_find_get(net, zone, &tuple); 549 if (!h) 550 return NULL; /* Not found. */ 551 552 ct = nf_ct_tuplehash_to_ctrack(h); 553 554 /* Inverted packet tuple matches the reverse direction conntrack tuple, 555 * select the other tuplehash to get the right 'ctinfo' bits for this 556 * packet. 557 */ 558 if (natted) 559 h = &ct->tuplehash[!h->tuple.dst.dir]; 560 561 nf_ct_set(skb, ct, ovs_ct_get_info(h)); 562 return ct; 563 } 564 565 static 566 struct nf_conn *ovs_ct_executed(struct net *net, 567 const struct sw_flow_key *key, 568 const struct ovs_conntrack_info *info, 569 struct sk_buff *skb, 570 bool *ct_executed) 571 { 572 struct nf_conn *ct = NULL; 573 574 /* If no ct, check if we have evidence that an existing conntrack entry 575 * might be found for this skb. This happens when we lose a skb->_nfct 576 * due to an upcall, or if the direction is being forced. If the 577 * connection was not confirmed, it is not cached and needs to be run 578 * through conntrack again. 579 */ 580 *ct_executed = (key->ct_state & OVS_CS_F_TRACKED) && 581 !(key->ct_state & OVS_CS_F_INVALID) && 582 (key->ct_zone == info->zone.id); 583 584 if (*ct_executed || (!key->ct_state && info->force)) { 585 ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, 586 !!(key->ct_state & 587 OVS_CS_F_NAT_MASK)); 588 } 589 590 return ct; 591 } 592 593 /* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ 594 static bool skb_nfct_cached(struct net *net, 595 const struct sw_flow_key *key, 596 const struct ovs_conntrack_info *info, 597 struct sk_buff *skb) 598 { 599 enum ip_conntrack_info ctinfo; 600 struct nf_conn *ct; 601 bool ct_executed = true; 602 603 ct = nf_ct_get(skb, &ctinfo); 604 if (!ct) 605 ct = ovs_ct_executed(net, key, info, skb, &ct_executed); 606 607 if (ct) 608 nf_ct_get(skb, &ctinfo); 609 else 610 return false; 611 612 if (!net_eq(net, read_pnet(&ct->ct_net))) 613 return false; 614 if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) 615 return false; 616 if (info->helper) { 617 struct nf_conn_help *help; 618 619 help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); 620 if (help && rcu_access_pointer(help->helper) != info->helper) 621 return false; 622 } 623 if (info->nf_ct_timeout) { 624 struct nf_conn_timeout *timeout_ext; 625 626 timeout_ext = nf_ct_timeout_find(ct); 627 if (!timeout_ext || info->nf_ct_timeout != 628 rcu_dereference(timeout_ext->timeout)) 629 return false; 630 } 631 /* Force conntrack entry direction to the current packet? */ 632 if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { 633 /* Delete the conntrack entry if confirmed, else just release 634 * the reference. 635 */ 636 if (nf_ct_is_confirmed(ct)) 637 nf_ct_delete(ct, 0, 0); 638 639 nf_ct_put(ct); 640 nf_ct_set(skb, NULL, 0); 641 return false; 642 } 643 644 return ct_executed; 645 } 646 647 #if IS_ENABLED(CONFIG_NF_NAT) 648 static void ovs_nat_update_key(struct sw_flow_key *key, 649 const struct sk_buff *skb, 650 enum nf_nat_manip_type maniptype) 651 { 652 if (maniptype == NF_NAT_MANIP_SRC) { 653 __be16 src; 654 655 key->ct_state |= OVS_CS_F_SRC_NAT; 656 if (key->eth.type == htons(ETH_P_IP)) 657 key->ipv4.addr.src = ip_hdr(skb)->saddr; 658 else if (key->eth.type == htons(ETH_P_IPV6)) 659 memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr, 660 sizeof(key->ipv6.addr.src)); 661 else 662 return; 663 664 if (key->ip.proto == IPPROTO_UDP) 665 src = udp_hdr(skb)->source; 666 else if (key->ip.proto == IPPROTO_TCP) 667 src = tcp_hdr(skb)->source; 668 else if (key->ip.proto == IPPROTO_SCTP) 669 src = sctp_hdr(skb)->source; 670 else 671 return; 672 673 key->tp.src = src; 674 } else { 675 __be16 dst; 676 677 key->ct_state |= OVS_CS_F_DST_NAT; 678 if (key->eth.type == htons(ETH_P_IP)) 679 key->ipv4.addr.dst = ip_hdr(skb)->daddr; 680 else if (key->eth.type == htons(ETH_P_IPV6)) 681 memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr, 682 sizeof(key->ipv6.addr.dst)); 683 else 684 return; 685 686 if (key->ip.proto == IPPROTO_UDP) 687 dst = udp_hdr(skb)->dest; 688 else if (key->ip.proto == IPPROTO_TCP) 689 dst = tcp_hdr(skb)->dest; 690 else if (key->ip.proto == IPPROTO_SCTP) 691 dst = sctp_hdr(skb)->dest; 692 else 693 return; 694 695 key->tp.dst = dst; 696 } 697 } 698 699 /* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */ 700 static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, 701 const struct ovs_conntrack_info *info, 702 struct sk_buff *skb, struct nf_conn *ct, 703 enum ip_conntrack_info ctinfo) 704 { 705 int err, action = 0; 706 707 if (!(info->nat & OVS_CT_NAT)) 708 return NF_ACCEPT; 709 if (info->nat & OVS_CT_SRC_NAT) 710 action |= BIT(NF_NAT_MANIP_SRC); 711 if (info->nat & OVS_CT_DST_NAT) 712 action |= BIT(NF_NAT_MANIP_DST); 713 714 err = nf_ct_nat(skb, ct, ctinfo, &action, &info->range, info->commit); 715 716 if (action & BIT(NF_NAT_MANIP_SRC)) 717 ovs_nat_update_key(key, skb, NF_NAT_MANIP_SRC); 718 if (action & BIT(NF_NAT_MANIP_DST)) 719 ovs_nat_update_key(key, skb, NF_NAT_MANIP_DST); 720 721 return err; 722 } 723 #else /* !CONFIG_NF_NAT */ 724 static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, 725 const struct ovs_conntrack_info *info, 726 struct sk_buff *skb, struct nf_conn *ct, 727 enum ip_conntrack_info ctinfo) 728 { 729 return NF_ACCEPT; 730 } 731 #endif 732 733 /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if 734 * not done already. Update key with new CT state after passing the packet 735 * through conntrack. 736 * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be 737 * set to NULL and 0 will be returned. 738 */ 739 static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, 740 const struct ovs_conntrack_info *info, 741 struct sk_buff *skb) 742 { 743 /* If we are recirculating packets to match on conntrack fields and 744 * committing with a separate conntrack action, then we don't need to 745 * actually run the packet through conntrack twice unless it's for a 746 * different zone. 747 */ 748 bool cached = skb_nfct_cached(net, key, info, skb); 749 enum ip_conntrack_info ctinfo; 750 struct nf_conn *ct; 751 752 if (!cached) { 753 struct nf_hook_state state = { 754 .hook = NF_INET_PRE_ROUTING, 755 .pf = info->family, 756 .net = net, 757 }; 758 struct nf_conn *tmpl = info->ct; 759 int err; 760 761 /* Associate skb with specified zone. */ 762 if (tmpl) { 763 ct = nf_ct_get(skb, &ctinfo); 764 nf_ct_put(ct); 765 nf_conntrack_get(&tmpl->ct_general); 766 nf_ct_set(skb, tmpl, IP_CT_NEW); 767 } 768 769 err = nf_conntrack_in(skb, &state); 770 if (err != NF_ACCEPT) 771 return -ENOENT; 772 773 /* Clear CT state NAT flags to mark that we have not yet done 774 * NAT after the nf_conntrack_in() call. We can actually clear 775 * the whole state, as it will be re-initialized below. 776 */ 777 key->ct_state = 0; 778 779 /* Update the key, but keep the NAT flags. */ 780 ovs_ct_update_key(skb, info, key, true, true); 781 } 782 783 ct = nf_ct_get(skb, &ctinfo); 784 if (ct) { 785 bool add_helper = false; 786 787 /* Packets starting a new connection must be NATted before the 788 * helper, so that the helper knows about the NAT. We enforce 789 * this by delaying both NAT and helper calls for unconfirmed 790 * connections until the committing CT action. For later 791 * packets NAT and Helper may be called in either order. 792 * 793 * NAT will be done only if the CT action has NAT, and only 794 * once per packet (per zone), as guarded by the NAT bits in 795 * the key->ct_state. 796 */ 797 if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) && 798 (nf_ct_is_confirmed(ct) || info->commit) && 799 ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) { 800 return -EINVAL; 801 } 802 803 /* Userspace may decide to perform a ct lookup without a helper 804 * specified followed by a (recirculate and) commit with one, 805 * or attach a helper in a later commit. Therefore, for 806 * connections which we will commit, we may need to attach 807 * the helper here. 808 */ 809 if (!nf_ct_is_confirmed(ct) && info->commit && 810 info->helper && !nfct_help(ct)) { 811 int err = __nf_ct_try_assign_helper(ct, info->ct, 812 GFP_ATOMIC); 813 if (err) 814 return err; 815 add_helper = true; 816 817 /* helper installed, add seqadj if NAT is required */ 818 if (info->nat && !nfct_seqadj(ct)) { 819 if (!nfct_seqadj_ext_add(ct)) 820 return -EINVAL; 821 } 822 } 823 824 /* Call the helper only if: 825 * - nf_conntrack_in() was executed above ("!cached") or a 826 * helper was just attached ("add_helper") for a confirmed 827 * connection, or 828 * - When committing an unconfirmed connection. 829 */ 830 if ((nf_ct_is_confirmed(ct) ? !cached || add_helper : 831 info->commit) && 832 nf_ct_helper(skb, ct, ctinfo, info->family) != NF_ACCEPT) { 833 return -EINVAL; 834 } 835 836 if (nf_ct_protonum(ct) == IPPROTO_TCP && 837 nf_ct_is_confirmed(ct) && nf_conntrack_tcp_established(ct)) { 838 /* Be liberal for tcp packets so that out-of-window 839 * packets are not marked invalid. 840 */ 841 nf_ct_set_tcp_be_liberal(ct); 842 } 843 844 nf_conn_act_ct_ext_fill(skb, ct, ctinfo); 845 } 846 847 return 0; 848 } 849 850 /* Lookup connection and read fields into key. */ 851 static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, 852 const struct ovs_conntrack_info *info, 853 struct sk_buff *skb) 854 { 855 struct nf_conntrack_expect *exp; 856 857 /* If we pass an expected packet through nf_conntrack_in() the 858 * expectation is typically removed, but the packet could still be 859 * lost in upcall processing. To prevent this from happening we 860 * perform an explicit expectation lookup. Expected connections are 861 * always new, and will be passed through conntrack only when they are 862 * committed, as it is OK to remove the expectation at that time. 863 */ 864 exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); 865 if (exp) { 866 u8 state; 867 868 /* NOTE: New connections are NATted and Helped only when 869 * committed, so we are not calling into NAT here. 870 */ 871 state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; 872 __ovs_ct_update_key(key, state, &info->zone, exp->master); 873 } else { 874 struct nf_conn *ct; 875 int err; 876 877 err = __ovs_ct_lookup(net, key, info, skb); 878 if (err) 879 return err; 880 881 ct = (struct nf_conn *)skb_nfct(skb); 882 if (ct) 883 nf_ct_deliver_cached_events(ct); 884 } 885 886 return 0; 887 } 888 889 static bool labels_nonzero(const struct ovs_key_ct_labels *labels) 890 { 891 size_t i; 892 893 for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) 894 if (labels->ct_labels_32[i]) 895 return true; 896 897 return false; 898 } 899 900 #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) 901 static struct hlist_head *ct_limit_hash_bucket( 902 const struct ovs_ct_limit_info *info, u16 zone) 903 { 904 return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)]; 905 } 906 907 /* Call with ovs_mutex */ 908 static void ct_limit_set(const struct ovs_ct_limit_info *info, 909 struct ovs_ct_limit *new_ct_limit) 910 { 911 struct ovs_ct_limit *ct_limit; 912 struct hlist_head *head; 913 914 head = ct_limit_hash_bucket(info, new_ct_limit->zone); 915 hlist_for_each_entry_rcu(ct_limit, head, hlist_node) { 916 if (ct_limit->zone == new_ct_limit->zone) { 917 hlist_replace_rcu(&ct_limit->hlist_node, 918 &new_ct_limit->hlist_node); 919 kfree_rcu(ct_limit, rcu); 920 return; 921 } 922 } 923 924 hlist_add_head_rcu(&new_ct_limit->hlist_node, head); 925 } 926 927 /* Call with ovs_mutex */ 928 static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone) 929 { 930 struct ovs_ct_limit *ct_limit; 931 struct hlist_head *head; 932 struct hlist_node *n; 933 934 head = ct_limit_hash_bucket(info, zone); 935 hlist_for_each_entry_safe(ct_limit, n, head, hlist_node) { 936 if (ct_limit->zone == zone) { 937 hlist_del_rcu(&ct_limit->hlist_node); 938 kfree_rcu(ct_limit, rcu); 939 return; 940 } 941 } 942 } 943 944 /* Call with RCU read lock */ 945 static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone) 946 { 947 struct ovs_ct_limit *ct_limit; 948 struct hlist_head *head; 949 950 head = ct_limit_hash_bucket(info, zone); 951 hlist_for_each_entry_rcu(ct_limit, head, hlist_node) { 952 if (ct_limit->zone == zone) 953 return ct_limit->limit; 954 } 955 956 return info->default_limit; 957 } 958 959 static int ovs_ct_check_limit(struct net *net, 960 const struct ovs_conntrack_info *info, 961 const struct nf_conntrack_tuple *tuple) 962 { 963 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 964 const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; 965 u32 per_zone_limit, connections; 966 u32 conncount_key; 967 968 conncount_key = info->zone.id; 969 970 per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id); 971 if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) 972 return 0; 973 974 connections = nf_conncount_count(net, ct_limit_info->data, 975 &conncount_key, tuple, &info->zone); 976 if (connections > per_zone_limit) 977 return -ENOMEM; 978 979 return 0; 980 } 981 #endif 982 983 /* Lookup connection and confirm if unconfirmed. */ 984 static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, 985 const struct ovs_conntrack_info *info, 986 struct sk_buff *skb) 987 { 988 enum ip_conntrack_info ctinfo; 989 struct nf_conn *ct; 990 int err; 991 992 err = __ovs_ct_lookup(net, key, info, skb); 993 if (err) 994 return err; 995 996 /* The connection could be invalid, in which case this is a no-op.*/ 997 ct = nf_ct_get(skb, &ctinfo); 998 if (!ct) 999 return 0; 1000 1001 #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) 1002 if (static_branch_unlikely(&ovs_ct_limit_enabled)) { 1003 if (!nf_ct_is_confirmed(ct)) { 1004 err = ovs_ct_check_limit(net, info, 1005 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 1006 if (err) { 1007 net_warn_ratelimited("openvswitch: zone: %u " 1008 "exceeds conntrack limit\n", 1009 info->zone.id); 1010 return err; 1011 } 1012 } 1013 } 1014 #endif 1015 1016 /* Set the conntrack event mask if given. NEW and DELETE events have 1017 * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener 1018 * typically would receive many kinds of updates. Setting the event 1019 * mask allows those events to be filtered. The set event mask will 1020 * remain in effect for the lifetime of the connection unless changed 1021 * by a further CT action with both the commit flag and the eventmask 1022 * option. */ 1023 if (info->have_eventmask) { 1024 struct nf_conntrack_ecache *cache = nf_ct_ecache_find(ct); 1025 1026 if (cache) 1027 cache->ctmask = info->eventmask; 1028 } 1029 1030 /* Apply changes before confirming the connection so that the initial 1031 * conntrack NEW netlink event carries the values given in the CT 1032 * action. 1033 */ 1034 if (info->mark.mask) { 1035 err = ovs_ct_set_mark(ct, key, info->mark.value, 1036 info->mark.mask); 1037 if (err) 1038 return err; 1039 } 1040 if (!nf_ct_is_confirmed(ct)) { 1041 err = ovs_ct_init_labels(ct, key, &info->labels.value, 1042 &info->labels.mask); 1043 if (err) 1044 return err; 1045 1046 nf_conn_act_ct_ext_add(ct); 1047 } else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && 1048 labels_nonzero(&info->labels.mask)) { 1049 err = ovs_ct_set_labels(ct, key, &info->labels.value, 1050 &info->labels.mask); 1051 if (err) 1052 return err; 1053 } 1054 /* This will take care of sending queued events even if the connection 1055 * is already confirmed. 1056 */ 1057 if (nf_conntrack_confirm(skb) != NF_ACCEPT) 1058 return -EINVAL; 1059 1060 return 0; 1061 } 1062 1063 /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero 1064 * value if 'skb' is freed. 1065 */ 1066 int ovs_ct_execute(struct net *net, struct sk_buff *skb, 1067 struct sw_flow_key *key, 1068 const struct ovs_conntrack_info *info) 1069 { 1070 int nh_ofs; 1071 int err; 1072 1073 /* The conntrack module expects to be working at L3. */ 1074 nh_ofs = skb_network_offset(skb); 1075 skb_pull_rcsum(skb, nh_ofs); 1076 1077 err = nf_ct_skb_network_trim(skb, info->family); 1078 if (err) { 1079 kfree_skb(skb); 1080 return err; 1081 } 1082 1083 if (key->ip.frag != OVS_FRAG_TYPE_NONE) { 1084 err = ovs_ct_handle_fragments(net, key, info->zone.id, 1085 info->family, skb); 1086 if (err) 1087 return err; 1088 } 1089 1090 if (info->commit) 1091 err = ovs_ct_commit(net, key, info, skb); 1092 else 1093 err = ovs_ct_lookup(net, key, info, skb); 1094 1095 skb_push_rcsum(skb, nh_ofs); 1096 if (err) 1097 kfree_skb(skb); 1098 return err; 1099 } 1100 1101 int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) 1102 { 1103 enum ip_conntrack_info ctinfo; 1104 struct nf_conn *ct; 1105 1106 ct = nf_ct_get(skb, &ctinfo); 1107 1108 nf_ct_put(ct); 1109 nf_ct_set(skb, NULL, IP_CT_UNTRACKED); 1110 1111 if (key) 1112 ovs_ct_fill_key(skb, key, false); 1113 1114 return 0; 1115 } 1116 1117 #if IS_ENABLED(CONFIG_NF_NAT) 1118 static int parse_nat(const struct nlattr *attr, 1119 struct ovs_conntrack_info *info, bool log) 1120 { 1121 struct nlattr *a; 1122 int rem; 1123 bool have_ip_max = false; 1124 bool have_proto_max = false; 1125 bool ip_vers = (info->family == NFPROTO_IPV6); 1126 1127 nla_for_each_nested(a, attr, rem) { 1128 static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = { 1129 [OVS_NAT_ATTR_SRC] = {0, 0}, 1130 [OVS_NAT_ATTR_DST] = {0, 0}, 1131 [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr), 1132 sizeof(struct in6_addr)}, 1133 [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr), 1134 sizeof(struct in6_addr)}, 1135 [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)}, 1136 [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)}, 1137 [OVS_NAT_ATTR_PERSISTENT] = {0, 0}, 1138 [OVS_NAT_ATTR_PROTO_HASH] = {0, 0}, 1139 [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0}, 1140 }; 1141 int type = nla_type(a); 1142 1143 if (type > OVS_NAT_ATTR_MAX) { 1144 OVS_NLERR(log, "Unknown NAT attribute (type=%d, max=%d)", 1145 type, OVS_NAT_ATTR_MAX); 1146 return -EINVAL; 1147 } 1148 1149 if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) { 1150 OVS_NLERR(log, "NAT attribute type %d has unexpected length (%d != %d)", 1151 type, nla_len(a), 1152 ovs_nat_attr_lens[type][ip_vers]); 1153 return -EINVAL; 1154 } 1155 1156 switch (type) { 1157 case OVS_NAT_ATTR_SRC: 1158 case OVS_NAT_ATTR_DST: 1159 if (info->nat) { 1160 OVS_NLERR(log, "Only one type of NAT may be specified"); 1161 return -ERANGE; 1162 } 1163 info->nat |= OVS_CT_NAT; 1164 info->nat |= ((type == OVS_NAT_ATTR_SRC) 1165 ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT); 1166 break; 1167 1168 case OVS_NAT_ATTR_IP_MIN: 1169 nla_memcpy(&info->range.min_addr, a, 1170 sizeof(info->range.min_addr)); 1171 info->range.flags |= NF_NAT_RANGE_MAP_IPS; 1172 break; 1173 1174 case OVS_NAT_ATTR_IP_MAX: 1175 have_ip_max = true; 1176 nla_memcpy(&info->range.max_addr, a, 1177 sizeof(info->range.max_addr)); 1178 info->range.flags |= NF_NAT_RANGE_MAP_IPS; 1179 break; 1180 1181 case OVS_NAT_ATTR_PROTO_MIN: 1182 info->range.min_proto.all = htons(nla_get_u16(a)); 1183 info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 1184 break; 1185 1186 case OVS_NAT_ATTR_PROTO_MAX: 1187 have_proto_max = true; 1188 info->range.max_proto.all = htons(nla_get_u16(a)); 1189 info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 1190 break; 1191 1192 case OVS_NAT_ATTR_PERSISTENT: 1193 info->range.flags |= NF_NAT_RANGE_PERSISTENT; 1194 break; 1195 1196 case OVS_NAT_ATTR_PROTO_HASH: 1197 info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM; 1198 break; 1199 1200 case OVS_NAT_ATTR_PROTO_RANDOM: 1201 info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY; 1202 break; 1203 1204 default: 1205 OVS_NLERR(log, "Unknown nat attribute (%d)", type); 1206 return -EINVAL; 1207 } 1208 } 1209 1210 if (rem > 0) { 1211 OVS_NLERR(log, "NAT attribute has %d unknown bytes", rem); 1212 return -EINVAL; 1213 } 1214 if (!info->nat) { 1215 /* Do not allow flags if no type is given. */ 1216 if (info->range.flags) { 1217 OVS_NLERR(log, 1218 "NAT flags may be given only when NAT range (SRC or DST) is also specified." 1219 ); 1220 return -EINVAL; 1221 } 1222 info->nat = OVS_CT_NAT; /* NAT existing connections. */ 1223 } else if (!info->commit) { 1224 OVS_NLERR(log, 1225 "NAT attributes may be specified only when CT COMMIT flag is also specified." 1226 ); 1227 return -EINVAL; 1228 } 1229 /* Allow missing IP_MAX. */ 1230 if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) { 1231 memcpy(&info->range.max_addr, &info->range.min_addr, 1232 sizeof(info->range.max_addr)); 1233 } 1234 /* Allow missing PROTO_MAX. */ 1235 if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && 1236 !have_proto_max) { 1237 info->range.max_proto.all = info->range.min_proto.all; 1238 } 1239 return 0; 1240 } 1241 #endif 1242 1243 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { 1244 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, 1245 [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 }, 1246 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), 1247 .maxlen = sizeof(u16) }, 1248 [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), 1249 .maxlen = sizeof(struct md_mark) }, 1250 [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), 1251 .maxlen = sizeof(struct md_labels) }, 1252 [OVS_CT_ATTR_HELPER] = { .minlen = 1, 1253 .maxlen = NF_CT_HELPER_NAME_LEN }, 1254 #if IS_ENABLED(CONFIG_NF_NAT) 1255 /* NAT length is checked when parsing the nested attributes. */ 1256 [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, 1257 #endif 1258 [OVS_CT_ATTR_EVENTMASK] = { .minlen = sizeof(u32), 1259 .maxlen = sizeof(u32) }, 1260 [OVS_CT_ATTR_TIMEOUT] = { .minlen = 1, 1261 .maxlen = CTNL_TIMEOUT_NAME_MAX }, 1262 }; 1263 1264 static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, 1265 const char **helper, bool log) 1266 { 1267 struct nlattr *a; 1268 int rem; 1269 1270 nla_for_each_nested(a, attr, rem) { 1271 int type = nla_type(a); 1272 int maxlen; 1273 int minlen; 1274 1275 if (type > OVS_CT_ATTR_MAX) { 1276 OVS_NLERR(log, 1277 "Unknown conntrack attr (type=%d, max=%d)", 1278 type, OVS_CT_ATTR_MAX); 1279 return -EINVAL; 1280 } 1281 1282 maxlen = ovs_ct_attr_lens[type].maxlen; 1283 minlen = ovs_ct_attr_lens[type].minlen; 1284 if (nla_len(a) < minlen || nla_len(a) > maxlen) { 1285 OVS_NLERR(log, 1286 "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", 1287 type, nla_len(a), maxlen); 1288 return -EINVAL; 1289 } 1290 1291 switch (type) { 1292 case OVS_CT_ATTR_FORCE_COMMIT: 1293 info->force = true; 1294 fallthrough; 1295 case OVS_CT_ATTR_COMMIT: 1296 info->commit = true; 1297 break; 1298 #ifdef CONFIG_NF_CONNTRACK_ZONES 1299 case OVS_CT_ATTR_ZONE: 1300 info->zone.id = nla_get_u16(a); 1301 break; 1302 #endif 1303 #ifdef CONFIG_NF_CONNTRACK_MARK 1304 case OVS_CT_ATTR_MARK: { 1305 struct md_mark *mark = nla_data(a); 1306 1307 if (!mark->mask) { 1308 OVS_NLERR(log, "ct_mark mask cannot be 0"); 1309 return -EINVAL; 1310 } 1311 info->mark = *mark; 1312 break; 1313 } 1314 #endif 1315 #ifdef CONFIG_NF_CONNTRACK_LABELS 1316 case OVS_CT_ATTR_LABELS: { 1317 struct md_labels *labels = nla_data(a); 1318 1319 if (!labels_nonzero(&labels->mask)) { 1320 OVS_NLERR(log, "ct_labels mask cannot be 0"); 1321 return -EINVAL; 1322 } 1323 info->labels = *labels; 1324 break; 1325 } 1326 #endif 1327 case OVS_CT_ATTR_HELPER: 1328 *helper = nla_data(a); 1329 if (!string_is_terminated(*helper, nla_len(a))) { 1330 OVS_NLERR(log, "Invalid conntrack helper"); 1331 return -EINVAL; 1332 } 1333 break; 1334 #if IS_ENABLED(CONFIG_NF_NAT) 1335 case OVS_CT_ATTR_NAT: { 1336 int err = parse_nat(a, info, log); 1337 1338 if (err) 1339 return err; 1340 break; 1341 } 1342 #endif 1343 case OVS_CT_ATTR_EVENTMASK: 1344 info->have_eventmask = true; 1345 info->eventmask = nla_get_u32(a); 1346 break; 1347 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT 1348 case OVS_CT_ATTR_TIMEOUT: 1349 memcpy(info->timeout, nla_data(a), nla_len(a)); 1350 if (!string_is_terminated(info->timeout, nla_len(a))) { 1351 OVS_NLERR(log, "Invalid conntrack timeout"); 1352 return -EINVAL; 1353 } 1354 break; 1355 #endif 1356 1357 default: 1358 OVS_NLERR(log, "Unknown conntrack attr (%d)", 1359 type); 1360 return -EINVAL; 1361 } 1362 } 1363 1364 #ifdef CONFIG_NF_CONNTRACK_MARK 1365 if (!info->commit && info->mark.mask) { 1366 OVS_NLERR(log, 1367 "Setting conntrack mark requires 'commit' flag."); 1368 return -EINVAL; 1369 } 1370 #endif 1371 #ifdef CONFIG_NF_CONNTRACK_LABELS 1372 if (!info->commit && labels_nonzero(&info->labels.mask)) { 1373 OVS_NLERR(log, 1374 "Setting conntrack labels requires 'commit' flag."); 1375 return -EINVAL; 1376 } 1377 #endif 1378 if (rem > 0) { 1379 OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); 1380 return -EINVAL; 1381 } 1382 1383 return 0; 1384 } 1385 1386 bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) 1387 { 1388 if (attr == OVS_KEY_ATTR_CT_STATE) 1389 return true; 1390 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && 1391 attr == OVS_KEY_ATTR_CT_ZONE) 1392 return true; 1393 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && 1394 attr == OVS_KEY_ATTR_CT_MARK) 1395 return true; 1396 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && 1397 attr == OVS_KEY_ATTR_CT_LABELS) { 1398 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 1399 1400 return ovs_net->xt_label; 1401 } 1402 1403 return false; 1404 } 1405 1406 int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, 1407 const struct sw_flow_key *key, 1408 struct sw_flow_actions **sfa, bool log) 1409 { 1410 struct ovs_conntrack_info ct_info; 1411 const char *helper = NULL; 1412 u16 family; 1413 int err; 1414 1415 family = key_to_nfproto(key); 1416 if (family == NFPROTO_UNSPEC) { 1417 OVS_NLERR(log, "ct family unspecified"); 1418 return -EINVAL; 1419 } 1420 1421 memset(&ct_info, 0, sizeof(ct_info)); 1422 ct_info.family = family; 1423 1424 nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, 1425 NF_CT_DEFAULT_ZONE_DIR, 0); 1426 1427 err = parse_ct(attr, &ct_info, &helper, log); 1428 if (err) 1429 return err; 1430 1431 /* Set up template for tracking connections in specific zones. */ 1432 ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL); 1433 if (!ct_info.ct) { 1434 OVS_NLERR(log, "Failed to allocate conntrack template"); 1435 return -ENOMEM; 1436 } 1437 1438 if (ct_info.timeout[0]) { 1439 if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, 1440 ct_info.timeout)) 1441 pr_info_ratelimited("Failed to associated timeout " 1442 "policy `%s'\n", ct_info.timeout); 1443 else 1444 ct_info.nf_ct_timeout = rcu_dereference( 1445 nf_ct_timeout_find(ct_info.ct)->timeout); 1446 1447 } 1448 1449 if (helper) { 1450 err = nf_ct_add_helper(ct_info.ct, helper, ct_info.family, 1451 key->ip.proto, ct_info.nat, &ct_info.helper); 1452 if (err) { 1453 OVS_NLERR(log, "Failed to add %s helper %d", helper, err); 1454 goto err_free_ct; 1455 } 1456 } 1457 1458 err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, 1459 sizeof(ct_info), log); 1460 if (err) 1461 goto err_free_ct; 1462 1463 __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); 1464 return 0; 1465 err_free_ct: 1466 __ovs_ct_free_action(&ct_info); 1467 return err; 1468 } 1469 1470 #if IS_ENABLED(CONFIG_NF_NAT) 1471 static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, 1472 struct sk_buff *skb) 1473 { 1474 struct nlattr *start; 1475 1476 start = nla_nest_start_noflag(skb, OVS_CT_ATTR_NAT); 1477 if (!start) 1478 return false; 1479 1480 if (info->nat & OVS_CT_SRC_NAT) { 1481 if (nla_put_flag(skb, OVS_NAT_ATTR_SRC)) 1482 return false; 1483 } else if (info->nat & OVS_CT_DST_NAT) { 1484 if (nla_put_flag(skb, OVS_NAT_ATTR_DST)) 1485 return false; 1486 } else { 1487 goto out; 1488 } 1489 1490 if (info->range.flags & NF_NAT_RANGE_MAP_IPS) { 1491 if (IS_ENABLED(CONFIG_NF_NAT) && 1492 info->family == NFPROTO_IPV4) { 1493 if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN, 1494 info->range.min_addr.ip) || 1495 (info->range.max_addr.ip 1496 != info->range.min_addr.ip && 1497 (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX, 1498 info->range.max_addr.ip)))) 1499 return false; 1500 } else if (IS_ENABLED(CONFIG_IPV6) && 1501 info->family == NFPROTO_IPV6) { 1502 if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN, 1503 &info->range.min_addr.in6) || 1504 (memcmp(&info->range.max_addr.in6, 1505 &info->range.min_addr.in6, 1506 sizeof(info->range.max_addr.in6)) && 1507 (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX, 1508 &info->range.max_addr.in6)))) 1509 return false; 1510 } else { 1511 return false; 1512 } 1513 } 1514 if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && 1515 (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN, 1516 ntohs(info->range.min_proto.all)) || 1517 (info->range.max_proto.all != info->range.min_proto.all && 1518 nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX, 1519 ntohs(info->range.max_proto.all))))) 1520 return false; 1521 1522 if (info->range.flags & NF_NAT_RANGE_PERSISTENT && 1523 nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT)) 1524 return false; 1525 if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM && 1526 nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH)) 1527 return false; 1528 if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY && 1529 nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM)) 1530 return false; 1531 out: 1532 nla_nest_end(skb, start); 1533 1534 return true; 1535 } 1536 #endif 1537 1538 int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, 1539 struct sk_buff *skb) 1540 { 1541 struct nlattr *start; 1542 1543 start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CT); 1544 if (!start) 1545 return -EMSGSIZE; 1546 1547 if (ct_info->commit && nla_put_flag(skb, ct_info->force 1548 ? OVS_CT_ATTR_FORCE_COMMIT 1549 : OVS_CT_ATTR_COMMIT)) 1550 return -EMSGSIZE; 1551 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && 1552 nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) 1553 return -EMSGSIZE; 1554 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask && 1555 nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), 1556 &ct_info->mark)) 1557 return -EMSGSIZE; 1558 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && 1559 labels_nonzero(&ct_info->labels.mask) && 1560 nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels), 1561 &ct_info->labels)) 1562 return -EMSGSIZE; 1563 if (ct_info->helper) { 1564 if (nla_put_string(skb, OVS_CT_ATTR_HELPER, 1565 ct_info->helper->name)) 1566 return -EMSGSIZE; 1567 } 1568 if (ct_info->have_eventmask && 1569 nla_put_u32(skb, OVS_CT_ATTR_EVENTMASK, ct_info->eventmask)) 1570 return -EMSGSIZE; 1571 if (ct_info->timeout[0]) { 1572 if (nla_put_string(skb, OVS_CT_ATTR_TIMEOUT, ct_info->timeout)) 1573 return -EMSGSIZE; 1574 } 1575 1576 #if IS_ENABLED(CONFIG_NF_NAT) 1577 if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) 1578 return -EMSGSIZE; 1579 #endif 1580 nla_nest_end(skb, start); 1581 1582 return 0; 1583 } 1584 1585 void ovs_ct_free_action(const struct nlattr *a) 1586 { 1587 struct ovs_conntrack_info *ct_info = nla_data(a); 1588 1589 __ovs_ct_free_action(ct_info); 1590 } 1591 1592 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) 1593 { 1594 if (ct_info->helper) { 1595 #if IS_ENABLED(CONFIG_NF_NAT) 1596 if (ct_info->nat) 1597 nf_nat_helper_put(ct_info->helper); 1598 #endif 1599 nf_conntrack_helper_put(ct_info->helper); 1600 } 1601 if (ct_info->ct) { 1602 if (ct_info->timeout[0]) 1603 nf_ct_destroy_timeout(ct_info->ct); 1604 nf_ct_tmpl_free(ct_info->ct); 1605 } 1606 } 1607 1608 #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) 1609 static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net) 1610 { 1611 int i, err; 1612 1613 ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info), 1614 GFP_KERNEL); 1615 if (!ovs_net->ct_limit_info) 1616 return -ENOMEM; 1617 1618 ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT; 1619 ovs_net->ct_limit_info->limits = 1620 kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head), 1621 GFP_KERNEL); 1622 if (!ovs_net->ct_limit_info->limits) { 1623 kfree(ovs_net->ct_limit_info); 1624 return -ENOMEM; 1625 } 1626 1627 for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++) 1628 INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]); 1629 1630 ovs_net->ct_limit_info->data = 1631 nf_conncount_init(net, NFPROTO_INET, sizeof(u32)); 1632 1633 if (IS_ERR(ovs_net->ct_limit_info->data)) { 1634 err = PTR_ERR(ovs_net->ct_limit_info->data); 1635 kfree(ovs_net->ct_limit_info->limits); 1636 kfree(ovs_net->ct_limit_info); 1637 pr_err("openvswitch: failed to init nf_conncount %d\n", err); 1638 return err; 1639 } 1640 return 0; 1641 } 1642 1643 static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net) 1644 { 1645 const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info; 1646 int i; 1647 1648 nf_conncount_destroy(net, NFPROTO_INET, info->data); 1649 for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) { 1650 struct hlist_head *head = &info->limits[i]; 1651 struct ovs_ct_limit *ct_limit; 1652 1653 hlist_for_each_entry_rcu(ct_limit, head, hlist_node, 1654 lockdep_ovsl_is_held()) 1655 kfree_rcu(ct_limit, rcu); 1656 } 1657 kfree(info->limits); 1658 kfree(info); 1659 } 1660 1661 static struct sk_buff * 1662 ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd, 1663 struct ovs_header **ovs_reply_header) 1664 { 1665 struct ovs_header *ovs_header = info->userhdr; 1666 struct sk_buff *skb; 1667 1668 skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 1669 if (!skb) 1670 return ERR_PTR(-ENOMEM); 1671 1672 *ovs_reply_header = genlmsg_put(skb, info->snd_portid, 1673 info->snd_seq, 1674 &dp_ct_limit_genl_family, 0, cmd); 1675 1676 if (!*ovs_reply_header) { 1677 nlmsg_free(skb); 1678 return ERR_PTR(-EMSGSIZE); 1679 } 1680 (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; 1681 1682 return skb; 1683 } 1684 1685 static bool check_zone_id(int zone_id, u16 *pzone) 1686 { 1687 if (zone_id >= 0 && zone_id <= 65535) { 1688 *pzone = (u16)zone_id; 1689 return true; 1690 } 1691 return false; 1692 } 1693 1694 static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit, 1695 struct ovs_ct_limit_info *info) 1696 { 1697 struct ovs_zone_limit *zone_limit; 1698 int rem; 1699 u16 zone; 1700 1701 rem = NLA_ALIGN(nla_len(nla_zone_limit)); 1702 zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); 1703 1704 while (rem >= sizeof(*zone_limit)) { 1705 if (unlikely(zone_limit->zone_id == 1706 OVS_ZONE_LIMIT_DEFAULT_ZONE)) { 1707 ovs_lock(); 1708 info->default_limit = zone_limit->limit; 1709 ovs_unlock(); 1710 } else if (unlikely(!check_zone_id( 1711 zone_limit->zone_id, &zone))) { 1712 OVS_NLERR(true, "zone id is out of range"); 1713 } else { 1714 struct ovs_ct_limit *ct_limit; 1715 1716 ct_limit = kmalloc(sizeof(*ct_limit), 1717 GFP_KERNEL_ACCOUNT); 1718 if (!ct_limit) 1719 return -ENOMEM; 1720 1721 ct_limit->zone = zone; 1722 ct_limit->limit = zone_limit->limit; 1723 1724 ovs_lock(); 1725 ct_limit_set(info, ct_limit); 1726 ovs_unlock(); 1727 } 1728 rem -= NLA_ALIGN(sizeof(*zone_limit)); 1729 zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + 1730 NLA_ALIGN(sizeof(*zone_limit))); 1731 } 1732 1733 if (rem) 1734 OVS_NLERR(true, "set zone limit has %d unknown bytes", rem); 1735 1736 return 0; 1737 } 1738 1739 static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit, 1740 struct ovs_ct_limit_info *info) 1741 { 1742 struct ovs_zone_limit *zone_limit; 1743 int rem; 1744 u16 zone; 1745 1746 rem = NLA_ALIGN(nla_len(nla_zone_limit)); 1747 zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); 1748 1749 while (rem >= sizeof(*zone_limit)) { 1750 if (unlikely(zone_limit->zone_id == 1751 OVS_ZONE_LIMIT_DEFAULT_ZONE)) { 1752 ovs_lock(); 1753 info->default_limit = OVS_CT_LIMIT_DEFAULT; 1754 ovs_unlock(); 1755 } else if (unlikely(!check_zone_id( 1756 zone_limit->zone_id, &zone))) { 1757 OVS_NLERR(true, "zone id is out of range"); 1758 } else { 1759 ovs_lock(); 1760 ct_limit_del(info, zone); 1761 ovs_unlock(); 1762 } 1763 rem -= NLA_ALIGN(sizeof(*zone_limit)); 1764 zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + 1765 NLA_ALIGN(sizeof(*zone_limit))); 1766 } 1767 1768 if (rem) 1769 OVS_NLERR(true, "del zone limit has %d unknown bytes", rem); 1770 1771 return 0; 1772 } 1773 1774 static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info, 1775 struct sk_buff *reply) 1776 { 1777 struct ovs_zone_limit zone_limit = { 1778 .zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE, 1779 .limit = info->default_limit, 1780 }; 1781 1782 return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); 1783 } 1784 1785 static int __ovs_ct_limit_get_zone_limit(struct net *net, 1786 struct nf_conncount_data *data, 1787 u16 zone_id, u32 limit, 1788 struct sk_buff *reply) 1789 { 1790 struct nf_conntrack_zone ct_zone; 1791 struct ovs_zone_limit zone_limit; 1792 u32 conncount_key = zone_id; 1793 1794 zone_limit.zone_id = zone_id; 1795 zone_limit.limit = limit; 1796 nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0); 1797 1798 zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL, 1799 &ct_zone); 1800 return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); 1801 } 1802 1803 static int ovs_ct_limit_get_zone_limit(struct net *net, 1804 struct nlattr *nla_zone_limit, 1805 struct ovs_ct_limit_info *info, 1806 struct sk_buff *reply) 1807 { 1808 struct ovs_zone_limit *zone_limit; 1809 int rem, err; 1810 u32 limit; 1811 u16 zone; 1812 1813 rem = NLA_ALIGN(nla_len(nla_zone_limit)); 1814 zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); 1815 1816 while (rem >= sizeof(*zone_limit)) { 1817 if (unlikely(zone_limit->zone_id == 1818 OVS_ZONE_LIMIT_DEFAULT_ZONE)) { 1819 err = ovs_ct_limit_get_default_limit(info, reply); 1820 if (err) 1821 return err; 1822 } else if (unlikely(!check_zone_id(zone_limit->zone_id, 1823 &zone))) { 1824 OVS_NLERR(true, "zone id is out of range"); 1825 } else { 1826 rcu_read_lock(); 1827 limit = ct_limit_get(info, zone); 1828 rcu_read_unlock(); 1829 1830 err = __ovs_ct_limit_get_zone_limit( 1831 net, info->data, zone, limit, reply); 1832 if (err) 1833 return err; 1834 } 1835 rem -= NLA_ALIGN(sizeof(*zone_limit)); 1836 zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + 1837 NLA_ALIGN(sizeof(*zone_limit))); 1838 } 1839 1840 if (rem) 1841 OVS_NLERR(true, "get zone limit has %d unknown bytes", rem); 1842 1843 return 0; 1844 } 1845 1846 static int ovs_ct_limit_get_all_zone_limit(struct net *net, 1847 struct ovs_ct_limit_info *info, 1848 struct sk_buff *reply) 1849 { 1850 struct ovs_ct_limit *ct_limit; 1851 struct hlist_head *head; 1852 int i, err = 0; 1853 1854 err = ovs_ct_limit_get_default_limit(info, reply); 1855 if (err) 1856 return err; 1857 1858 rcu_read_lock(); 1859 for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) { 1860 head = &info->limits[i]; 1861 hlist_for_each_entry_rcu(ct_limit, head, hlist_node) { 1862 err = __ovs_ct_limit_get_zone_limit(net, info->data, 1863 ct_limit->zone, ct_limit->limit, reply); 1864 if (err) 1865 goto exit_err; 1866 } 1867 } 1868 1869 exit_err: 1870 rcu_read_unlock(); 1871 return err; 1872 } 1873 1874 static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info) 1875 { 1876 struct nlattr **a = info->attrs; 1877 struct sk_buff *reply; 1878 struct ovs_header *ovs_reply_header; 1879 struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); 1880 struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; 1881 int err; 1882 1883 reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET, 1884 &ovs_reply_header); 1885 if (IS_ERR(reply)) 1886 return PTR_ERR(reply); 1887 1888 if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) { 1889 err = -EINVAL; 1890 goto exit_err; 1891 } 1892 1893 err = ovs_ct_limit_set_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], 1894 ct_limit_info); 1895 if (err) 1896 goto exit_err; 1897 1898 static_branch_enable(&ovs_ct_limit_enabled); 1899 1900 genlmsg_end(reply, ovs_reply_header); 1901 return genlmsg_reply(reply, info); 1902 1903 exit_err: 1904 nlmsg_free(reply); 1905 return err; 1906 } 1907 1908 static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info) 1909 { 1910 struct nlattr **a = info->attrs; 1911 struct sk_buff *reply; 1912 struct ovs_header *ovs_reply_header; 1913 struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); 1914 struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; 1915 int err; 1916 1917 reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL, 1918 &ovs_reply_header); 1919 if (IS_ERR(reply)) 1920 return PTR_ERR(reply); 1921 1922 if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) { 1923 err = -EINVAL; 1924 goto exit_err; 1925 } 1926 1927 err = ovs_ct_limit_del_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], 1928 ct_limit_info); 1929 if (err) 1930 goto exit_err; 1931 1932 genlmsg_end(reply, ovs_reply_header); 1933 return genlmsg_reply(reply, info); 1934 1935 exit_err: 1936 nlmsg_free(reply); 1937 return err; 1938 } 1939 1940 static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info) 1941 { 1942 struct nlattr **a = info->attrs; 1943 struct nlattr *nla_reply; 1944 struct sk_buff *reply; 1945 struct ovs_header *ovs_reply_header; 1946 struct net *net = sock_net(skb->sk); 1947 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 1948 struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; 1949 int err; 1950 1951 reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET, 1952 &ovs_reply_header); 1953 if (IS_ERR(reply)) 1954 return PTR_ERR(reply); 1955 1956 nla_reply = nla_nest_start_noflag(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT); 1957 if (!nla_reply) { 1958 err = -EMSGSIZE; 1959 goto exit_err; 1960 } 1961 1962 if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) { 1963 err = ovs_ct_limit_get_zone_limit( 1964 net, a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], ct_limit_info, 1965 reply); 1966 if (err) 1967 goto exit_err; 1968 } else { 1969 err = ovs_ct_limit_get_all_zone_limit(net, ct_limit_info, 1970 reply); 1971 if (err) 1972 goto exit_err; 1973 } 1974 1975 nla_nest_end(reply, nla_reply); 1976 genlmsg_end(reply, ovs_reply_header); 1977 return genlmsg_reply(reply, info); 1978 1979 exit_err: 1980 nlmsg_free(reply); 1981 return err; 1982 } 1983 1984 static const struct genl_small_ops ct_limit_genl_ops[] = { 1985 { .cmd = OVS_CT_LIMIT_CMD_SET, 1986 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 1987 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN 1988 * privilege. 1989 */ 1990 .doit = ovs_ct_limit_cmd_set, 1991 }, 1992 { .cmd = OVS_CT_LIMIT_CMD_DEL, 1993 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 1994 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN 1995 * privilege. 1996 */ 1997 .doit = ovs_ct_limit_cmd_del, 1998 }, 1999 { .cmd = OVS_CT_LIMIT_CMD_GET, 2000 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 2001 .flags = 0, /* OK for unprivileged users. */ 2002 .doit = ovs_ct_limit_cmd_get, 2003 }, 2004 }; 2005 2006 static const struct genl_multicast_group ovs_ct_limit_multicast_group = { 2007 .name = OVS_CT_LIMIT_MCGROUP, 2008 }; 2009 2010 struct genl_family dp_ct_limit_genl_family __ro_after_init = { 2011 .hdrsize = sizeof(struct ovs_header), 2012 .name = OVS_CT_LIMIT_FAMILY, 2013 .version = OVS_CT_LIMIT_VERSION, 2014 .maxattr = OVS_CT_LIMIT_ATTR_MAX, 2015 .policy = ct_limit_policy, 2016 .netnsok = true, 2017 .parallel_ops = true, 2018 .small_ops = ct_limit_genl_ops, 2019 .n_small_ops = ARRAY_SIZE(ct_limit_genl_ops), 2020 .resv_start_op = OVS_CT_LIMIT_CMD_GET + 1, 2021 .mcgrps = &ovs_ct_limit_multicast_group, 2022 .n_mcgrps = 1, 2023 .module = THIS_MODULE, 2024 }; 2025 #endif 2026 2027 int ovs_ct_init(struct net *net) 2028 { 2029 unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; 2030 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 2031 2032 if (nf_connlabels_get(net, n_bits - 1)) { 2033 ovs_net->xt_label = false; 2034 OVS_NLERR(true, "Failed to set connlabel length"); 2035 } else { 2036 ovs_net->xt_label = true; 2037 } 2038 2039 #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) 2040 return ovs_ct_limit_init(net, ovs_net); 2041 #else 2042 return 0; 2043 #endif 2044 } 2045 2046 void ovs_ct_exit(struct net *net) 2047 { 2048 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 2049 2050 #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) 2051 ovs_ct_limit_exit(net, ovs_net); 2052 #endif 2053 2054 if (ovs_net->xt_label) 2055 nf_connlabels_put(net); 2056 } 2057