// SPDX-License-Identifier: GPL-2.0+
/*
 * IPv6 IOAM Lightweight Tunnel implementation
 *
 * Author:
 * Justin Iurman <justin.iurman@uliege.be>
 */

#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/ioam6.h>
#include <linux/ioam6_iptunnel.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/lwtunnel.h>
#include <net/ioam6.h>
#include <net/netlink.h>
#include <net/ipv6.h>
#include <net/dst_cache.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>

#define IOAM6_MASK_SHORT_FIELDS 0xff100000
#define IOAM6_MASK_WIDE_FIELDS 0xe00000

struct ioam6_lwt_encap {
	struct ipv6_hopopt_hdr eh;
	u8 pad[2];	/* 2-octet padding for 4n-alignment */
	struct ioam6_hdr ioamh;
	struct ioam6_trace_hdr traceh;
} __packed;

struct ioam6_lwt_freq {
	u32 k;
	u32 n;
};

struct ioam6_lwt {
	struct dst_entry null_dst;
	struct dst_cache cache;
	struct ioam6_lwt_freq freq;
	atomic_t pkt_cnt;
	u8 mode;
	bool has_tunsrc;
	struct in6_addr tunsrc;
	struct in6_addr tundst;
	struct ioam6_lwt_encap tuninfo;
};

static const struct netlink_range_validation freq_range = {
	.min = IOAM6_IPTUNNEL_FREQ_MIN,
	.max = IOAM6_IPTUNNEL_FREQ_MAX,
};

static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
{
	return (struct ioam6_lwt *)lwt->data;
}

static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt)
{
	return &ioam6_lwt_state(lwt)->tuninfo;
}

static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt)
{
	return &(ioam6_lwt_state(lwt)->tuninfo.traceh);
}

static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = {
	[IOAM6_IPTUNNEL_FREQ_K] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
	[IOAM6_IPTUNNEL_FREQ_N] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
	[IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8,
						 IOAM6_IPTUNNEL_MODE_MIN,
						 IOAM6_IPTUNNEL_MODE_MAX),
	[IOAM6_IPTUNNEL_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(
					sizeof(struct ioam6_trace_hdr)),
};

static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace)
{
	u32 fields;

	if (!trace->type_be32 || !trace->remlen ||
	    trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 ||
	    trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
	    trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
	    trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
	    trace->type.bit21 | trace->type.bit23)
		return false;

	trace->nodelen = 0;
	fields = be32_to_cpu(trace->type_be32);

	trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS)
				* (sizeof(__be32) / 4);
	trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS)
				* (sizeof(__be64) / 4);

	return true;
}
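
/* A worked example for the nodelen computation above:
 * IOAM6_MASK_SHORT_FIELDS selects bits 0-7 and 11 of the trace type
 * (4-octet data fields) and IOAM6_MASK_WIDE_FIELDS selects bits 8-10
 * (8-octet data fields), nodelen being expressed in 4-octet units.
 * A trace type with only bit0 set (hop_limit + short node id) thus
 * gives nodelen = 1 (4 octets recorded per IOAM node), while
 * bit0 + bit8 (wide node id) gives nodelen = 1 + 2 = 3, i.e. 12
 * octets recorded per IOAM node.
 */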

static int ioam6_build_state(struct net *net, struct nlattr *nla,
			     unsigned int family, const void *cfg,
			     struct lwtunnel_state **ts,
			     struct netlink_ext_ack *extack)
{
	struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1];
	struct ioam6_lwt_encap *tuninfo;
	struct ioam6_trace_hdr *trace;
	struct lwtunnel_state *lwt;
	struct ioam6_lwt *ilwt;
	int len_aligned, err;
	u32 freq_k, freq_n;
	u8 mode;

	if (family != AF_INET6)
		return -EINVAL;

	err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla,
			       ioam6_iptunnel_policy, extack);
	if (err < 0)
		return err;

	if ((!tb[IOAM6_IPTUNNEL_FREQ_K] && tb[IOAM6_IPTUNNEL_FREQ_N]) ||
	    (tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N])) {
		NL_SET_ERR_MSG(extack, "freq: missing parameter");
		return -EINVAL;
	} else if (!tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N]) {
		freq_k = IOAM6_IPTUNNEL_FREQ_MIN;
		freq_n = IOAM6_IPTUNNEL_FREQ_MIN;
	} else {
		freq_k = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_K]);
		freq_n = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_N]);

		if (freq_k > freq_n) {
			NL_SET_ERR_MSG(extack, "freq: k > n is forbidden");
			return -EINVAL;
		}
	}

	mode = nla_get_u8_default(tb[IOAM6_IPTUNNEL_MODE],
				  IOAM6_IPTUNNEL_MODE_INLINE);

	if (tb[IOAM6_IPTUNNEL_SRC] && mode == IOAM6_IPTUNNEL_MODE_INLINE) {
		NL_SET_ERR_MSG(extack, "no tunnel src expected with this mode");
		return -EINVAL;
	}

	if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination");
		return -EINVAL;
	}

	if (!tb[IOAM6_IPTUNNEL_TRACE]) {
		NL_SET_ERR_MSG(extack, "missing trace");
		return -EINVAL;
	}

	trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]);
	if (!ioam6_validate_trace_hdr(trace)) {
		NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE],
				    "invalid trace validation");
		return -EINVAL;
	}

	len_aligned = ALIGN(trace->remlen * 4, 8);
	lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned);
	if (!lwt)
		return -ENOMEM;

	ilwt = ioam6_lwt_state(lwt);
	err = dst_cache_init(&ilwt->cache, GFP_ATOMIC);
	if (err)
		goto free_lwt;

	/* This "fake" dst_entry will be stored in a dst_cache, which will call
	 * dst_hold() and dst_release() on it. We must ensure that dst_destroy()
	 * will never be called. For that, its initial refcount is 1 and +1 when
	 * it is stored in the cache. Then, +1/-1 each time we read the cache
	 * and release it. Long story short, we're fine.
	 */
	dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT);

	atomic_set(&ilwt->pkt_cnt, 0);
	ilwt->freq.k = freq_k;
	ilwt->freq.n = freq_n;

	ilwt->mode = mode;

	if (!tb[IOAM6_IPTUNNEL_SRC]) {
		ilwt->has_tunsrc = false;
	} else {
		ilwt->has_tunsrc = true;
		ilwt->tunsrc = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_SRC]);

		if (ipv6_addr_any(&ilwt->tunsrc)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_SRC],
					    "invalid tunnel source address");
			err = -EINVAL;
			goto free_cache;
		}
	}

	if (tb[IOAM6_IPTUNNEL_DST]) {
		ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]);

		if (ipv6_addr_any(&ilwt->tundst)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_DST],
					    "invalid tunnel dest address");
			err = -EINVAL;
			goto free_cache;
		}
	}

	tuninfo = ioam6_lwt_info(lwt);
	tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1;
	tuninfo->pad[0] = IPV6_TLV_PADN;
	tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC;
	tuninfo->ioamh.opt_type = IPV6_TLV_IOAM;
	tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace)
					+ trace->remlen * 4;

	memcpy(&tuninfo->traceh, trace, sizeof(*trace));

	if (len_aligned - trace->remlen * 4) {
		tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN;
		tuninfo->traceh.data[trace->remlen * 4 + 1] = 2;
	}

	lwt->type = LWTUNNEL_ENCAP_IOAM6;
	lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;

	*ts = lwt;

	return 0;
free_cache:
	dst_cache_destroy(&ilwt->cache);
free_lwt:
	kfree(lwt);
	return err;
}
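
/* For reference, this build path is typically exercised from user
 * space with iproute2, e.g. (assuming IOAM namespace 123 was created
 * beforehand with "ip ioam namespace add 123"):
 *
 *   ip -6 route add db02::/64 encap ioam6 mode inline \
 *       trace prealloc type 0x800000 ns 123 size 12 dev eth0
 *
 * "mode", "trace" and friends map to the IOAM6_IPTUNNEL_* attributes
 * parsed above; freq defaults to 1/1 (insertion in every packet).
 */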

static int ioam6_do_fill(struct net *net, struct sk_buff *skb)
{
	struct ioam6_trace_hdr *trace;
	struct ioam6_namespace *ns;

	trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb)
					   + sizeof(struct ipv6_hopopt_hdr) + 2
					   + sizeof(struct ioam6_hdr));

	ns = ioam6_namespace(net, trace->namespace_id);
	if (ns)
		ioam6_fill_trace_data(skb, ns, trace, false);

	return 0;
}

static int ioam6_do_inline(struct net *net, struct sk_buff *skb,
			   struct ioam6_lwt_encap *tuninfo,
			   struct dst_entry *cache_dst)
{
	struct ipv6hdr *oldhdr, *hdr;
	int hdrlen, err;

	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;

	err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));
	if (unlikely(err))
		return err;

	oldhdr = ipv6_hdr(skb);
	skb_pull(skb, sizeof(*oldhdr));
	skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr));

	skb_push(skb, sizeof(*oldhdr) + hdrlen);
	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);

	hdr = ipv6_hdr(skb);
	memmove(hdr, oldhdr, sizeof(*oldhdr));
	tuninfo->eh.nexthdr = hdr->nexthdr;

	skb_set_transport_header(skb, sizeof(*hdr));
	skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen);

	memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);

	hdr->nexthdr = NEXTHDR_HOP;
	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));

	return ioam6_do_fill(net, skb);
}
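
/* For reference, after ioam6_do_inline() the packet layout is:
 *
 *   +----------+---------------------------------+---------+
 *   | IPv6 hdr | Hop-by-Hop hdr with IOAM option | payload |
 *   +----------+---------------------------------+---------+
 *
 * i.e. the pre-allocated trace option is inserted right after the
 * original IPv6 header, whose former nexthdr value was moved into
 * tuninfo->eh.nexthdr.
 */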

static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
			  struct ioam6_lwt_encap *tuninfo,
			  bool has_tunsrc,
			  struct in6_addr *tunsrc,
			  struct in6_addr *tundst,
			  struct dst_entry *cache_dst)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr, *inner_hdr;
	int hdrlen, len, err;

	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
	len = sizeof(*hdr) + hdrlen;

	err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb));
	if (unlikely(err))
		return err;

	inner_hdr = ipv6_hdr(skb);

	skb_push(skb, len);
	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);
	skb_set_transport_header(skb, sizeof(*hdr));

	tuninfo->eh.nexthdr = NEXTHDR_IPV6;
	memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);

	hdr = ipv6_hdr(skb);
	memcpy(hdr, inner_hdr, sizeof(*hdr));

	hdr->nexthdr = NEXTHDR_HOP;
	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
	hdr->daddr = *tundst;

	if (has_tunsrc)
		memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc));
	else
		ipv6_dev_get_saddr(net, dst_dev(dst), &hdr->daddr,
				   IPV6_PREFER_SRC_PUBLIC, &hdr->saddr);

	skb_postpush_rcsum(skb, hdr, len);

	return ioam6_do_fill(net, skb);
}
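
/* For reference, after ioam6_do_encap() the packet is ip6ip6
 * encapsulated:
 *
 *   +----------------+---------------------------------+----------------+-...
 *   | outer IPv6 hdr | Hop-by-Hop hdr with IOAM option | inner IPv6 hdr | ...
 *   +----------------+---------------------------------+----------------+-...
 *
 * The outer destination is tundst and the outer source is either the
 * configured tunsrc or, when none was provided, an address picked by
 * ipv6_dev_get_saddr().
 */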

static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *orig_dst = skb_dst(skb);
	struct dst_entry *dst = NULL;
	struct ioam6_lwt *ilwt;
	int err = -EINVAL;
	u32 pkt_cnt;

	if (skb->protocol != htons(ETH_P_IPV6))
		goto drop;

	ilwt = ioam6_lwt_state(orig_dst->lwtstate);

	/* Check for insertion frequency (i.e., "k over n" insertions) */
	pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt);
	if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
		goto out;

	local_bh_disable();
	dst = dst_cache_get(&ilwt->cache);
	local_bh_enable();

	/* This is how we notify that the destination does not change after
	 * transformation and that we need to use orig_dst instead of the cache
	 */
	if (dst == &ilwt->null_dst) {
		dst_release(dst);

		dst = orig_dst;
		/* keep refcount balance: dst_release() is called at the end */
		dst_hold(dst);
	}

	switch (ilwt->mode) {
	case IOAM6_IPTUNNEL_MODE_INLINE:
do_inline:
		/* Direct insertion - if there is no Hop-by-Hop yet */
		if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP)
			goto out;

		err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst);
		if (unlikely(err))
			goto drop;

		break;
	case IOAM6_IPTUNNEL_MODE_ENCAP:
do_encap:
		/* Encapsulation (ip6ip6) */
		err = ioam6_do_encap(net, skb, &ilwt->tuninfo,
				     ilwt->has_tunsrc, &ilwt->tunsrc,
				     &ilwt->tundst, dst);
		if (unlikely(err))
			goto drop;

		break;
	case IOAM6_IPTUNNEL_MODE_AUTO:
		/* Automatic (RFC8200 compliant):
		 *  - local packets -> INLINE mode
		 *  - in-transit packets -> ENCAP mode
		 */
		if (!skb->dev)
			goto do_inline;

		goto do_encap;
	default:
		goto drop;
	}

	if (unlikely(!dst)) {
		struct ipv6hdr *hdr = ipv6_hdr(skb);
		struct flowi6 fl6;

		memset(&fl6, 0, sizeof(fl6));
		fl6.daddr = hdr->daddr;
		fl6.saddr = hdr->saddr;
		fl6.flowlabel = ip6_flowinfo(hdr);
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_proto = hdr->nexthdr;

		dst = ip6_route_output(net, NULL, &fl6);
		if (dst->error) {
			err = dst->error;
			goto drop;
		}

		/* If the destination is the same after transformation (which is
		 * a valid use case for IOAM), then we don't want to add it to
		 * the cache in order to avoid a reference loop. Instead, we add
		 * our fake dst_entry to the cache as a way to detect this case.
		 * Otherwise, we add the resolved destination to the cache.
		 */
		local_bh_disable();
		if (orig_dst->lwtstate == dst->lwtstate)
			dst_cache_set_ip6(&ilwt->cache,
					  &ilwt->null_dst, &fl6.saddr);
		else
			dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
		local_bh_enable();

		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
		if (unlikely(err))
			goto drop;
	}

	/* avoid lwtunnel_output() reentry loop when destination is the same
	 * after transformation (e.g., with the inline mode)
	 */
	if (orig_dst->lwtstate != dst->lwtstate) {
		skb_dst_drop(skb);
		skb_dst_set(skb, dst);
		return dst_output(net, sk, skb);
	}
out:
	dst_release(dst);
	return orig_dst->lwtstate->orig_output(net, sk, skb);
drop:
	dst_release(dst);
	kfree_skb(skb);
	return err;
}
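
/* Note on the "k over n" check at the top of ioam6_output(): packets
 * whose counter satisfies (pkt_cnt % n) < k get IOAM data inserted,
 * the others go out unmodified. For example, freq 1/2 marks every
 * other packet and freq 1/100 marks one packet per hundred; the
 * default (k = n = 1) marks every packet.
 */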

static void ioam6_destroy_state(struct lwtunnel_state *lwt)
{
	/* Since the refcount of per-cpu dst_entry caches will never be 0 (see
	 * why above) when our "fake" dst_entry is used, it is not necessary to
	 * remove them before calling dst_cache_destroy()
	 */
	dst_cache_destroy(&ioam6_lwt_state(lwt)->cache);
}

static int ioam6_fill_encap_info(struct sk_buff *skb,
				 struct lwtunnel_state *lwtstate)
{
	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
	int err;

	err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_K, ilwt->freq.k);
	if (err)
		goto ret;

	err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_N, ilwt->freq.n);
	if (err)
		goto ret;

	err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode);
	if (err)
		goto ret;

	if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		if (ilwt->has_tunsrc) {
			err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_SRC,
					       &ilwt->tunsrc);
			if (err)
				goto ret;
		}

		err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst);
		if (err)
			goto ret;
	}

	err = nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh),
		      &ilwt->tuninfo.traceh);
ret:
	return err;
}

static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
	int nlsize;

	nlsize = nla_total_size(sizeof(ilwt->freq.k)) +
		 nla_total_size(sizeof(ilwt->freq.n)) +
		 nla_total_size(sizeof(ilwt->mode)) +
		 nla_total_size(sizeof(ilwt->tuninfo.traceh));

	if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		if (ilwt->has_tunsrc)
			nlsize += nla_total_size(sizeof(ilwt->tunsrc));

		nlsize += nla_total_size(sizeof(ilwt->tundst));
	}

	return nlsize;
}

static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a);
	struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b);
	struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a);
	struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b);

	return (ilwt_a->freq.k != ilwt_b->freq.k ||
		ilwt_a->freq.n != ilwt_b->freq.n ||
		ilwt_a->mode != ilwt_b->mode ||
		ilwt_a->has_tunsrc != ilwt_b->has_tunsrc ||
		(ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
		 !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) ||
		(ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
		 ilwt_a->has_tunsrc &&
		 !ipv6_addr_equal(&ilwt_a->tunsrc, &ilwt_b->tunsrc)) ||
		trace_a->namespace_id != trace_b->namespace_id);
}

static const struct lwtunnel_encap_ops ioam6_iptun_ops = {
	.build_state = ioam6_build_state,
	.destroy_state = ioam6_destroy_state,
	.output = ioam6_output,
	.fill_encap = ioam6_fill_encap_info,
	.get_encap_size = ioam6_encap_nlsize,
	.cmp_encap = ioam6_encap_cmp,
	.owner = THIS_MODULE,
};

int __init ioam6_iptunnel_init(void)
{
	return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
}

void ioam6_iptunnel_exit(void)
{
	lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
}