1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * IPv6 IOAM Lightweight Tunnel implementation 4 * 5 * Author: 6 * Justin Iurman <justin.iurman@uliege.be> 7 */ 8 9 #include <linux/kernel.h> 10 #include <linux/skbuff.h> 11 #include <linux/net.h> 12 #include <linux/in6.h> 13 #include <linux/ioam6.h> 14 #include <linux/ioam6_iptunnel.h> 15 #include <net/dst.h> 16 #include <net/sock.h> 17 #include <net/lwtunnel.h> 18 #include <net/ioam6.h> 19 #include <net/netlink.h> 20 #include <net/ipv6.h> 21 #include <net/dst_cache.h> 22 #include <net/ip6_route.h> 23 #include <net/addrconf.h> 24 25 #define IOAM6_MASK_SHORT_FIELDS 0xff100000 26 #define IOAM6_MASK_WIDE_FIELDS 0xe00000 27 28 struct ioam6_lwt_encap { 29 struct ipv6_hopopt_hdr eh; 30 u8 pad[2]; /* 2-octet padding for 4n-alignment */ 31 struct ioam6_hdr ioamh; 32 struct ioam6_trace_hdr traceh; 33 } __packed; 34 35 struct ioam6_lwt_freq { 36 u32 k; 37 u32 n; 38 }; 39 40 struct ioam6_lwt { 41 struct dst_cache cache; 42 struct ioam6_lwt_freq freq; 43 atomic_t pkt_cnt; 44 u8 mode; 45 bool has_tunsrc; 46 struct in6_addr tunsrc; 47 struct in6_addr tundst; 48 struct ioam6_lwt_encap tuninfo; 49 }; 50 51 static const struct netlink_range_validation freq_range = { 52 .min = IOAM6_IPTUNNEL_FREQ_MIN, 53 .max = IOAM6_IPTUNNEL_FREQ_MAX, 54 }; 55 56 static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt) 57 { 58 return (struct ioam6_lwt *)lwt->data; 59 } 60 61 static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt) 62 { 63 return &ioam6_lwt_state(lwt)->tuninfo; 64 } 65 66 static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt) 67 { 68 return &(ioam6_lwt_state(lwt)->tuninfo.traceh); 69 } 70 71 static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = { 72 [IOAM6_IPTUNNEL_FREQ_K] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range), 73 [IOAM6_IPTUNNEL_FREQ_N] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range), 74 [IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8, 75 IOAM6_IPTUNNEL_MODE_MIN, 76 IOAM6_IPTUNNEL_MODE_MAX), 77 [IOAM6_IPTUNNEL_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), 78 [IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), 79 [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN( 80 sizeof(struct ioam6_trace_hdr)), 81 }; 82 83 static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) 84 { 85 u32 fields; 86 87 if (!trace->type_be32 || !trace->remlen || 88 trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 || 89 trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | 90 trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | 91 trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | 92 trace->type.bit21 | trace->type.bit23) 93 return false; 94 95 trace->nodelen = 0; 96 fields = be32_to_cpu(trace->type_be32); 97 98 trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS) 99 * (sizeof(__be32) / 4); 100 trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS) 101 * (sizeof(__be64) / 4); 102 103 return true; 104 } 105 106 static int ioam6_build_state(struct net *net, struct nlattr *nla, 107 unsigned int family, const void *cfg, 108 struct lwtunnel_state **ts, 109 struct netlink_ext_ack *extack) 110 { 111 struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1]; 112 struct ioam6_lwt_encap *tuninfo; 113 struct ioam6_trace_hdr *trace; 114 struct lwtunnel_state *lwt; 115 struct ioam6_lwt *ilwt; 116 int len_aligned, err; 117 u32 freq_k, freq_n; 118 u8 mode; 119 120 if (family != AF_INET6) 121 return -EINVAL; 122 123 err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla, 124 ioam6_iptunnel_policy, extack); 125 if (err < 0) 126 return err; 127 128 if ((!tb[IOAM6_IPTUNNEL_FREQ_K] && tb[IOAM6_IPTUNNEL_FREQ_N]) || 129 (tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N])) { 130 NL_SET_ERR_MSG(extack, "freq: missing parameter"); 131 return -EINVAL; 132 } else if (!tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N]) { 133 freq_k = IOAM6_IPTUNNEL_FREQ_MIN; 134 freq_n = IOAM6_IPTUNNEL_FREQ_MIN; 135 } else { 136 freq_k = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_K]); 137 freq_n = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_N]); 138 139 if (freq_k > freq_n) { 140 NL_SET_ERR_MSG(extack, "freq: k > n is forbidden"); 141 return -EINVAL; 142 } 143 } 144 145 mode = nla_get_u8_default(tb[IOAM6_IPTUNNEL_MODE], 146 IOAM6_IPTUNNEL_MODE_INLINE); 147 148 if (tb[IOAM6_IPTUNNEL_SRC] && mode == IOAM6_IPTUNNEL_MODE_INLINE) { 149 NL_SET_ERR_MSG(extack, "no tunnel src expected with this mode"); 150 return -EINVAL; 151 } 152 153 if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) { 154 NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination"); 155 return -EINVAL; 156 } 157 158 if (!tb[IOAM6_IPTUNNEL_TRACE]) { 159 NL_SET_ERR_MSG(extack, "missing trace"); 160 return -EINVAL; 161 } 162 163 trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]); 164 if (!ioam6_validate_trace_hdr(trace)) { 165 NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE], 166 "invalid trace validation"); 167 return -EINVAL; 168 } 169 170 len_aligned = ALIGN(trace->remlen * 4, 8); 171 lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned); 172 if (!lwt) 173 return -ENOMEM; 174 175 ilwt = ioam6_lwt_state(lwt); 176 err = dst_cache_init(&ilwt->cache, GFP_ATOMIC); 177 if (err) 178 goto free_lwt; 179 180 atomic_set(&ilwt->pkt_cnt, 0); 181 ilwt->freq.k = freq_k; 182 ilwt->freq.n = freq_n; 183 184 ilwt->mode = mode; 185 186 if (!tb[IOAM6_IPTUNNEL_SRC]) { 187 ilwt->has_tunsrc = false; 188 } else { 189 ilwt->has_tunsrc = true; 190 ilwt->tunsrc = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_SRC]); 191 192 if (ipv6_addr_any(&ilwt->tunsrc)) { 193 NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_SRC], 194 "invalid tunnel source address"); 195 err = -EINVAL; 196 goto free_cache; 197 } 198 } 199 200 if (tb[IOAM6_IPTUNNEL_DST]) { 201 ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]); 202 203 if (ipv6_addr_any(&ilwt->tundst)) { 204 NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_DST], 205 "invalid tunnel dest address"); 206 err = -EINVAL; 207 goto free_cache; 208 } 209 } 210 211 tuninfo = ioam6_lwt_info(lwt); 212 tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1; 213 tuninfo->pad[0] = IPV6_TLV_PADN; 214 tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC; 215 tuninfo->ioamh.opt_type = IPV6_TLV_IOAM; 216 tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace) 217 + trace->remlen * 4; 218 219 memcpy(&tuninfo->traceh, trace, sizeof(*trace)); 220 221 if (len_aligned - trace->remlen * 4) { 222 tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN; 223 tuninfo->traceh.data[trace->remlen * 4 + 1] = 2; 224 } 225 226 lwt->type = LWTUNNEL_ENCAP_IOAM6; 227 lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; 228 229 *ts = lwt; 230 231 return 0; 232 free_cache: 233 dst_cache_destroy(&ilwt->cache); 234 free_lwt: 235 kfree(lwt); 236 return err; 237 } 238 239 static int ioam6_do_fill(struct net *net, struct sk_buff *skb) 240 { 241 struct ioam6_trace_hdr *trace; 242 struct ioam6_namespace *ns; 243 244 trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb) 245 + sizeof(struct ipv6_hopopt_hdr) + 2 246 + sizeof(struct ioam6_hdr)); 247 248 ns = ioam6_namespace(net, trace->namespace_id); 249 if (ns) 250 ioam6_fill_trace_data(skb, ns, trace, false); 251 252 return 0; 253 } 254 255 static int ioam6_do_inline(struct net *net, struct sk_buff *skb, 256 struct ioam6_lwt_encap *tuninfo, 257 struct dst_entry *cache_dst) 258 { 259 struct ipv6hdr *oldhdr, *hdr; 260 int hdrlen, err; 261 262 hdrlen = (tuninfo->eh.hdrlen + 1) << 3; 263 264 err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); 265 if (unlikely(err)) 266 return err; 267 268 oldhdr = ipv6_hdr(skb); 269 skb_pull(skb, sizeof(*oldhdr)); 270 skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr)); 271 272 skb_push(skb, sizeof(*oldhdr) + hdrlen); 273 skb_reset_network_header(skb); 274 skb_mac_header_rebuild(skb); 275 276 hdr = ipv6_hdr(skb); 277 memmove(hdr, oldhdr, sizeof(*oldhdr)); 278 tuninfo->eh.nexthdr = hdr->nexthdr; 279 280 skb_set_transport_header(skb, sizeof(*hdr)); 281 skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen); 282 283 memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen); 284 285 hdr->nexthdr = NEXTHDR_HOP; 286 hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); 287 288 return ioam6_do_fill(net, skb); 289 } 290 291 static int ioam6_do_encap(struct net *net, struct sk_buff *skb, 292 struct ioam6_lwt_encap *tuninfo, 293 bool has_tunsrc, 294 struct in6_addr *tunsrc, 295 struct in6_addr *tundst, 296 struct dst_entry *cache_dst) 297 { 298 struct dst_entry *dst = skb_dst(skb); 299 struct ipv6hdr *hdr, *inner_hdr; 300 int hdrlen, len, err; 301 302 hdrlen = (tuninfo->eh.hdrlen + 1) << 3; 303 len = sizeof(*hdr) + hdrlen; 304 305 err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb)); 306 if (unlikely(err)) 307 return err; 308 309 inner_hdr = ipv6_hdr(skb); 310 311 skb_push(skb, len); 312 skb_reset_network_header(skb); 313 skb_mac_header_rebuild(skb); 314 skb_set_transport_header(skb, sizeof(*hdr)); 315 316 tuninfo->eh.nexthdr = NEXTHDR_IPV6; 317 memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen); 318 319 hdr = ipv6_hdr(skb); 320 memcpy(hdr, inner_hdr, sizeof(*hdr)); 321 322 hdr->nexthdr = NEXTHDR_HOP; 323 hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); 324 hdr->daddr = *tundst; 325 326 if (has_tunsrc) 327 memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc)); 328 else 329 ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr, 330 IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); 331 332 skb_postpush_rcsum(skb, hdr, len); 333 334 return ioam6_do_fill(net, skb); 335 } 336 337 static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 338 { 339 struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL; 340 struct ioam6_lwt *ilwt; 341 int err = -EINVAL; 342 u32 pkt_cnt; 343 344 if (skb->protocol != htons(ETH_P_IPV6)) 345 goto drop; 346 347 ilwt = ioam6_lwt_state(dst->lwtstate); 348 349 /* Check for insertion frequency (i.e., "k over n" insertions) */ 350 pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt); 351 if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k) 352 goto out; 353 354 local_bh_disable(); 355 cache_dst = dst_cache_get(&ilwt->cache); 356 local_bh_enable(); 357 358 switch (ilwt->mode) { 359 case IOAM6_IPTUNNEL_MODE_INLINE: 360 do_inline: 361 /* Direct insertion - if there is no Hop-by-Hop yet */ 362 if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) 363 goto out; 364 365 err = ioam6_do_inline(net, skb, &ilwt->tuninfo, cache_dst); 366 if (unlikely(err)) 367 goto drop; 368 369 break; 370 case IOAM6_IPTUNNEL_MODE_ENCAP: 371 do_encap: 372 /* Encapsulation (ip6ip6) */ 373 err = ioam6_do_encap(net, skb, &ilwt->tuninfo, 374 ilwt->has_tunsrc, &ilwt->tunsrc, 375 &ilwt->tundst, cache_dst); 376 if (unlikely(err)) 377 goto drop; 378 379 break; 380 case IOAM6_IPTUNNEL_MODE_AUTO: 381 /* Automatic (RFC8200 compliant): 382 * - local packets -> INLINE mode 383 * - in-transit packets -> ENCAP mode 384 */ 385 if (!skb->dev) 386 goto do_inline; 387 388 goto do_encap; 389 default: 390 goto drop; 391 } 392 393 if (unlikely(!cache_dst)) { 394 struct ipv6hdr *hdr = ipv6_hdr(skb); 395 struct flowi6 fl6; 396 397 memset(&fl6, 0, sizeof(fl6)); 398 fl6.daddr = hdr->daddr; 399 fl6.saddr = hdr->saddr; 400 fl6.flowlabel = ip6_flowinfo(hdr); 401 fl6.flowi6_mark = skb->mark; 402 fl6.flowi6_proto = hdr->nexthdr; 403 404 cache_dst = ip6_route_output(net, NULL, &fl6); 405 if (cache_dst->error) { 406 err = cache_dst->error; 407 goto drop; 408 } 409 410 /* cache only if we don't create a dst reference loop */ 411 if (dst->lwtstate != cache_dst->lwtstate) { 412 local_bh_disable(); 413 dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); 414 local_bh_enable(); 415 } 416 417 err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); 418 if (unlikely(err)) 419 goto drop; 420 } 421 422 /* avoid lwtunnel_output() reentry loop when destination is the same 423 * after transformation (e.g., with the inline mode) 424 */ 425 if (dst->lwtstate != cache_dst->lwtstate) { 426 skb_dst_drop(skb); 427 skb_dst_set(skb, cache_dst); 428 return dst_output(net, sk, skb); 429 } 430 out: 431 dst_release(cache_dst); 432 return dst->lwtstate->orig_output(net, sk, skb); 433 drop: 434 dst_release(cache_dst); 435 kfree_skb(skb); 436 return err; 437 } 438 439 static void ioam6_destroy_state(struct lwtunnel_state *lwt) 440 { 441 dst_cache_destroy(&ioam6_lwt_state(lwt)->cache); 442 } 443 444 static int ioam6_fill_encap_info(struct sk_buff *skb, 445 struct lwtunnel_state *lwtstate) 446 { 447 struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate); 448 int err; 449 450 err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_K, ilwt->freq.k); 451 if (err) 452 goto ret; 453 454 err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_N, ilwt->freq.n); 455 if (err) 456 goto ret; 457 458 err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode); 459 if (err) 460 goto ret; 461 462 if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) { 463 if (ilwt->has_tunsrc) { 464 err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_SRC, 465 &ilwt->tunsrc); 466 if (err) 467 goto ret; 468 } 469 470 err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst); 471 if (err) 472 goto ret; 473 } 474 475 err = nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh), 476 &ilwt->tuninfo.traceh); 477 ret: 478 return err; 479 } 480 481 static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate) 482 { 483 struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate); 484 int nlsize; 485 486 nlsize = nla_total_size(sizeof(ilwt->freq.k)) + 487 nla_total_size(sizeof(ilwt->freq.n)) + 488 nla_total_size(sizeof(ilwt->mode)) + 489 nla_total_size(sizeof(ilwt->tuninfo.traceh)); 490 491 if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) { 492 if (ilwt->has_tunsrc) 493 nlsize += nla_total_size(sizeof(ilwt->tunsrc)); 494 495 nlsize += nla_total_size(sizeof(ilwt->tundst)); 496 } 497 498 return nlsize; 499 } 500 501 static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) 502 { 503 struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a); 504 struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b); 505 struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a); 506 struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b); 507 508 return (ilwt_a->freq.k != ilwt_b->freq.k || 509 ilwt_a->freq.n != ilwt_b->freq.n || 510 ilwt_a->mode != ilwt_b->mode || 511 ilwt_a->has_tunsrc != ilwt_b->has_tunsrc || 512 (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE && 513 !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) || 514 (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE && 515 ilwt_a->has_tunsrc && 516 !ipv6_addr_equal(&ilwt_a->tunsrc, &ilwt_b->tunsrc)) || 517 trace_a->namespace_id != trace_b->namespace_id); 518 } 519 520 static const struct lwtunnel_encap_ops ioam6_iptun_ops = { 521 .build_state = ioam6_build_state, 522 .destroy_state = ioam6_destroy_state, 523 .output = ioam6_output, 524 .fill_encap = ioam6_fill_encap_info, 525 .get_encap_size = ioam6_encap_nlsize, 526 .cmp_encap = ioam6_encap_cmp, 527 .owner = THIS_MODULE, 528 }; 529 530 int __init ioam6_iptunnel_init(void) 531 { 532 return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6); 533 } 534 535 void ioam6_iptunnel_exit(void) 536 { 537 lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6); 538 } 539