/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>

struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};

struct bpf_lwt {
	struct bpf_lwt_prog in;
	struct bpf_lwt_prog out;
	struct bpf_lwt_prog xmit;
	int family;
};

#define MAX_PROG_NAME 256

static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	return (struct bpf_lwt *)lwt->data;
}

#define NO_REDIRECT false
#define CAN_REDIRECT true

static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	int ret;

	/* Preempt disable is needed to protect per-cpu redirect_info between
	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
	 * access to maps strictly require a rcu_read_lock() for protection,
	 * mixing with BH RCU lock doesn't work.
	 */
	preempt_disable();
	rcu_read_lock();
	bpf_compute_data_pointers(skb);
	ret = bpf_prog_run_save_cb(lwt->prog, skb);
	rcu_read_unlock();

	switch (ret) {
	case BPF_OK:
		break;

	case BPF_REDIRECT:
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? : "<unknown>");
			ret = BPF_OK;
		} else {
			ret = skb_do_redirect(skb);
			if (ret == 0)
				ret = BPF_REDIRECT;
		}
		break;

	case BPF_DROP:
		kfree_skb(skb);
		ret = -EPERM;
		break;

	default:
		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
		kfree_skb(skb);
		ret = -EINVAL;
		break;
	}

	preempt_enable();

	return ret;
}

static int bpf_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->in.prog) {
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_input)) {
		pr_warn_once("orig_input not set on dst for prog %s\n",
			     bpf->in.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_input(skb);
}

static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->out.prog) {
		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_output)) {
		pr_warn_once("orig_output not set on dst for prog %s\n",
			     bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_output(net, sk, skb);
}

static int xmit_check_hhlen(struct sk_buff *skb)
{
	int hh_len = skb_dst(skb)->dev->hard_header_len;

	if (skb_headroom(skb) < hh_len) {
		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
			return -ENOMEM;
	}

	return 0;
}

static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->xmit.prog) {
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
			/* If the header was expanded, headroom might be too
			 * small for L2 header to come, expand as needed.
			 */
			ret = xmit_check_hhlen(skb);
			if (unlikely(ret))
				return ret;

			return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
			return LWTUNNEL_XMIT_DONE;
		default:
			return ret;
		}
	}

	return LWTUNNEL_XMIT_CONTINUE;
}

static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	if (prog->prog)
		bpf_prog_put(prog->prog);

	kfree(prog->name);
}

static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	bpf_lwt_prog_destroy(&bpf->in);
	bpf_lwt_prog_destroy(&bpf->out);
	bpf_lwt_prog_destroy(&bpf->xmit);
}

static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
				.len = MAX_PROG_NAME },
};

static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
			  enum bpf_prog_type type)
{
	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
	struct bpf_prog *p;
	int ret;
	u32 fd;

	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
			       NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
		return -EINVAL;

	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
	if (!prog->name)
		return -ENOMEM;

	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
	p = bpf_prog_get_type(fd, type);
	if (IS_ERR(p))
		return PTR_ERR(p);

	prog->prog = p;

	return 0;
}

static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
};

static int bpf_build_state(struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWT_BPF_MAX + 1];
	struct lwtunnel_state *newts;
	struct bpf_lwt *bpf;
	int ret;

	if (family != AF_INET && family != AF_INET6)
		return -EAFNOSUPPORT;

	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
		return -EINVAL;

	newts = lwtunnel_state_alloc(sizeof(*bpf));
	if (!newts)
		return -ENOMEM;

	newts->type = LWTUNNEL_ENCAP_BPF;
	bpf = bpf_lwt_lwtunnel(newts);

	if (tb[LWT_BPF_IN]) {
		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_OUT]) {
		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT]) {
		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT_HEADROOM]) {
		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

		if (headroom > LWT_BPF_MAX_HEADROOM) {
			ret = -ERANGE;
			goto errout;
		}

		newts->headroom = headroom;
	}

	bpf->family = family;
	*ts = newts;

	return 0;

errout:
	bpf_destroy_state(newts);
	kfree(newts);
	return ret;
}
static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *nest;

	if (!prog->prog)
		return 0;

	nest = nla_nest_start(skb, attr);
	if (!nest)
		return -EMSGSIZE;

	if (prog->name &&
	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
		return -EMSGSIZE;

	return nla_nest_end(skb, nest);
}

static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}

static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int nest_len = nla_total_size(sizeof(struct nlattr)) +
		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
		       0;

	return nest_len + /* LWT_BPF_IN */
	       nest_len + /* LWT_BPF_OUT */
	       nest_len + /* LWT_BPF_XMIT */
	       0;
}

static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	/* FIXME:
	 * The LWT state is currently rebuilt for delete requests which
	 * results in a new bpf_prog instance. Comparing names for now.
	 */
	if (!a->name && !b->name)
		return 0;

	if (!a->name || !b->name)
		return 1;

	return strcmp(a->name, b->name);
}

static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}

static const struct lwtunnel_encap_ops bpf_encap_ops = {
	.build_state	= bpf_build_state,
	.destroy_state	= bpf_destroy_state,
	.input		= bpf_input,
	.output		= bpf_output,
	.xmit		= bpf_xmit,
	.fill_encap	= bpf_fill_encap_info,
	.get_encap_size	= bpf_encap_nlsize,
	.cmp_encap	= bpf_encap_cmp,
	.owner		= THIS_MODULE,
};

static int __init bpf_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)
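
/* Example usage (illustrative sketch, not part of the upstream file): a
 * minimal program of type BPF_PROG_TYPE_LWT_XMIT that returns BPF_OK so the
 * packet continues on the original dst output path, plus an iproute2 command
 * attaching it to a route. The file, section and function names (prog_lwt.c,
 * "lwt_xmit", lwt_xmit_noop) and the interface/prefix are assumptions made
 * for the example.
 *
 *	// prog_lwt.c -- build with: clang -O2 -target bpf -c prog_lwt.c -o prog_lwt.o
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	SEC("lwt_xmit")
 *	int lwt_xmit_noop(struct __sk_buff *skb)
 *	{
 *		return BPF_OK;	// let bpf_xmit() continue with LWTUNNEL_XMIT_CONTINUE
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 *
 * Attached to a route, for example:
 *
 *	ip route add 10.0.0.0/8 encap bpf xmit obj prog_lwt.o section lwt_xmit dev eth0
 */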