/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>

struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};

struct bpf_lwt {
	struct bpf_lwt_prog in;
	struct bpf_lwt_prog out;
	struct bpf_lwt_prog xmit;
	int family;
};

#define MAX_PROG_NAME 256

static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	return (struct bpf_lwt *)lwt->data;
}

#define NO_REDIRECT false
#define CAN_REDIRECT true

static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	int ret;

	/* Preempt disable is needed to protect per-cpu redirect_info between
	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
	 * access to maps strictly require a rcu_read_lock() for protection,
	 * mixing with BH RCU lock doesn't work.
	 */
	preempt_disable();
	rcu_read_lock();
	bpf_compute_data_end(skb);
	ret = bpf_prog_run_save_cb(lwt->prog, skb);
	rcu_read_unlock();

	switch (ret) {
	case BPF_OK:
		break;

	case BPF_REDIRECT:
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? : "<unknown>");
			ret = BPF_OK;
		} else {
			ret = skb_do_redirect(skb);
			if (ret == 0)
				ret = BPF_REDIRECT;
		}
		break;

	case BPF_DROP:
		kfree_skb(skb);
		ret = -EPERM;
		break;

	default:
		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
		kfree_skb(skb);
		ret = -EINVAL;
		break;
	}

	preempt_enable();

	return ret;
}

static int bpf_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->in.prog) {
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_input)) {
		pr_warn_once("orig_input not set on dst for prog %s\n",
			     bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_input(skb);
}

static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->out.prog) {
		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_output)) {
		pr_warn_once("orig_output not set on dst for prog %s\n",
			     bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_output(net, sk, skb);
}

static int xmit_check_hhlen(struct sk_buff *skb)
{
	int hh_len = skb_dst(skb)->dev->hard_header_len;

	if (skb_headroom(skb) < hh_len) {
		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
			return -ENOMEM;
	}

	return 0;
}

static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->xmit.prog) {
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
			/* If the header was expanded, headroom might be too
			 * small for L2 header to come, expand as needed.
			 */
			ret = xmit_check_hhlen(skb);
			if (unlikely(ret))
				return ret;

			return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
			return LWTUNNEL_XMIT_DONE;
		default:
			return ret;
		}
	}

	return LWTUNNEL_XMIT_CONTINUE;
}

static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	if (prog->prog)
		bpf_prog_put(prog->prog);

	kfree(prog->name);
}

static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	bpf_lwt_prog_destroy(&bpf->in);
	bpf_lwt_prog_destroy(&bpf->out);
	bpf_lwt_prog_destroy(&bpf->xmit);
}

static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
				.len = MAX_PROG_NAME },
};

static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
			  enum bpf_prog_type type)
{
	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
	struct bpf_prog *p;
	int ret;
	u32 fd;

	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
			       NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
		return -EINVAL;

	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
	if (!prog->name)
		return -ENOMEM;

	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
	p = bpf_prog_get_type(fd, type);
	if (IS_ERR(p))
		return PTR_ERR(p);

	prog->prog = p;

	return 0;
}

static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
};

static int bpf_build_state(struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts)
{
	struct nlattr *tb[LWT_BPF_MAX + 1];
	struct lwtunnel_state *newts;
	struct bpf_lwt *bpf;
	int ret;

	if (family != AF_INET && family != AF_INET6)
		return -EAFNOSUPPORT;

	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
		return -EINVAL;

	newts = lwtunnel_state_alloc(sizeof(*bpf));
	if (!newts)
		return -ENOMEM;

	newts->type = LWTUNNEL_ENCAP_BPF;
	bpf = bpf_lwt_lwtunnel(newts);

	if (tb[LWT_BPF_IN]) {
		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_OUT]) {
		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT]) {
		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT_HEADROOM]) {
		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

		if (headroom > LWT_BPF_MAX_HEADROOM) {
			ret = -ERANGE;
			goto errout;
		}

		newts->headroom = headroom;
	}

	bpf->family = family;
	*ts = newts;

	return 0;

errout:
	bpf_destroy_state(newts);
	kfree(newts);
	return ret;
}

static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *nest;

	if (!prog->prog)
		return 0;

	nest = nla_nest_start(skb, attr);
	if (!nest)
		return -EMSGSIZE;

	if (prog->name &&
	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
		return -EMSGSIZE;

	return nla_nest_end(skb, nest);
}

static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}

static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int nest_len = nla_total_size(sizeof(struct nlattr)) +
		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
		       0;

	return nest_len + /* LWT_BPF_IN */
	       nest_len + /* LWT_BPF_OUT */
	       nest_len + /* LWT_BPF_XMIT */
	       0;
}

static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	/* FIXME:
	 * The LWT state is currently rebuilt for delete requests which
	 * results in a new bpf_prog instance. Comparing names for now.
	 */
	if (!a->name && !b->name)
		return 0;

	if (!a->name || !b->name)
		return 1;

	return strcmp(a->name, b->name);
}

static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}

static const struct lwtunnel_encap_ops bpf_encap_ops = {
	.build_state	= bpf_build_state,
	.destroy_state	= bpf_destroy_state,
	.input		= bpf_input,
	.output		= bpf_output,
	.xmit		= bpf_xmit,
	.fill_encap	= bpf_fill_encap_info,
	.get_encap_size	= bpf_encap_nlsize,
	.cmp_encap	= bpf_encap_cmp,
	.owner		= THIS_MODULE,
};

static int __init bpf_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)