xref: /linux/net/openvswitch/flow.c (revision c14af233fbe279d0e561ecf84f1208b1bae087ef)
1 /*
2  * Copyright (c) 2007-2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #include "flow.h"
20 #include "datapath.h"
21 #include <linux/uaccess.h>
22 #include <linux/netdevice.h>
23 #include <linux/etherdevice.h>
24 #include <linux/if_ether.h>
25 #include <linux/if_vlan.h>
26 #include <net/llc_pdu.h>
27 #include <linux/kernel.h>
28 #include <linux/jhash.h>
29 #include <linux/jiffies.h>
30 #include <linux/llc.h>
31 #include <linux/module.h>
32 #include <linux/in.h>
33 #include <linux/rcupdate.h>
34 #include <linux/if_arp.h>
35 #include <linux/ip.h>
36 #include <linux/ipv6.h>
37 #include <linux/sctp.h>
38 #include <linux/smp.h>
39 #include <linux/tcp.h>
40 #include <linux/udp.h>
41 #include <linux/icmp.h>
42 #include <linux/icmpv6.h>
43 #include <linux/rculist.h>
44 #include <net/ip.h>
45 #include <net/ip_tunnels.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 
49 u64 ovs_flow_used_time(unsigned long flow_jiffies)
50 {
51 	struct timespec cur_ts;
52 	u64 cur_ms, idle_ms;
53 
54 	ktime_get_ts(&cur_ts);
55 	idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
56 	cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
57 		 cur_ts.tv_nsec / NSEC_PER_MSEC;
58 
59 	return cur_ms - idle_ms;
60 }
61 
/* Read the leading 16 bits of tcp_flag_word(@tp) as a big-endian value and
 * keep only the bits selected by the network-order mask 0x0FFF.
 */
#define TCP_FLAGS_BE16(tp) \
	(*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF))
63 
64 void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
65 {
66 	struct flow_stats *stats;
67 	__be16 tcp_flags = 0;
68 
69 	if (!flow->stats.is_percpu)
70 		stats = flow->stats.stat;
71 	else
72 		stats = this_cpu_ptr(flow->stats.cpu_stats);
73 
74 	if ((flow->key.eth.type == htons(ETH_P_IP) ||
75 	     flow->key.eth.type == htons(ETH_P_IPV6)) &&
76 	    flow->key.ip.frag != OVS_FRAG_TYPE_LATER &&
77 	    flow->key.ip.proto == IPPROTO_TCP &&
78 	    likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
79 		tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
80 	}
81 
82 	spin_lock(&stats->lock);
83 	stats->used = jiffies;
84 	stats->packet_count++;
85 	stats->byte_count += skb->len;
86 	stats->tcp_flags |= tcp_flags;
87 	spin_unlock(&stats->lock);
88 }
89 
90 static void stats_read(struct flow_stats *stats,
91 		       struct ovs_flow_stats *ovs_stats,
92 		       unsigned long *used, __be16 *tcp_flags)
93 {
94 	spin_lock(&stats->lock);
95 	if (!*used || time_after(stats->used, *used))
96 		*used = stats->used;
97 	*tcp_flags |= stats->tcp_flags;
98 	ovs_stats->n_packets += stats->packet_count;
99 	ovs_stats->n_bytes += stats->byte_count;
100 	spin_unlock(&stats->lock);
101 }
102 
103 void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
104 			unsigned long *used, __be16 *tcp_flags)
105 {
106 	int cpu, cur_cpu;
107 
108 	*used = 0;
109 	*tcp_flags = 0;
110 	memset(ovs_stats, 0, sizeof(*ovs_stats));
111 
112 	if (!flow->stats.is_percpu) {
113 		stats_read(flow->stats.stat, ovs_stats, used, tcp_flags);
114 	} else {
115 		cur_cpu = get_cpu();
116 		for_each_possible_cpu(cpu) {
117 			struct flow_stats *stats;
118 
119 			if (cpu == cur_cpu)
120 				local_bh_disable();
121 
122 			stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
123 			stats_read(stats, ovs_stats, used, tcp_flags);
124 
125 			if (cpu == cur_cpu)
126 				local_bh_enable();
127 		}
128 		put_cpu();
129 	}
130 }
131 
132 static void stats_reset(struct flow_stats *stats)
133 {
134 	spin_lock(&stats->lock);
135 	stats->used = 0;
136 	stats->packet_count = 0;
137 	stats->byte_count = 0;
138 	stats->tcp_flags = 0;
139 	spin_unlock(&stats->lock);
140 }
141 
142 void ovs_flow_stats_clear(struct sw_flow *flow)
143 {
144 	int cpu, cur_cpu;
145 
146 	if (!flow->stats.is_percpu) {
147 		stats_reset(flow->stats.stat);
148 	} else {
149 		cur_cpu = get_cpu();
150 
151 		for_each_possible_cpu(cpu) {
152 
153 			if (cpu == cur_cpu)
154 				local_bh_disable();
155 
156 			stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu));
157 
158 			if (cpu == cur_cpu)
159 				local_bh_enable();
160 		}
161 		put_cpu();
162 	}
163 }
164 
165 static int check_header(struct sk_buff *skb, int len)
166 {
167 	if (unlikely(skb->len < len))
168 		return -EINVAL;
169 	if (unlikely(!pskb_may_pull(skb, len)))
170 		return -ENOMEM;
171 	return 0;
172 }
173 
174 static bool arphdr_ok(struct sk_buff *skb)
175 {
176 	return pskb_may_pull(skb, skb_network_offset(skb) +
177 				  sizeof(struct arp_eth_header));
178 }
179 
180 static int check_iphdr(struct sk_buff *skb)
181 {
182 	unsigned int nh_ofs = skb_network_offset(skb);
183 	unsigned int ip_len;
184 	int err;
185 
186 	err = check_header(skb, nh_ofs + sizeof(struct iphdr));
187 	if (unlikely(err))
188 		return err;
189 
190 	ip_len = ip_hdrlen(skb);
191 	if (unlikely(ip_len < sizeof(struct iphdr) ||
192 		     skb->len < nh_ofs + ip_len))
193 		return -EINVAL;
194 
195 	skb_set_transport_header(skb, nh_ofs + ip_len);
196 	return 0;
197 }
198 
199 static bool tcphdr_ok(struct sk_buff *skb)
200 {
201 	int th_ofs = skb_transport_offset(skb);
202 	int tcp_len;
203 
204 	if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))))
205 		return false;
206 
207 	tcp_len = tcp_hdrlen(skb);
208 	if (unlikely(tcp_len < sizeof(struct tcphdr) ||
209 		     skb->len < th_ofs + tcp_len))
210 		return false;
211 
212 	return true;
213 }
214 
215 static bool udphdr_ok(struct sk_buff *skb)
216 {
217 	return pskb_may_pull(skb, skb_transport_offset(skb) +
218 				  sizeof(struct udphdr));
219 }
220 
221 static bool sctphdr_ok(struct sk_buff *skb)
222 {
223 	return pskb_may_pull(skb, skb_transport_offset(skb) +
224 				  sizeof(struct sctphdr));
225 }
226 
227 static bool icmphdr_ok(struct sk_buff *skb)
228 {
229 	return pskb_may_pull(skb, skb_transport_offset(skb) +
230 				  sizeof(struct icmphdr));
231 }
232 
233 static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
234 {
235 	unsigned int nh_ofs = skb_network_offset(skb);
236 	unsigned int nh_len;
237 	int payload_ofs;
238 	struct ipv6hdr *nh;
239 	uint8_t nexthdr;
240 	__be16 frag_off;
241 	int err;
242 
243 	err = check_header(skb, nh_ofs + sizeof(*nh));
244 	if (unlikely(err))
245 		return err;
246 
247 	nh = ipv6_hdr(skb);
248 	nexthdr = nh->nexthdr;
249 	payload_ofs = (u8 *)(nh + 1) - skb->data;
250 
251 	key->ip.proto = NEXTHDR_NONE;
252 	key->ip.tos = ipv6_get_dsfield(nh);
253 	key->ip.ttl = nh->hop_limit;
254 	key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
255 	key->ipv6.addr.src = nh->saddr;
256 	key->ipv6.addr.dst = nh->daddr;
257 
258 	payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
259 	if (unlikely(payload_ofs < 0))
260 		return -EINVAL;
261 
262 	if (frag_off) {
263 		if (frag_off & htons(~0x7))
264 			key->ip.frag = OVS_FRAG_TYPE_LATER;
265 		else
266 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
267 	}
268 
269 	nh_len = payload_ofs - nh_ofs;
270 	skb_set_transport_header(skb, nh_ofs + nh_len);
271 	key->ip.proto = nexthdr;
272 	return nh_len;
273 }
274 
275 static bool icmp6hdr_ok(struct sk_buff *skb)
276 {
277 	return pskb_may_pull(skb, skb_transport_offset(skb) +
278 				  sizeof(struct icmp6hdr));
279 }
280 
281 static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
282 {
283 	struct qtag_prefix {
284 		__be16 eth_type; /* ETH_P_8021Q */
285 		__be16 tci;
286 	};
287 	struct qtag_prefix *qp;
288 
289 	if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16)))
290 		return 0;
291 
292 	if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
293 					 sizeof(__be16))))
294 		return -ENOMEM;
295 
296 	qp = (struct qtag_prefix *) skb->data;
297 	key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT);
298 	__skb_pull(skb, sizeof(struct qtag_prefix));
299 
300 	return 0;
301 }
302 
303 static __be16 parse_ethertype(struct sk_buff *skb)
304 {
305 	struct llc_snap_hdr {
306 		u8  dsap;  /* Always 0xAA */
307 		u8  ssap;  /* Always 0xAA */
308 		u8  ctrl;
309 		u8  oui[3];
310 		__be16 ethertype;
311 	};
312 	struct llc_snap_hdr *llc;
313 	__be16 proto;
314 
315 	proto = *(__be16 *) skb->data;
316 	__skb_pull(skb, sizeof(__be16));
317 
318 	if (ntohs(proto) >= ETH_P_802_3_MIN)
319 		return proto;
320 
321 	if (skb->len < sizeof(struct llc_snap_hdr))
322 		return htons(ETH_P_802_2);
323 
324 	if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr))))
325 		return htons(0);
326 
327 	llc = (struct llc_snap_hdr *) skb->data;
328 	if (llc->dsap != LLC_SAP_SNAP ||
329 	    llc->ssap != LLC_SAP_SNAP ||
330 	    (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0)
331 		return htons(ETH_P_802_2);
332 
333 	__skb_pull(skb, sizeof(struct llc_snap_hdr));
334 
335 	if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN)
336 		return llc->ethertype;
337 
338 	return htons(ETH_P_802_2);
339 }
340 
341 static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
342 			int nh_len)
343 {
344 	struct icmp6hdr *icmp = icmp6_hdr(skb);
345 
346 	/* The ICMPv6 type and code fields use the 16-bit transport port
347 	 * fields, so we need to store them in 16-bit network byte order.
348 	 */
349 	key->ipv6.tp.src = htons(icmp->icmp6_type);
350 	key->ipv6.tp.dst = htons(icmp->icmp6_code);
351 
352 	if (icmp->icmp6_code == 0 &&
353 	    (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
354 	     icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) {
355 		int icmp_len = skb->len - skb_transport_offset(skb);
356 		struct nd_msg *nd;
357 		int offset;
358 
359 		/* In order to process neighbor discovery options, we need the
360 		 * entire packet.
361 		 */
362 		if (unlikely(icmp_len < sizeof(*nd)))
363 			return 0;
364 
365 		if (unlikely(skb_linearize(skb)))
366 			return -ENOMEM;
367 
368 		nd = (struct nd_msg *)skb_transport_header(skb);
369 		key->ipv6.nd.target = nd->target;
370 
371 		icmp_len -= sizeof(*nd);
372 		offset = 0;
373 		while (icmp_len >= 8) {
374 			struct nd_opt_hdr *nd_opt =
375 				 (struct nd_opt_hdr *)(nd->opt + offset);
376 			int opt_len = nd_opt->nd_opt_len * 8;
377 
378 			if (unlikely(!opt_len || opt_len > icmp_len))
379 				return 0;
380 
381 			/* Store the link layer address if the appropriate
382 			 * option is provided.  It is considered an error if
383 			 * the same link layer option is specified twice.
384 			 */
385 			if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR
386 			    && opt_len == 8) {
387 				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
388 					goto invalid;
389 				memcpy(key->ipv6.nd.sll,
390 				    &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
391 			} else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
392 				   && opt_len == 8) {
393 				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
394 					goto invalid;
395 				memcpy(key->ipv6.nd.tll,
396 				    &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
397 			}
398 
399 			icmp_len -= opt_len;
400 			offset += opt_len;
401 		}
402 	}
403 
404 	return 0;
405 
406 invalid:
407 	memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
408 	memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
409 	memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
410 
411 	return 0;
412 }
413 
414 /**
415  * ovs_flow_extract - extracts a flow key from an Ethernet frame.
416  * @skb: sk_buff that contains the frame, with skb->data pointing to the
417  * Ethernet header
418  * @in_port: port number on which @skb was received.
419  * @key: output flow key
420  *
421  * The caller must ensure that skb->len >= ETH_HLEN.
422  *
423  * Returns 0 if successful, otherwise a negative errno value.
424  *
425  * Initializes @skb header pointers as follows:
426  *
427  *    - skb->mac_header: the Ethernet header.
428  *
429  *    - skb->network_header: just past the Ethernet header, or just past the
430  *      VLAN header, to the first byte of the Ethernet payload.
431  *
432  *    - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
433  *      on output, then just past the IP header, if one is present and
434  *      of a correct length, otherwise the same as skb->network_header.
435  *      For other key->eth.type values it is left untouched.
436  */
437 int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
438 {
439 	int error;
440 	struct ethhdr *eth;
441 
442 	memset(key, 0, sizeof(*key));
443 
444 	key->phy.priority = skb->priority;
445 	if (OVS_CB(skb)->tun_key)
446 		memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key));
447 	key->phy.in_port = in_port;
448 	key->phy.skb_mark = skb->mark;
449 
450 	skb_reset_mac_header(skb);
451 
452 	/* Link layer.  We are guaranteed to have at least the 14 byte Ethernet
453 	 * header in the linear data area.
454 	 */
455 	eth = eth_hdr(skb);
456 	memcpy(key->eth.src, eth->h_source, ETH_ALEN);
457 	memcpy(key->eth.dst, eth->h_dest, ETH_ALEN);
458 
459 	__skb_pull(skb, 2 * ETH_ALEN);
460 	/* We are going to push all headers that we pull, so no need to
461 	 * update skb->csum here.
462 	 */
463 
464 	if (vlan_tx_tag_present(skb))
465 		key->eth.tci = htons(skb->vlan_tci);
466 	else if (eth->h_proto == htons(ETH_P_8021Q))
467 		if (unlikely(parse_vlan(skb, key)))
468 			return -ENOMEM;
469 
470 	key->eth.type = parse_ethertype(skb);
471 	if (unlikely(key->eth.type == htons(0)))
472 		return -ENOMEM;
473 
474 	skb_reset_network_header(skb);
475 	__skb_push(skb, skb->data - skb_mac_header(skb));
476 
477 	/* Network layer. */
478 	if (key->eth.type == htons(ETH_P_IP)) {
479 		struct iphdr *nh;
480 		__be16 offset;
481 
482 		error = check_iphdr(skb);
483 		if (unlikely(error)) {
484 			if (error == -EINVAL) {
485 				skb->transport_header = skb->network_header;
486 				error = 0;
487 			}
488 			return error;
489 		}
490 
491 		nh = ip_hdr(skb);
492 		key->ipv4.addr.src = nh->saddr;
493 		key->ipv4.addr.dst = nh->daddr;
494 
495 		key->ip.proto = nh->protocol;
496 		key->ip.tos = nh->tos;
497 		key->ip.ttl = nh->ttl;
498 
499 		offset = nh->frag_off & htons(IP_OFFSET);
500 		if (offset) {
501 			key->ip.frag = OVS_FRAG_TYPE_LATER;
502 			return 0;
503 		}
504 		if (nh->frag_off & htons(IP_MF) ||
505 			 skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
506 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
507 
508 		/* Transport layer. */
509 		if (key->ip.proto == IPPROTO_TCP) {
510 			if (tcphdr_ok(skb)) {
511 				struct tcphdr *tcp = tcp_hdr(skb);
512 				key->ipv4.tp.src = tcp->source;
513 				key->ipv4.tp.dst = tcp->dest;
514 				key->ipv4.tp.flags = TCP_FLAGS_BE16(tcp);
515 			}
516 		} else if (key->ip.proto == IPPROTO_UDP) {
517 			if (udphdr_ok(skb)) {
518 				struct udphdr *udp = udp_hdr(skb);
519 				key->ipv4.tp.src = udp->source;
520 				key->ipv4.tp.dst = udp->dest;
521 			}
522 		} else if (key->ip.proto == IPPROTO_SCTP) {
523 			if (sctphdr_ok(skb)) {
524 				struct sctphdr *sctp = sctp_hdr(skb);
525 				key->ipv4.tp.src = sctp->source;
526 				key->ipv4.tp.dst = sctp->dest;
527 			}
528 		} else if (key->ip.proto == IPPROTO_ICMP) {
529 			if (icmphdr_ok(skb)) {
530 				struct icmphdr *icmp = icmp_hdr(skb);
531 				/* The ICMP type and code fields use the 16-bit
532 				 * transport port fields, so we need to store
533 				 * them in 16-bit network byte order. */
534 				key->ipv4.tp.src = htons(icmp->type);
535 				key->ipv4.tp.dst = htons(icmp->code);
536 			}
537 		}
538 
539 	} else if ((key->eth.type == htons(ETH_P_ARP) ||
540 		   key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) {
541 		struct arp_eth_header *arp;
542 
543 		arp = (struct arp_eth_header *)skb_network_header(skb);
544 
545 		if (arp->ar_hrd == htons(ARPHRD_ETHER)
546 				&& arp->ar_pro == htons(ETH_P_IP)
547 				&& arp->ar_hln == ETH_ALEN
548 				&& arp->ar_pln == 4) {
549 
550 			/* We only match on the lower 8 bits of the opcode. */
551 			if (ntohs(arp->ar_op) <= 0xff)
552 				key->ip.proto = ntohs(arp->ar_op);
553 			memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
554 			memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
555 			memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
556 			memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
557 		}
558 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
559 		int nh_len;             /* IPv6 Header + Extensions */
560 
561 		nh_len = parse_ipv6hdr(skb, key);
562 		if (unlikely(nh_len < 0)) {
563 			if (nh_len == -EINVAL) {
564 				skb->transport_header = skb->network_header;
565 				error = 0;
566 			} else {
567 				error = nh_len;
568 			}
569 			return error;
570 		}
571 
572 		if (key->ip.frag == OVS_FRAG_TYPE_LATER)
573 			return 0;
574 		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
575 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
576 
577 		/* Transport layer. */
578 		if (key->ip.proto == NEXTHDR_TCP) {
579 			if (tcphdr_ok(skb)) {
580 				struct tcphdr *tcp = tcp_hdr(skb);
581 				key->ipv6.tp.src = tcp->source;
582 				key->ipv6.tp.dst = tcp->dest;
583 				key->ipv6.tp.flags = TCP_FLAGS_BE16(tcp);
584 			}
585 		} else if (key->ip.proto == NEXTHDR_UDP) {
586 			if (udphdr_ok(skb)) {
587 				struct udphdr *udp = udp_hdr(skb);
588 				key->ipv6.tp.src = udp->source;
589 				key->ipv6.tp.dst = udp->dest;
590 			}
591 		} else if (key->ip.proto == NEXTHDR_SCTP) {
592 			if (sctphdr_ok(skb)) {
593 				struct sctphdr *sctp = sctp_hdr(skb);
594 				key->ipv6.tp.src = sctp->source;
595 				key->ipv6.tp.dst = sctp->dest;
596 			}
597 		} else if (key->ip.proto == NEXTHDR_ICMP) {
598 			if (icmp6hdr_ok(skb)) {
599 				error = parse_icmpv6(skb, key, nh_len);
600 				if (error)
601 					return error;
602 			}
603 		}
604 	}
605 
606 	return 0;
607 }
608