1 #include <linux/skbuff.h> 2 #include <linux/export.h> 3 #include <linux/ip.h> 4 #include <linux/ipv6.h> 5 #include <linux/if_vlan.h> 6 #include <net/ip.h> 7 #include <net/ipv6.h> 8 #include <linux/igmp.h> 9 #include <linux/icmp.h> 10 #include <linux/sctp.h> 11 #include <linux/dccp.h> 12 #include <linux/if_tunnel.h> 13 #include <linux/if_pppox.h> 14 #include <linux/ppp_defs.h> 15 #include <net/flow_keys.h> 16 #include <scsi/fc/fc_fcoe.h> 17 18 /* copy saddr & daddr, possibly using 64bit load/store 19 * Equivalent to : flow->src = iph->saddr; 20 * flow->dst = iph->daddr; 21 */ 22 static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) 23 { 24 BUILD_BUG_ON(offsetof(typeof(*flow), dst) != 25 offsetof(typeof(*flow), src) + sizeof(flow->src)); 26 memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); 27 } 28 29 /** 30 * __skb_flow_get_ports - extract the upper layer ports and return them 31 * @skb: sk_buff to extract the ports from 32 * @thoff: transport header offset 33 * @ip_proto: protocol for which to get port offset 34 * @data: raw buffer pointer to the packet, if NULL use skb->data 35 * @hlen: packet header length, if @data is NULL use skb_headlen(skb) 36 * 37 * The function will try to retrieve the ports at offset thoff + poff where poff 38 * is the protocol port offset returned from proto_ports_offset 39 */ 40 __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, 41 void *data, int hlen) 42 { 43 int poff = proto_ports_offset(ip_proto); 44 45 if (!data) { 46 data = skb->data; 47 hlen = skb_headlen(skb); 48 } 49 50 if (poff >= 0) { 51 __be32 *ports, _ports; 52 53 ports = __skb_header_pointer(skb, thoff + poff, 54 sizeof(_ports), data, hlen, &_ports); 55 if (ports) 56 return *ports; 57 } 58 59 return 0; 60 } 61 EXPORT_SYMBOL(__skb_flow_get_ports); 62 63 /** 64 * __skb_flow_dissect - extract the flow_keys struct and return it 65 * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified 66 * @data: raw buffer pointer to the packet, if NULL use skb->data 67 * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol 68 * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) 69 * @hlen: packet header length, if @data is NULL use skb_headlen(skb) 70 * 71 * The function will try to retrieve the struct flow_keys from either the skbuff 72 * or a raw buffer specified by the rest parameters 73 */ 74 bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, 75 void *data, __be16 proto, int nhoff, int hlen) 76 { 77 u8 ip_proto; 78 79 if (!data) { 80 data = skb->data; 81 proto = skb->protocol; 82 nhoff = skb_network_offset(skb); 83 hlen = skb_headlen(skb); 84 } 85 86 memset(flow, 0, sizeof(*flow)); 87 88 again: 89 switch (proto) { 90 case htons(ETH_P_IP): { 91 const struct iphdr *iph; 92 struct iphdr _iph; 93 ip: 94 iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); 95 if (!iph || iph->ihl < 5) 96 return false; 97 nhoff += iph->ihl * 4; 98 99 ip_proto = iph->protocol; 100 if (ip_is_fragment(iph)) 101 ip_proto = 0; 102 103 iph_to_flow_copy_addrs(flow, iph); 104 break; 105 } 106 case htons(ETH_P_IPV6): { 107 const struct ipv6hdr *iph; 108 struct ipv6hdr _iph; 109 __be32 flow_label; 110 111 ipv6: 112 iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); 113 if (!iph) 114 return false; 115 116 ip_proto = iph->nexthdr; 117 flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); 118 flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); 119 nhoff += sizeof(struct ipv6hdr); 120 121 /* skip the flow label processing if skb is NULL. The 122 * assumption here is that if there is no skb we are not 123 * looking for flow info as much as we are length. 124 */ 125 if (!skb) 126 break; 127 128 flow_label = ip6_flowlabel(iph); 129 if (flow_label) { 130 /* Awesome, IPv6 packet has a flow label so we can 131 * use that to represent the ports without any 132 * further dissection. 133 */ 134 flow->n_proto = proto; 135 flow->ip_proto = ip_proto; 136 flow->ports = flow_label; 137 flow->thoff = (u16)nhoff; 138 139 return true; 140 } 141 142 break; 143 } 144 case htons(ETH_P_8021AD): 145 case htons(ETH_P_8021Q): { 146 const struct vlan_hdr *vlan; 147 struct vlan_hdr _vlan; 148 149 vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); 150 if (!vlan) 151 return false; 152 153 proto = vlan->h_vlan_encapsulated_proto; 154 nhoff += sizeof(*vlan); 155 goto again; 156 } 157 case htons(ETH_P_PPP_SES): { 158 struct { 159 struct pppoe_hdr hdr; 160 __be16 proto; 161 } *hdr, _hdr; 162 hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); 163 if (!hdr) 164 return false; 165 proto = hdr->proto; 166 nhoff += PPPOE_SES_HLEN; 167 switch (proto) { 168 case htons(PPP_IP): 169 goto ip; 170 case htons(PPP_IPV6): 171 goto ipv6; 172 default: 173 return false; 174 } 175 } 176 case htons(ETH_P_FCOE): 177 flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); 178 /* fall through */ 179 default: 180 return false; 181 } 182 183 switch (ip_proto) { 184 case IPPROTO_GRE: { 185 struct gre_hdr { 186 __be16 flags; 187 __be16 proto; 188 } *hdr, _hdr; 189 190 hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); 191 if (!hdr) 192 return false; 193 /* 194 * Only look inside GRE if version zero and no 195 * routing 196 */ 197 if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { 198 proto = hdr->proto; 199 nhoff += 4; 200 if (hdr->flags & GRE_CSUM) 201 nhoff += 4; 202 if (hdr->flags & GRE_KEY) 203 nhoff += 4; 204 if (hdr->flags & GRE_SEQ) 205 nhoff += 4; 206 if (proto == htons(ETH_P_TEB)) { 207 const struct ethhdr *eth; 208 struct ethhdr _eth; 209 210 eth = __skb_header_pointer(skb, nhoff, 211 sizeof(_eth), 212 data, hlen, &_eth); 213 if (!eth) 214 return false; 215 proto = eth->h_proto; 216 nhoff += sizeof(*eth); 217 } 218 goto again; 219 } 220 break; 221 } 222 case IPPROTO_IPIP: 223 proto = htons(ETH_P_IP); 224 goto ip; 225 case IPPROTO_IPV6: 226 proto = htons(ETH_P_IPV6); 227 goto ipv6; 228 default: 229 break; 230 } 231 232 flow->n_proto = proto; 233 flow->ip_proto = ip_proto; 234 flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); 235 flow->thoff = (u16) nhoff; 236 237 return true; 238 } 239 EXPORT_SYMBOL(__skb_flow_dissect); 240 241 static u32 hashrnd __read_mostly; 242 static __always_inline void __flow_hash_secret_init(void) 243 { 244 net_get_random_once(&hashrnd, sizeof(hashrnd)); 245 } 246 247 static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) 248 { 249 __flow_hash_secret_init(); 250 return jhash_3words(a, b, c, hashrnd); 251 } 252 253 static inline u32 __flow_hash_from_keys(struct flow_keys *keys) 254 { 255 u32 hash; 256 257 /* get a consistent hash (same value on both flow directions) */ 258 if (((__force u32)keys->dst < (__force u32)keys->src) || 259 (((__force u32)keys->dst == (__force u32)keys->src) && 260 ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { 261 swap(keys->dst, keys->src); 262 swap(keys->port16[0], keys->port16[1]); 263 } 264 265 hash = __flow_hash_3words((__force u32)keys->dst, 266 (__force u32)keys->src, 267 (__force u32)keys->ports); 268 if (!hash) 269 hash = 1; 270 271 return hash; 272 } 273 274 u32 flow_hash_from_keys(struct flow_keys *keys) 275 { 276 return __flow_hash_from_keys(keys); 277 } 278 EXPORT_SYMBOL(flow_hash_from_keys); 279 280 /* 281 * __skb_get_hash: calculate a flow hash based on src/dst addresses 282 * and src/dst port numbers. Sets hash in skb to non-zero hash value 283 * on success, zero indicates no valid hash. Also, sets l4_hash in skb 284 * if hash is a canonical 4-tuple hash over transport ports. 285 */ 286 void __skb_get_hash(struct sk_buff *skb) 287 { 288 struct flow_keys keys; 289 290 if (!skb_flow_dissect(skb, &keys)) 291 return; 292 293 if (keys.ports) 294 skb->l4_hash = 1; 295 296 skb->sw_hash = 1; 297 298 skb->hash = __flow_hash_from_keys(&keys); 299 } 300 EXPORT_SYMBOL(__skb_get_hash); 301 302 /* 303 * Returns a Tx hash based on the given packet descriptor a Tx queues' number 304 * to be used as a distribution range. 305 */ 306 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, 307 unsigned int num_tx_queues) 308 { 309 u32 hash; 310 u16 qoffset = 0; 311 u16 qcount = num_tx_queues; 312 313 if (skb_rx_queue_recorded(skb)) { 314 hash = skb_get_rx_queue(skb); 315 while (unlikely(hash >= num_tx_queues)) 316 hash -= num_tx_queues; 317 return hash; 318 } 319 320 if (dev->num_tc) { 321 u8 tc = netdev_get_prio_tc_map(dev, skb->priority); 322 qoffset = dev->tc_to_txq[tc].offset; 323 qcount = dev->tc_to_txq[tc].count; 324 } 325 326 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; 327 } 328 EXPORT_SYMBOL(__skb_tx_hash); 329 330 u32 __skb_get_poff(const struct sk_buff *skb, void *data, 331 const struct flow_keys *keys, int hlen) 332 { 333 u32 poff = keys->thoff; 334 335 switch (keys->ip_proto) { 336 case IPPROTO_TCP: { 337 const struct tcphdr *tcph; 338 struct tcphdr _tcph; 339 340 tcph = __skb_header_pointer(skb, poff, sizeof(_tcph), 341 data, hlen, &_tcph); 342 if (!tcph) 343 return poff; 344 345 poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4); 346 break; 347 } 348 case IPPROTO_UDP: 349 case IPPROTO_UDPLITE: 350 poff += sizeof(struct udphdr); 351 break; 352 /* For the rest, we do not really care about header 353 * extensions at this point for now. 354 */ 355 case IPPROTO_ICMP: 356 poff += sizeof(struct icmphdr); 357 break; 358 case IPPROTO_ICMPV6: 359 poff += sizeof(struct icmp6hdr); 360 break; 361 case IPPROTO_IGMP: 362 poff += sizeof(struct igmphdr); 363 break; 364 case IPPROTO_DCCP: 365 poff += sizeof(struct dccp_hdr); 366 break; 367 case IPPROTO_SCTP: 368 poff += sizeof(struct sctphdr); 369 break; 370 } 371 372 return poff; 373 } 374 375 /* skb_get_poff() returns the offset to the payload as far as it could 376 * be dissected. The main user is currently BPF, so that we can dynamically 377 * truncate packets without needing to push actual payload to the user 378 * space and can analyze headers only, instead. 379 */ 380 u32 skb_get_poff(const struct sk_buff *skb) 381 { 382 struct flow_keys keys; 383 384 if (!skb_flow_dissect(skb, &keys)) 385 return 0; 386 387 return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); 388 } 389 390 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) 391 { 392 #ifdef CONFIG_XPS 393 struct xps_dev_maps *dev_maps; 394 struct xps_map *map; 395 int queue_index = -1; 396 397 rcu_read_lock(); 398 dev_maps = rcu_dereference(dev->xps_maps); 399 if (dev_maps) { 400 map = rcu_dereference( 401 dev_maps->cpu_map[raw_smp_processor_id()]); 402 if (map) { 403 if (map->len == 1) 404 queue_index = map->queues[0]; 405 else 406 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), 407 map->len)]; 408 if (unlikely(queue_index >= dev->real_num_tx_queues)) 409 queue_index = -1; 410 } 411 } 412 rcu_read_unlock(); 413 414 return queue_index; 415 #else 416 return -1; 417 #endif 418 } 419 420 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) 421 { 422 struct sock *sk = skb->sk; 423 int queue_index = sk_tx_queue_get(sk); 424 425 if (queue_index < 0 || skb->ooo_okay || 426 queue_index >= dev->real_num_tx_queues) { 427 int new_index = get_xps_queue(dev, skb); 428 if (new_index < 0) 429 new_index = skb_tx_hash(dev, skb); 430 431 if (queue_index != new_index && sk && 432 rcu_access_pointer(sk->sk_dst_cache)) 433 sk_tx_queue_set(sk, new_index); 434 435 queue_index = new_index; 436 } 437 438 return queue_index; 439 } 440 441 struct netdev_queue *netdev_pick_tx(struct net_device *dev, 442 struct sk_buff *skb, 443 void *accel_priv) 444 { 445 int queue_index = 0; 446 447 if (dev->real_num_tx_queues != 1) { 448 const struct net_device_ops *ops = dev->netdev_ops; 449 if (ops->ndo_select_queue) 450 queue_index = ops->ndo_select_queue(dev, skb, accel_priv, 451 __netdev_pick_tx); 452 else 453 queue_index = __netdev_pick_tx(dev, skb); 454 455 if (!accel_priv) 456 queue_index = netdev_cap_txqueue(dev, queue_index); 457 } 458 459 skb_set_queue_mapping(skb, queue_index); 460 return netdev_get_tx_queue(dev, queue_index); 461 } 462