/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Version:	$Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <Alan.Cox@linux.org>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
231da177e4SLinus Torvalds */ 241da177e4SLinus Torvalds 2589cee8b1SHerbert Xu #include <linux/compiler.h> 261da177e4SLinus Torvalds #include <linux/module.h> 271da177e4SLinus Torvalds #include <linux/types.h> 281da177e4SLinus Torvalds #include <linux/mm.h> 291da177e4SLinus Torvalds #include <linux/jiffies.h> 301da177e4SLinus Torvalds #include <linux/skbuff.h> 311da177e4SLinus Torvalds #include <linux/list.h> 321da177e4SLinus Torvalds #include <linux/ip.h> 331da177e4SLinus Torvalds #include <linux/icmp.h> 341da177e4SLinus Torvalds #include <linux/netdevice.h> 351da177e4SLinus Torvalds #include <linux/jhash.h> 361da177e4SLinus Torvalds #include <linux/random.h> 371da177e4SLinus Torvalds #include <net/sock.h> 381da177e4SLinus Torvalds #include <net/ip.h> 391da177e4SLinus Torvalds #include <net/icmp.h> 401da177e4SLinus Torvalds #include <net/checksum.h> 4189cee8b1SHerbert Xu #include <net/inetpeer.h> 425ab11c98SPavel Emelyanov #include <net/inet_frag.h> 431da177e4SLinus Torvalds #include <linux/tcp.h> 441da177e4SLinus Torvalds #include <linux/udp.h> 451da177e4SLinus Torvalds #include <linux/inet.h> 461da177e4SLinus Torvalds #include <linux/netfilter_ipv4.h> 471da177e4SLinus Torvalds 481da177e4SLinus Torvalds /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 491da177e4SLinus Torvalds * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c 501da177e4SLinus Torvalds * as well. Or notify me, at least. 
--ANK 511da177e4SLinus Torvalds */ 521da177e4SLinus Torvalds 53ab32ea5dSBrian Haley int sysctl_ipfrag_max_dist __read_mostly = 64; 5489cee8b1SHerbert Xu 551da177e4SLinus Torvalds struct ipfrag_skb_cb 561da177e4SLinus Torvalds { 571da177e4SLinus Torvalds struct inet_skb_parm h; 581da177e4SLinus Torvalds int offset; 591da177e4SLinus Torvalds }; 601da177e4SLinus Torvalds 611da177e4SLinus Torvalds #define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb)) 621da177e4SLinus Torvalds 631da177e4SLinus Torvalds /* Describe an entry in the "incomplete datagrams" queue. */ 641da177e4SLinus Torvalds struct ipq { 655ab11c98SPavel Emelyanov struct inet_frag_queue q; 665ab11c98SPavel Emelyanov 671da177e4SLinus Torvalds u32 user; 6818277770SAl Viro __be32 saddr; 6918277770SAl Viro __be32 daddr; 7018277770SAl Viro __be16 id; 711da177e4SLinus Torvalds u8 protocol; 7289cee8b1SHerbert Xu int iif; 7389cee8b1SHerbert Xu unsigned int rid; 7489cee8b1SHerbert Xu struct inet_peer *peer; 751da177e4SLinus Torvalds }; 761da177e4SLinus Torvalds 7704128f23SPavel Emelyanov struct inet_frags_ctl ip4_frags_ctl __read_mostly = { 7804128f23SPavel Emelyanov /* 7904128f23SPavel Emelyanov * Fragment cache limits. We will commit 256K at one time. Should we 8004128f23SPavel Emelyanov * cross that limit we will prune down to 192K. This should cope with 8104128f23SPavel Emelyanov * even the most extreme cases without allowing an attacker to 8204128f23SPavel Emelyanov * measurably harm machine performance. 8304128f23SPavel Emelyanov */ 8404128f23SPavel Emelyanov .high_thresh = 256 * 1024, 8504128f23SPavel Emelyanov .low_thresh = 192 * 1024, 8604128f23SPavel Emelyanov 8704128f23SPavel Emelyanov /* 8804128f23SPavel Emelyanov * Important NOTE! Fragment queue must be destroyed before MSL expires. 8904128f23SPavel Emelyanov * RFC791 is wrong proposing to prolongate timer each fragment arrival 9004128f23SPavel Emelyanov * by TTL. 
9104128f23SPavel Emelyanov */ 9204128f23SPavel Emelyanov .timeout = IP_FRAG_TIME, 9304128f23SPavel Emelyanov .secret_interval = 10 * 60 * HZ, 9404128f23SPavel Emelyanov }; 9504128f23SPavel Emelyanov 967eb95156SPavel Emelyanov static struct inet_frags ip4_frags; 971da177e4SLinus Torvalds 987eb95156SPavel Emelyanov int ip_frag_nqueues(void) 997eb95156SPavel Emelyanov { 1007eb95156SPavel Emelyanov return ip4_frags.nqueues; 1017eb95156SPavel Emelyanov } 1021da177e4SLinus Torvalds 1037eb95156SPavel Emelyanov int ip_frag_mem(void) 1047eb95156SPavel Emelyanov { 1057eb95156SPavel Emelyanov return atomic_read(&ip4_frags.mem); 1067eb95156SPavel Emelyanov } 1071da177e4SLinus Torvalds 1081706d587SHerbert Xu static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, 1091706d587SHerbert Xu struct net_device *dev); 1101706d587SHerbert Xu 11118277770SAl Viro static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) 1121da177e4SLinus Torvalds { 11318277770SAl Viro return jhash_3words((__force u32)id << 16 | prot, 11418277770SAl Viro (__force u32)saddr, (__force u32)daddr, 1157eb95156SPavel Emelyanov ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); 1161da177e4SLinus Torvalds } 1171da177e4SLinus Torvalds 118321a3a99SPavel Emelyanov static unsigned int ip4_hashfn(struct inet_frag_queue *q) 1191da177e4SLinus Torvalds { 120321a3a99SPavel Emelyanov struct ipq *ipq; 1211da177e4SLinus Torvalds 122321a3a99SPavel Emelyanov ipq = container_of(q, struct ipq, q); 123321a3a99SPavel Emelyanov return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); 1241da177e4SLinus Torvalds } 1251da177e4SLinus Torvalds 1261da177e4SLinus Torvalds /* Memory Tracking Functions. 
*/ 1271da177e4SLinus Torvalds static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work) 1281da177e4SLinus Torvalds { 1291da177e4SLinus Torvalds if (work) 1301da177e4SLinus Torvalds *work -= skb->truesize; 1317eb95156SPavel Emelyanov atomic_sub(skb->truesize, &ip4_frags.mem); 1321da177e4SLinus Torvalds kfree_skb(skb); 1331da177e4SLinus Torvalds } 1341da177e4SLinus Torvalds 1351e4b8287SPavel Emelyanov static __inline__ void ip4_frag_free(struct inet_frag_queue *q) 1361da177e4SLinus Torvalds { 1371e4b8287SPavel Emelyanov struct ipq *qp; 1381e4b8287SPavel Emelyanov 1391e4b8287SPavel Emelyanov qp = container_of(q, struct ipq, q); 1401e4b8287SPavel Emelyanov if (qp->peer) 1411e4b8287SPavel Emelyanov inet_putpeer(qp->peer); 1421da177e4SLinus Torvalds kfree(qp); 1431da177e4SLinus Torvalds } 1441da177e4SLinus Torvalds 1451da177e4SLinus Torvalds static __inline__ struct ipq *frag_alloc_queue(void) 1461da177e4SLinus Torvalds { 1471da177e4SLinus Torvalds struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); 1481da177e4SLinus Torvalds 1491da177e4SLinus Torvalds if (!qp) 1501da177e4SLinus Torvalds return NULL; 1517eb95156SPavel Emelyanov atomic_add(sizeof(struct ipq), &ip4_frags.mem); 1521da177e4SLinus Torvalds return qp; 1531da177e4SLinus Torvalds } 1541da177e4SLinus Torvalds 1551da177e4SLinus Torvalds 1561da177e4SLinus Torvalds /* Destruction primitives. */ 1571da177e4SLinus Torvalds 158*4b6cb5d8SPavel Emelyanov static __inline__ void ipq_put(struct ipq *ipq) 1591da177e4SLinus Torvalds { 1605ab11c98SPavel Emelyanov if (atomic_dec_and_test(&ipq->q.refcnt)) 161*4b6cb5d8SPavel Emelyanov inet_frag_destroy(&ipq->q, &ip4_frags, NULL); 1621da177e4SLinus Torvalds } 1631da177e4SLinus Torvalds 1641da177e4SLinus Torvalds /* Kill ipq entry. It is not destroyed immediately, 1651da177e4SLinus Torvalds * because caller (and someone more) holds reference count. 
1661da177e4SLinus Torvalds */ 1671da177e4SLinus Torvalds static void ipq_kill(struct ipq *ipq) 1681da177e4SLinus Torvalds { 169277e650dSPavel Emelyanov inet_frag_kill(&ipq->q, &ip4_frags); 1701da177e4SLinus Torvalds } 1711da177e4SLinus Torvalds 1721da177e4SLinus Torvalds /* Memory limiting on fragments. Evictor trashes the oldest 1731da177e4SLinus Torvalds * fragment queue until we are back under the threshold. 1741da177e4SLinus Torvalds */ 1751da177e4SLinus Torvalds static void ip_evictor(void) 1761da177e4SLinus Torvalds { 1778e7999c4SPavel Emelyanov int evicted; 1781da177e4SLinus Torvalds 1798e7999c4SPavel Emelyanov evicted = inet_frag_evictor(&ip4_frags); 1808e7999c4SPavel Emelyanov if (evicted) 1818e7999c4SPavel Emelyanov IP_ADD_STATS_BH(IPSTATS_MIB_REASMFAILS, evicted); 1821da177e4SLinus Torvalds } 1831da177e4SLinus Torvalds 1841da177e4SLinus Torvalds /* 1851da177e4SLinus Torvalds * Oops, a fragment queue timed out. Kill it and send an ICMP reply. 1861da177e4SLinus Torvalds */ 1871da177e4SLinus Torvalds static void ip_expire(unsigned long arg) 1881da177e4SLinus Torvalds { 1891da177e4SLinus Torvalds struct ipq *qp = (struct ipq *) arg; 1901da177e4SLinus Torvalds 1915ab11c98SPavel Emelyanov spin_lock(&qp->q.lock); 1921da177e4SLinus Torvalds 1935ab11c98SPavel Emelyanov if (qp->q.last_in & COMPLETE) 1941da177e4SLinus Torvalds goto out; 1951da177e4SLinus Torvalds 1961da177e4SLinus Torvalds ipq_kill(qp); 1971da177e4SLinus Torvalds 1981da177e4SLinus Torvalds IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); 1991da177e4SLinus Torvalds IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); 2001da177e4SLinus Torvalds 2015ab11c98SPavel Emelyanov if ((qp->q.last_in&FIRST_IN) && qp->q.fragments != NULL) { 2025ab11c98SPavel Emelyanov struct sk_buff *head = qp->q.fragments; 2031da177e4SLinus Torvalds /* Send an ICMP "Fragment Reassembly Timeout" message. */ 204881d966bSEric W. 
Biederman if ((head->dev = dev_get_by_index(&init_net, qp->iif)) != NULL) { 2051da177e4SLinus Torvalds icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 2061da177e4SLinus Torvalds dev_put(head->dev); 2071da177e4SLinus Torvalds } 2081da177e4SLinus Torvalds } 2091da177e4SLinus Torvalds out: 2105ab11c98SPavel Emelyanov spin_unlock(&qp->q.lock); 211*4b6cb5d8SPavel Emelyanov ipq_put(qp); 2121da177e4SLinus Torvalds } 2131da177e4SLinus Torvalds 2141da177e4SLinus Torvalds /* Creation primitives. */ 2151da177e4SLinus Torvalds 21655c0022eSDavid S. Miller static struct ipq *ip_frag_intern(struct ipq *qp_in) 2171da177e4SLinus Torvalds { 2181da177e4SLinus Torvalds struct ipq *qp; 219e7c8a41eSYasuyuki Kozakai #ifdef CONFIG_SMP 220e7c8a41eSYasuyuki Kozakai struct hlist_node *n; 221e7c8a41eSYasuyuki Kozakai #endif 22255c0022eSDavid S. Miller unsigned int hash; 22355c0022eSDavid S. Miller 2247eb95156SPavel Emelyanov write_lock(&ip4_frags.lock); 22555c0022eSDavid S. Miller hash = ipqhashfn(qp_in->id, qp_in->saddr, qp_in->daddr, 22655c0022eSDavid S. Miller qp_in->protocol); 2271da177e4SLinus Torvalds #ifdef CONFIG_SMP 2281da177e4SLinus Torvalds /* With SMP race we have to recheck hash table, because 2291da177e4SLinus Torvalds * such entry could be created on other cpu, while we 2301da177e4SLinus Torvalds * promoted read lock to write lock. 
2311da177e4SLinus Torvalds */ 2327eb95156SPavel Emelyanov hlist_for_each_entry(qp, n, &ip4_frags.hash[hash], q.list) { 2331da177e4SLinus Torvalds if (qp->id == qp_in->id && 2341da177e4SLinus Torvalds qp->saddr == qp_in->saddr && 2351da177e4SLinus Torvalds qp->daddr == qp_in->daddr && 2361da177e4SLinus Torvalds qp->protocol == qp_in->protocol && 2371da177e4SLinus Torvalds qp->user == qp_in->user) { 2385ab11c98SPavel Emelyanov atomic_inc(&qp->q.refcnt); 2397eb95156SPavel Emelyanov write_unlock(&ip4_frags.lock); 2405ab11c98SPavel Emelyanov qp_in->q.last_in |= COMPLETE; 241*4b6cb5d8SPavel Emelyanov ipq_put(qp_in); 2421da177e4SLinus Torvalds return qp; 2431da177e4SLinus Torvalds } 2441da177e4SLinus Torvalds } 2451da177e4SLinus Torvalds #endif 2461da177e4SLinus Torvalds qp = qp_in; 2471da177e4SLinus Torvalds 24804128f23SPavel Emelyanov if (!mod_timer(&qp->q.timer, jiffies + ip4_frags_ctl.timeout)) 2495ab11c98SPavel Emelyanov atomic_inc(&qp->q.refcnt); 2501da177e4SLinus Torvalds 2515ab11c98SPavel Emelyanov atomic_inc(&qp->q.refcnt); 2527eb95156SPavel Emelyanov hlist_add_head(&qp->q.list, &ip4_frags.hash[hash]); 2535ab11c98SPavel Emelyanov INIT_LIST_HEAD(&qp->q.lru_list); 2547eb95156SPavel Emelyanov list_add_tail(&qp->q.lru_list, &ip4_frags.lru_list); 2557eb95156SPavel Emelyanov ip4_frags.nqueues++; 2567eb95156SPavel Emelyanov write_unlock(&ip4_frags.lock); 2571da177e4SLinus Torvalds return qp; 2581da177e4SLinus Torvalds } 2591da177e4SLinus Torvalds 2601da177e4SLinus Torvalds /* Add an entry to the 'ipq' queue for a newly received IP datagram. */ 26155c0022eSDavid S. 
Miller static struct ipq *ip_frag_create(struct iphdr *iph, u32 user) 2621da177e4SLinus Torvalds { 2631da177e4SLinus Torvalds struct ipq *qp; 2641da177e4SLinus Torvalds 2651da177e4SLinus Torvalds if ((qp = frag_alloc_queue()) == NULL) 2661da177e4SLinus Torvalds goto out_nomem; 2671da177e4SLinus Torvalds 2681da177e4SLinus Torvalds qp->protocol = iph->protocol; 2695ab11c98SPavel Emelyanov qp->q.last_in = 0; 2701da177e4SLinus Torvalds qp->id = iph->id; 2711da177e4SLinus Torvalds qp->saddr = iph->saddr; 2721da177e4SLinus Torvalds qp->daddr = iph->daddr; 2731da177e4SLinus Torvalds qp->user = user; 2745ab11c98SPavel Emelyanov qp->q.len = 0; 2755ab11c98SPavel Emelyanov qp->q.meat = 0; 2765ab11c98SPavel Emelyanov qp->q.fragments = NULL; 2771da177e4SLinus Torvalds qp->iif = 0; 27889cee8b1SHerbert Xu qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL; 2791da177e4SLinus Torvalds 2801da177e4SLinus Torvalds /* Initialize a timer for this entry. */ 2815ab11c98SPavel Emelyanov init_timer(&qp->q.timer); 2825ab11c98SPavel Emelyanov qp->q.timer.data = (unsigned long) qp; /* pointer to queue */ 2835ab11c98SPavel Emelyanov qp->q.timer.function = ip_expire; /* expire function */ 2845ab11c98SPavel Emelyanov spin_lock_init(&qp->q.lock); 2855ab11c98SPavel Emelyanov atomic_set(&qp->q.refcnt, 1); 2861da177e4SLinus Torvalds 28755c0022eSDavid S. Miller return ip_frag_intern(qp); 2881da177e4SLinus Torvalds 2891da177e4SLinus Torvalds out_nomem: 29064ce2073SPatrick McHardy LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); 2911da177e4SLinus Torvalds return NULL; 2921da177e4SLinus Torvalds } 2931da177e4SLinus Torvalds 2941da177e4SLinus Torvalds /* Find the correct entry in the "incomplete datagrams" queue for 2951da177e4SLinus Torvalds * this IP datagram, and create new one, if nothing is found. 
2961da177e4SLinus Torvalds */ 2971da177e4SLinus Torvalds static inline struct ipq *ip_find(struct iphdr *iph, u32 user) 2981da177e4SLinus Torvalds { 29976ab608dSAlexey Dobriyan __be16 id = iph->id; 30018277770SAl Viro __be32 saddr = iph->saddr; 30118277770SAl Viro __be32 daddr = iph->daddr; 3021da177e4SLinus Torvalds __u8 protocol = iph->protocol; 30355c0022eSDavid S. Miller unsigned int hash; 3041da177e4SLinus Torvalds struct ipq *qp; 305e7c8a41eSYasuyuki Kozakai struct hlist_node *n; 3061da177e4SLinus Torvalds 3077eb95156SPavel Emelyanov read_lock(&ip4_frags.lock); 30855c0022eSDavid S. Miller hash = ipqhashfn(id, saddr, daddr, protocol); 3097eb95156SPavel Emelyanov hlist_for_each_entry(qp, n, &ip4_frags.hash[hash], q.list) { 3101da177e4SLinus Torvalds if (qp->id == id && 3111da177e4SLinus Torvalds qp->saddr == saddr && 3121da177e4SLinus Torvalds qp->daddr == daddr && 3131da177e4SLinus Torvalds qp->protocol == protocol && 3141da177e4SLinus Torvalds qp->user == user) { 3155ab11c98SPavel Emelyanov atomic_inc(&qp->q.refcnt); 3167eb95156SPavel Emelyanov read_unlock(&ip4_frags.lock); 3171da177e4SLinus Torvalds return qp; 3181da177e4SLinus Torvalds } 3191da177e4SLinus Torvalds } 3207eb95156SPavel Emelyanov read_unlock(&ip4_frags.lock); 3211da177e4SLinus Torvalds 32255c0022eSDavid S. Miller return ip_frag_create(iph, user); 3231da177e4SLinus Torvalds } 3241da177e4SLinus Torvalds 32589cee8b1SHerbert Xu /* Is the fragment too far ahead to be part of ipq? 
*/ 32689cee8b1SHerbert Xu static inline int ip_frag_too_far(struct ipq *qp) 32789cee8b1SHerbert Xu { 32889cee8b1SHerbert Xu struct inet_peer *peer = qp->peer; 32989cee8b1SHerbert Xu unsigned int max = sysctl_ipfrag_max_dist; 33089cee8b1SHerbert Xu unsigned int start, end; 33189cee8b1SHerbert Xu 33289cee8b1SHerbert Xu int rc; 33389cee8b1SHerbert Xu 33489cee8b1SHerbert Xu if (!peer || !max) 33589cee8b1SHerbert Xu return 0; 33689cee8b1SHerbert Xu 33789cee8b1SHerbert Xu start = qp->rid; 33889cee8b1SHerbert Xu end = atomic_inc_return(&peer->rid); 33989cee8b1SHerbert Xu qp->rid = end; 34089cee8b1SHerbert Xu 3415ab11c98SPavel Emelyanov rc = qp->q.fragments && (end - start) > max; 34289cee8b1SHerbert Xu 34389cee8b1SHerbert Xu if (rc) { 34489cee8b1SHerbert Xu IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); 34589cee8b1SHerbert Xu } 34689cee8b1SHerbert Xu 34789cee8b1SHerbert Xu return rc; 34889cee8b1SHerbert Xu } 34989cee8b1SHerbert Xu 35089cee8b1SHerbert Xu static int ip_frag_reinit(struct ipq *qp) 35189cee8b1SHerbert Xu { 35289cee8b1SHerbert Xu struct sk_buff *fp; 35389cee8b1SHerbert Xu 35404128f23SPavel Emelyanov if (!mod_timer(&qp->q.timer, jiffies + ip4_frags_ctl.timeout)) { 3555ab11c98SPavel Emelyanov atomic_inc(&qp->q.refcnt); 35689cee8b1SHerbert Xu return -ETIMEDOUT; 35789cee8b1SHerbert Xu } 35889cee8b1SHerbert Xu 3595ab11c98SPavel Emelyanov fp = qp->q.fragments; 36089cee8b1SHerbert Xu do { 36189cee8b1SHerbert Xu struct sk_buff *xp = fp->next; 36289cee8b1SHerbert Xu frag_kfree_skb(fp, NULL); 36389cee8b1SHerbert Xu fp = xp; 36489cee8b1SHerbert Xu } while (fp); 36589cee8b1SHerbert Xu 3665ab11c98SPavel Emelyanov qp->q.last_in = 0; 3675ab11c98SPavel Emelyanov qp->q.len = 0; 3685ab11c98SPavel Emelyanov qp->q.meat = 0; 3695ab11c98SPavel Emelyanov qp->q.fragments = NULL; 37089cee8b1SHerbert Xu qp->iif = 0; 37189cee8b1SHerbert Xu 37289cee8b1SHerbert Xu return 0; 37389cee8b1SHerbert Xu } 37489cee8b1SHerbert Xu 3751da177e4SLinus Torvalds /* Add new segment to existing queue. 
*/
/*
 * Insert @skb into @qp's offset-ordered fragment list, trimming overlaps
 * with neighbouring fragments.
 *
 * Returns:
 *   the result of ip_frag_reasm() when the datagram became complete,
 *   -EINPROGRESS when the fragment was queued and more are needed,
 *   a negative errno (-ENOENT, -EINVAL, -ENOMEM, or the error from
 *   ip_frag_reinit()/pskb_trim_rcsum()) when the fragment was rejected;
 *   @skb is freed on every such error path.
 *
 * ip_defrag() calls this with qp->q.lock held.
 */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct sk_buff *prev, *next;
	struct net_device *dev;
	int flags, offset;
	int ihl, end;
	int err = -ENOENT;

	if (qp->q.last_in & COMPLETE)
		goto err;

	/* Queue looks stale: try to reset it, kill it if that fails. */
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
		ipq_kill(qp);
		goto err;
	}

	offset = ntohs(ip_hdr(skb)->frag_off);
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
	ihl = ip_hdrlen(skb);

	/* Determine the position of this fragment. */
	end = offset + skb->len - ihl;
	err = -EINVAL;

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrupted.
		 */
		if (end < qp->q.len ||
		    ((qp->q.last_in & LAST_IN) && end != qp->q.len))
			goto err;
		qp->q.last_in |= LAST_IN;
		qp->q.len = end;
	} else {
		/* Non-final fragments are truncated to a multiple of 8. */
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
		if (end > qp->q.len) {
			/* Some bits beyond end -> corruption. */
			if (qp->q.last_in & LAST_IN)
				goto err;
			qp->q.len = end;
		}
	}
	/* Reject fragments that carry no payload. */
	if (end == offset)
		goto err;

	err = -ENOMEM;
	if (pskb_pull(skb, ihl) == NULL)
		goto err;

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
	prev = NULL;
	for (next = qp->q.fragments; next != NULL; next = next->next) {
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
		int i = (FRAG_CB(prev)->offset + prev->len) - offset;

		if (i > 0) {
			offset += i;
			err = -EINVAL;
			if (end <= offset)
				goto err;
			err = -ENOMEM;
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

	err = -ENOMEM;

	while (next && FRAG_CB(next)->offset < end) {
		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
			FRAG_CB(next)->offset += i;
			qp->q.meat -= i;
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

			/* Old fragment is completely overridden with
			 * new one drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
				qp->q.fragments = next;

			qp->q.meat -= free_it->len;
			frag_kfree_skb(free_it, NULL);
		}
	}

	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (prev)
		prev->next = skb;
	else
		qp->q.fragments = skb;

	/* Remember the input interface by index only; the device pointer
	 * is cleared on the queued skb.
	 */
	dev = skb->dev;
	if (dev) {
		qp->iif = dev->ifindex;
		skb->dev = NULL;
	}
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
	atomic_add(skb->truesize, &ip4_frags.mem);
	if (offset == 0)
		qp->q.last_in |= FIRST_IN;

	/* First and last fragments seen and no holes left: reassemble. */
	if (qp->q.last_in == (FIRST_IN | LAST_IN) && qp->q.meat == qp->q.len)
		return ip_frag_reasm(qp, prev, dev);

	/* Still incomplete: move the queue to the tail of the LRU list. */
	write_lock(&ip4_frags.lock);
	list_move_tail(&qp->q.lru_list, &ip4_frags.lru_list);
	write_unlock(&ip4_frags.lock);
	return -EINPROGRESS;

err:
	kfree_skb(skb);
	return err;
}
Torvalds 5361da177e4SLinus Torvalds 5371da177e4SLinus Torvalds /* Build a new IP datagram from all its fragments. */ 5381da177e4SLinus Torvalds 5391706d587SHerbert Xu static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, 5401706d587SHerbert Xu struct net_device *dev) 5411da177e4SLinus Torvalds { 5421da177e4SLinus Torvalds struct iphdr *iph; 5435ab11c98SPavel Emelyanov struct sk_buff *fp, *head = qp->q.fragments; 5441da177e4SLinus Torvalds int len; 5451da177e4SLinus Torvalds int ihlen; 5461706d587SHerbert Xu int err; 5471da177e4SLinus Torvalds 5481da177e4SLinus Torvalds ipq_kill(qp); 5491da177e4SLinus Torvalds 5501706d587SHerbert Xu /* Make the one we just received the head. */ 5511706d587SHerbert Xu if (prev) { 5521706d587SHerbert Xu head = prev->next; 5531706d587SHerbert Xu fp = skb_clone(head, GFP_ATOMIC); 5541706d587SHerbert Xu 5551706d587SHerbert Xu if (!fp) 5561706d587SHerbert Xu goto out_nomem; 5571706d587SHerbert Xu 5581706d587SHerbert Xu fp->next = head->next; 5591706d587SHerbert Xu prev->next = fp; 5601706d587SHerbert Xu 5615ab11c98SPavel Emelyanov skb_morph(head, qp->q.fragments); 5625ab11c98SPavel Emelyanov head->next = qp->q.fragments->next; 5631706d587SHerbert Xu 5645ab11c98SPavel Emelyanov kfree_skb(qp->q.fragments); 5655ab11c98SPavel Emelyanov qp->q.fragments = head; 5661706d587SHerbert Xu } 5671706d587SHerbert Xu 5681da177e4SLinus Torvalds BUG_TRAP(head != NULL); 5691da177e4SLinus Torvalds BUG_TRAP(FRAG_CB(head)->offset == 0); 5701da177e4SLinus Torvalds 5711da177e4SLinus Torvalds /* Allocate a new buffer for the datagram. */ 572c9bdd4b5SArnaldo Carvalho de Melo ihlen = ip_hdrlen(head); 5735ab11c98SPavel Emelyanov len = ihlen + qp->q.len; 5741da177e4SLinus Torvalds 5751706d587SHerbert Xu err = -E2BIG; 5761da177e4SLinus Torvalds if (len > 65535) 5771da177e4SLinus Torvalds goto out_oversize; 5781da177e4SLinus Torvalds 5791da177e4SLinus Torvalds /* Head of list must not be cloned. 
*/ 5801706d587SHerbert Xu err = -ENOMEM; 5811da177e4SLinus Torvalds if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) 5821da177e4SLinus Torvalds goto out_nomem; 5831da177e4SLinus Torvalds 5841da177e4SLinus Torvalds /* If the first fragment is fragmented itself, we split 5851da177e4SLinus Torvalds * it to two chunks: the first with data and paged part 5861da177e4SLinus Torvalds * and the second, holding only fragments. */ 5871da177e4SLinus Torvalds if (skb_shinfo(head)->frag_list) { 5881da177e4SLinus Torvalds struct sk_buff *clone; 5891da177e4SLinus Torvalds int i, plen = 0; 5901da177e4SLinus Torvalds 5911da177e4SLinus Torvalds if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) 5921da177e4SLinus Torvalds goto out_nomem; 5931da177e4SLinus Torvalds clone->next = head->next; 5941da177e4SLinus Torvalds head->next = clone; 5951da177e4SLinus Torvalds skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; 5961da177e4SLinus Torvalds skb_shinfo(head)->frag_list = NULL; 5971da177e4SLinus Torvalds for (i=0; i<skb_shinfo(head)->nr_frags; i++) 5981da177e4SLinus Torvalds plen += skb_shinfo(head)->frags[i].size; 5991da177e4SLinus Torvalds clone->len = clone->data_len = head->data_len - plen; 6001da177e4SLinus Torvalds head->data_len -= clone->len; 6011da177e4SLinus Torvalds head->len -= clone->len; 6021da177e4SLinus Torvalds clone->csum = 0; 6031da177e4SLinus Torvalds clone->ip_summed = head->ip_summed; 6047eb95156SPavel Emelyanov atomic_add(clone->truesize, &ip4_frags.mem); 6051da177e4SLinus Torvalds } 6061da177e4SLinus Torvalds 6071da177e4SLinus Torvalds skb_shinfo(head)->frag_list = head->next; 608d56f90a7SArnaldo Carvalho de Melo skb_push(head, head->data - skb_network_header(head)); 6097eb95156SPavel Emelyanov atomic_sub(head->truesize, &ip4_frags.mem); 6101da177e4SLinus Torvalds 6111da177e4SLinus Torvalds for (fp=head->next; fp; fp = fp->next) { 6121da177e4SLinus Torvalds head->data_len += fp->len; 6131da177e4SLinus Torvalds head->len += fp->len; 
6141da177e4SLinus Torvalds if (head->ip_summed != fp->ip_summed) 6151da177e4SLinus Torvalds head->ip_summed = CHECKSUM_NONE; 61684fa7933SPatrick McHardy else if (head->ip_summed == CHECKSUM_COMPLETE) 6171da177e4SLinus Torvalds head->csum = csum_add(head->csum, fp->csum); 6181da177e4SLinus Torvalds head->truesize += fp->truesize; 6197eb95156SPavel Emelyanov atomic_sub(fp->truesize, &ip4_frags.mem); 6201da177e4SLinus Torvalds } 6211da177e4SLinus Torvalds 6221da177e4SLinus Torvalds head->next = NULL; 6231da177e4SLinus Torvalds head->dev = dev; 6245ab11c98SPavel Emelyanov head->tstamp = qp->q.stamp; 6251da177e4SLinus Torvalds 626eddc9ec5SArnaldo Carvalho de Melo iph = ip_hdr(head); 6271da177e4SLinus Torvalds iph->frag_off = 0; 6281da177e4SLinus Torvalds iph->tot_len = htons(len); 6291da177e4SLinus Torvalds IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); 6305ab11c98SPavel Emelyanov qp->q.fragments = NULL; 6311706d587SHerbert Xu return 0; 6321da177e4SLinus Torvalds 6331da177e4SLinus Torvalds out_nomem: 63464ce2073SPatrick McHardy LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " 63564ce2073SPatrick McHardy "queue %p\n", qp); 6361da177e4SLinus Torvalds goto out_fail; 6371da177e4SLinus Torvalds out_oversize: 6381da177e4SLinus Torvalds if (net_ratelimit()) 6391da177e4SLinus Torvalds printk(KERN_INFO 6401da177e4SLinus Torvalds "Oversized IP packet from %d.%d.%d.%d.\n", 6411da177e4SLinus Torvalds NIPQUAD(qp->saddr)); 6421da177e4SLinus Torvalds out_fail: 6431da177e4SLinus Torvalds IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); 6441706d587SHerbert Xu return err; 6451da177e4SLinus Torvalds } 6461da177e4SLinus Torvalds 6471da177e4SLinus Torvalds /* Process an incoming IP datagram fragment. 
*/ 648776c729eSHerbert Xu int ip_defrag(struct sk_buff *skb, u32 user) 6491da177e4SLinus Torvalds { 6501da177e4SLinus Torvalds struct ipq *qp; 6511da177e4SLinus Torvalds 6521da177e4SLinus Torvalds IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); 6531da177e4SLinus Torvalds 6541da177e4SLinus Torvalds /* Start by cleaning up the memory. */ 65504128f23SPavel Emelyanov if (atomic_read(&ip4_frags.mem) > ip4_frags_ctl.high_thresh) 6561da177e4SLinus Torvalds ip_evictor(); 6571da177e4SLinus Torvalds 6581da177e4SLinus Torvalds /* Lookup (or create) queue header */ 659eddc9ec5SArnaldo Carvalho de Melo if ((qp = ip_find(ip_hdr(skb), user)) != NULL) { 6601706d587SHerbert Xu int ret; 6611da177e4SLinus Torvalds 6625ab11c98SPavel Emelyanov spin_lock(&qp->q.lock); 6631da177e4SLinus Torvalds 6641706d587SHerbert Xu ret = ip_frag_queue(qp, skb); 6651da177e4SLinus Torvalds 6665ab11c98SPavel Emelyanov spin_unlock(&qp->q.lock); 667*4b6cb5d8SPavel Emelyanov ipq_put(qp); 668776c729eSHerbert Xu return ret; 6691da177e4SLinus Torvalds } 6701da177e4SLinus Torvalds 6711da177e4SLinus Torvalds IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); 6721da177e4SLinus Torvalds kfree_skb(skb); 673776c729eSHerbert Xu return -ENOMEM; 6741da177e4SLinus Torvalds } 6751da177e4SLinus Torvalds 676b7aa0bf7SEric Dumazet void __init ipfrag_init(void) 6771da177e4SLinus Torvalds { 67804128f23SPavel Emelyanov ip4_frags.ctl = &ip4_frags_ctl; 679321a3a99SPavel Emelyanov ip4_frags.hashfn = ip4_hashfn; 6801e4b8287SPavel Emelyanov ip4_frags.destructor = ip4_frag_free; 6811e4b8287SPavel Emelyanov ip4_frags.skb_free = NULL; 6821e4b8287SPavel Emelyanov ip4_frags.qsize = sizeof(struct ipq); 6837eb95156SPavel Emelyanov inet_frags_init(&ip4_frags); 6841da177e4SLinus Torvalds } 6851da177e4SLinus Torvalds 6861da177e4SLinus Torvalds EXPORT_SYMBOL(ip_defrag); 687