about | summary | refs | log | tree | commit | diff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/Kconfig | 4
-rw-r--r-- net/ipv4/af_inet.c | 21
-rw-r--r-- net/ipv4/devinet.c | 3
-rw-r--r-- net/ipv4/fib_frontend.c | 1
-rw-r--r-- net/ipv4/fib_hash.c | 1
-rw-r--r-- net/ipv4/fib_lookup.h | 3
-rw-r--r-- net/ipv4/fib_rules.c | 4
-rw-r--r-- net/ipv4/fib_semantics.c | 3
-rw-r--r-- net/ipv4/fib_trie.c | 9
-rw-r--r-- net/ipv4/ipconfig.c | 53
-rw-r--r-- net/ipv4/route.c | 60
-rw-r--r-- net/ipv4/tcp.c | 35
-rw-r--r-- net/ipv4/tcp_input.c | 10
13 files changed, 121 insertions, 86 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index b2cf91e4cca..5b919f7b45d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -407,8 +407,8 @@ config INET_XFRM_MODE_BEET
If unsure, say Y.
config INET_LRO
- tristate "Large Receive Offload (ipv4/tcp)"
-
+ bool "Large Receive Offload (ipv4/tcp)"
+ default y
---help---
Support for Large Receive Offload (ipv4/tcp).
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 170689681aa..5abee4c9744 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1246,13 +1246,20 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
struct sk_buff **pp = NULL;
struct sk_buff *p;
struct iphdr *iph;
+ unsigned int hlen;
+ unsigned int off;
+ unsigned int id;
int flush = 1;
int proto;
- int id;
- iph = skb_gro_header(skb, sizeof(*iph));
- if (unlikely(!iph))
- goto out;
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*iph);
+ iph = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ iph = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
+ }
proto = iph->protocol & (MAX_INET_PROTOS - 1);
@@ -1267,9 +1274,9 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto out_unlock;
- flush = ntohs(iph->tot_len) != skb_gro_len(skb) ||
- iph->frag_off != htons(IP_DF);
- id = ntohs(iph->id);
+ id = ntohl(*(u32 *)&iph->id);
+ flush = (u16)((ntohl(*(u32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+ id >>= 16;
for (p = *head; p; p = p->next) {
struct iphdr *iph2;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 126bb911880..3863c3a4223 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1347,7 +1347,8 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
struct net *net = ctl->extra2;
if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
- rtnl_lock();
+ if (!rtnl_trylock())
+ return restart_syscall();
if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
inet_forward_change(net);
} else if (*valp) {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cafcc49d099..e2f95059256 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -40,7 +40,6 @@
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
-#include <net/icmp.h>
#include <net/arp.h>
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index ded8c44fb84..ecd39454235 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -263,7 +263,6 @@ fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
err = fib_semantic_match(&f->fn_alias,
flp, res,
- f->fn_key, fz->fz_mask,
fz->fz_order);
if (err <= 0)
goto out;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 2c1623d2768..637b133973b 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -22,8 +22,7 @@ struct fib_alias {
/* Exported by fib_semantics.c */
extern int fib_semantic_match(struct list_head *head,
const struct flowi *flp,
- struct fib_result *res, __be32 zone, __be32 mask,
- int prefixlen);
+ struct fib_result *res, int prefixlen);
extern void fib_release_info(struct fib_info *);
extern struct fib_info *fib_create_info(struct fib_config *cfg);
extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 6080d712082..92d9d97ec5e 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -134,7 +134,7 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
};
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_rule_hdr *frh,
+ struct fib_rule_hdr *frh,
struct nlattr **tb)
{
struct net *net = sock_net(skb->sk);
@@ -209,7 +209,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
}
static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_rule_hdr *frh)
+ struct fib_rule_hdr *frh)
{
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f831df50090..9b096d6ff3f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -866,8 +866,7 @@ failure:
/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
- struct fib_result *res, __be32 zone, __be32 mask,
- int prefixlen)
+ struct fib_result *res, int prefixlen)
{
struct fib_alias *fa;
int nh_sel = 0;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ec0ae490f0b..538d2a9a511 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -986,9 +986,12 @@ fib_find_node(struct trie *t, u32 key)
static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
{
int wasfull;
- t_key cindex, key = tn->key;
+ t_key cindex, key;
struct tnode *tp;
+ preempt_disable();
+ key = tn->key;
+
while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
@@ -1007,6 +1010,7 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
if (IS_TNODE(tn))
tn = (struct tnode *)resize(t, (struct tnode *)tn);
+ preempt_enable();
return (struct node *)tn;
}
@@ -1347,8 +1351,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
if (l->key != (key & ntohl(mask)))
continue;
- err = fib_semantic_match(&li->falh, flp, res,
- htonl(l->key), mask, plen);
+ err = fib_semantic_match(&li->falh, flp, res, plen);
#ifdef CONFIG_IP_FIB_TRIE_STATS
if (err <= 0)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 90d22ae0a41..f8d04c25645 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -139,6 +139,8 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */
__be32 root_server_addr = NONE; /* Address of NFS server */
u8 root_server_path[256] = { 0, }; /* Path to mount as root */
+u32 ic_dev_xid; /* Device under configuration */
+
/* vendor class identifier */
static char vendor_class_identifier[253] __initdata;
@@ -158,6 +160,9 @@ static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
/* Protocols supported by available interfaces */
static int ic_proto_have_if __initdata = 0;
+/* MTU for boot device */
+static int ic_dev_mtu __initdata = 0;
+
#ifdef IPCONFIG_DYNAMIC
static DEFINE_SPINLOCK(ic_recv_lock);
static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */
@@ -284,7 +289,7 @@ set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
sin->sin_port = port;
}
-static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
+static int __init ic_devinet_ioctl(unsigned int cmd, struct ifreq *arg)
{
int res;
@@ -295,6 +300,17 @@ static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
return res;
}
+static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = dev_ioctl(&init_net, cmd, (struct ifreq __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
{
int res;
@@ -319,20 +335,31 @@ static int __init ic_setup_if(void)
memset(&ir, 0, sizeof(ir));
strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
set_sockaddr(sin, ic_myaddr, 0);
- if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) {
+ if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
return -1;
}
set_sockaddr(sin, ic_netmask, 0);
- if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
+ if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
return -1;
}
set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
- if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
+ if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
return -1;
}
+ /* Handle the case where we need non-standard MTU on the boot link (a network
+ * using jumbo frames, for instance). If we can't set the mtu, don't error
+ * out, we'll try to muddle along.
+ */
+ if (ic_dev_mtu != 0) {
+ strcpy(ir.ifr_name, ic_dev->name);
+ ir.ifr_mtu = ic_dev_mtu;
+ if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
+ printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n",
+ ic_dev_mtu, err);
+ }
return 0;
}
@@ -621,6 +648,7 @@ ic_dhcp_init_options(u8 *options)
12, /* Host name */
15, /* Domain name */
17, /* Boot path */
+ 26, /* MTU */
40, /* NIS domain name */
};
@@ -796,6 +824,7 @@ static void __init ic_do_bootp_ext(u8 *ext)
{
u8 servers;
int i;
+ u16 mtu;
#ifdef IPCONFIG_DEBUG
u8 *c;
@@ -835,6 +864,10 @@ static void __init ic_do_bootp_ext(u8 *ext)
if (!root_server_path[0])
ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
break;
+ case 26: /* Interface MTU */
+ memcpy(&mtu, ext+1, sizeof(mtu));
+ ic_dev_mtu = ntohs(mtu);
+ break;
case 40: /* NIS Domain name (_not_ DNS) */
ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN);
break;
@@ -932,6 +965,13 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
goto drop_unlock;
}
+ /* Is it a reply for the device we are configuring? */
+ if (b->xid != ic_dev_xid) {
+ if (net_ratelimit())
+ printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet \n");
+ goto drop_unlock;
+ }
+
/* Parse extensions */
if (ext_len >= 4 &&
!memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
@@ -1115,6 +1155,9 @@ static int __init ic_dynamic(void)
get_random_bytes(&timeout, sizeof(timeout));
timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
for (;;) {
+ /* Track the device we are configuring */
+ ic_dev_xid = d->xid;
+
#ifdef IPCONFIG_BOOTP
if (do_bootp && (d->able & IC_BOOTP))
ic_bootp_send_if(d, jiffies - start_jiffies);
@@ -1391,6 +1434,8 @@ static int __init ip_auto_config(void)
printk(",\n bootserver=%pI4", &ic_servaddr);
printk(", rootserver=%pI4", &root_server_addr);
printk(", rootpath=%s", root_server_path);
+ if (ic_dev_mtu)
+ printk(", mtu=%d", ic_dev_mtu);
printk("\n");
#endif /* !SILENT */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c4c60e9f068..28205e5bfa9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -784,8 +784,8 @@ static void rt_check_expire(void)
{
static unsigned int rover;
unsigned int i = rover, goal;
- struct rtable *rth, **rthp;
- unsigned long length = 0, samples = 0;
+ struct rtable *rth, *aux, **rthp;
+ unsigned long samples = 0;
unsigned long sum = 0, sum2 = 0;
u64 mult;
@@ -795,9 +795,9 @@ static void rt_check_expire(void)
goal = (unsigned int)mult;
if (goal > rt_hash_mask)
goal = rt_hash_mask + 1;
- length = 0;
for (; goal > 0; goal--) {
unsigned long tmo = ip_rt_gc_timeout;
+ unsigned long length;
i = (i + 1) & rt_hash_mask;
rthp = &rt_hash_table[i].chain;
@@ -809,8 +809,10 @@ static void rt_check_expire(void)
if (*rthp == NULL)
continue;
+ length = 0;
spin_lock_bh(rt_hash_lock_addr(i));
while ((rth = *rthp) != NULL) {
+ prefetch(rth->u.dst.rt_next);
if (rt_is_expired(rth)) {
*rthp = rth->u.dst.rt_next;
rt_free(rth);
@@ -819,33 +821,30 @@ static void rt_check_expire(void)
if (rth->u.dst.expires) {
/* Entry is expired even if it is in use */
if (time_before_eq(jiffies, rth->u.dst.expires)) {
+nofree:
tmo >>= 1;
rthp = &rth->u.dst.rt_next;
/*
- * Only bump our length if the hash
- * inputs on entries n and n+1 are not
- * the same, we only count entries on
+ * We only count entries on
* a chain with equal hash inputs once
* so that entries for different QOS
* levels, and other non-hash input
* attributes don't unfairly skew
* the length computation
*/
- if ((*rthp == NULL) ||
- !compare_hash_inputs(&(*rthp)->fl,
- &rth->fl))
- length += ONE;
+ for (aux = rt_hash_table[i].chain;;) {
+ if (aux == rth) {
+ length += ONE;
+ break;
+ }
+ if (compare_hash_inputs(&aux->fl, &rth->fl))
+ break;
+ aux = aux->u.dst.rt_next;
+ }
continue;
}
- } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
- tmo >>= 1;
- rthp = &rth->u.dst.rt_next;
- if ((*rthp == NULL) ||
- !compare_hash_inputs(&(*rthp)->fl,
- &rth->fl))
- length += ONE;
- continue;
- }
+ } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+ goto nofree;
/* Cleanup aged off entries. */
*rthp = rth->u.dst.rt_next;
@@ -1068,7 +1067,6 @@ out: return 0;
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
struct rtable *rth, **rthp;
- struct rtable *rthi;
unsigned long now;
struct rtable *cand, **candp;
u32 min_score;
@@ -1088,7 +1086,6 @@ restart:
}
rthp = &rt_hash_table[hash].chain;
- rthi = NULL;
spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) {
@@ -1134,17 +1131,6 @@ restart:
chain_length++;
rthp = &rth->u.dst.rt_next;
-
- /*
- * check to see if the next entry in the chain
- * contains the same hash input values as rt. If it does
- * This is where we will insert into the list, instead of
- * at the head. This groups entries that differ by aspects not
- * relvant to the hash function together, which we use to adjust
- * our chain length
- */
- if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
- rthi = rth;
}
if (cand) {
@@ -1205,10 +1191,7 @@ restart:
}
}
- if (rthi)
- rt->u.dst.rt_next = rthi->u.dst.rt_next;
- else
- rt->u.dst.rt_next = rt_hash_table[hash].chain;
+ rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
if (rt->u.dst.rt_next) {
@@ -1224,10 +1207,7 @@ restart:
* previous writes to rt are comitted to memory
* before making rt visible to other CPUS.
*/
- if (rthi)
- rcu_assign_pointer(rthi->u.dst.rt_next, rt);
- else
- rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+ rcu_assign_pointer(rt_hash_table[hash].chain, rt);
spin_unlock_bh(rt_hash_lock_addr(hash));
*rp = rt;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1d7f49c6f0c..0fb8b441f1f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1321,6 +1321,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct task_struct *user_recv = NULL;
int copied_early = 0;
struct sk_buff *skb;
+ u32 urg_hole = 0;
lock_sock(sk);
@@ -1532,7 +1533,8 @@ do_prequeue:
}
}
}
- if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
+ if ((flags & MSG_PEEK) &&
+ (peek_seq - copied - urg_hole != tp->copied_seq)) {
if (net_ratelimit())
printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
current->comm, task_pid_nr(current));
@@ -1553,6 +1555,7 @@ do_prequeue:
if (!urg_offset) {
if (!sock_flag(sk, SOCK_URGINLINE)) {
++*seq;
+ urg_hole++;
offset++;
used--;
if (!used)
@@ -2515,20 +2518,30 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
unsigned int thlen;
unsigned int flags;
unsigned int mss = 1;
+ unsigned int hlen;
+ unsigned int off;
int flush = 1;
int i;
- th = skb_gro_header(skb, sizeof(*th));
- if (unlikely(!th))
- goto out;
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*th);
+ th = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
thlen = th->doff * 4;
if (thlen < sizeof(*th))
goto out;
- th = skb_gro_header(skb, thlen);
- if (unlikely(!th))
- goto out;
+ hlen = off + thlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
skb_gro_pull(skb, thlen);
@@ -2541,7 +2554,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
th2 = tcp_hdr(p);
- if ((th->source ^ th2->source) | (th->dest ^ th2->dest)) {
+ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
@@ -2556,14 +2569,14 @@ found:
flush |= flags & TCP_FLAG_CWR;
flush |= (flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
- flush |= (th->ack_seq ^ th2->ack_seq) | (th->window ^ th2->window);
- for (i = sizeof(*th); !flush && i < thlen; i += 4)
+ flush |= th->ack_seq ^ th2->ack_seq;
+ for (i = sizeof(*th); i < thlen; i += 4)
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
mss = skb_shinfo(p)->gso_size;
- flush |= (len > mss) | !len;
+ flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
if (flush || skb_gro_receive(head, skb)) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 56dcf97a97f..eeb8a92aa41 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -597,16 +597,6 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
tcp_grow_window(sk, skb);
}
-static u32 tcp_rto_min(struct sock *sk)
-{
- struct dst_entry *dst = __sk_dst_get(sk);
- u32 rto_min = TCP_RTO_MIN;
-
- if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
- rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
- return rto_min;
-}
-
/* Called to compute a smoothed rtt estimate. The data fed to this
* routine either comes from timestamps, or from segments that were
* known _not_ to have been retransmitted [see Karn/Partridge