diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/Kconfig | 4 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 21 | ||||
-rw-r--r-- | net/ipv4/devinet.c | 3 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 1 | ||||
-rw-r--r-- | net/ipv4/fib_hash.c | 1 | ||||
-rw-r--r-- | net/ipv4/fib_lookup.h | 3 | ||||
-rw-r--r-- | net/ipv4/fib_rules.c | 4 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 3 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 9 | ||||
-rw-r--r-- | net/ipv4/ipconfig.c | 53 | ||||
-rw-r--r-- | net/ipv4/route.c | 60 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 35 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 10 |
13 files changed, 121 insertions, 86 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index b2cf91e4cca..5b919f7b45d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -407,8 +407,8 @@ config INET_XFRM_MODE_BEET If unsure, say Y. config INET_LRO - tristate "Large Receive Offload (ipv4/tcp)" - + bool "Large Receive Offload (ipv4/tcp)" + default y ---help--- Support for Large Receive Offload (ipv4/tcp). diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 170689681aa..5abee4c9744 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1246,13 +1246,20 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff **pp = NULL; struct sk_buff *p; struct iphdr *iph; + unsigned int hlen; + unsigned int off; + unsigned int id; int flush = 1; int proto; - int id; - iph = skb_gro_header(skb, sizeof(*iph)); - if (unlikely(!iph)) - goto out; + off = skb_gro_offset(skb); + hlen = off + sizeof(*iph); + iph = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + iph = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!iph)) + goto out; + } proto = iph->protocol & (MAX_INET_PROTOS - 1); @@ -1267,9 +1274,9 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto out_unlock; - flush = ntohs(iph->tot_len) != skb_gro_len(skb) || - iph->frag_off != htons(IP_DF); - id = ntohs(iph->id); + id = ntohl(*(u32 *)&iph->id); + flush = (u16)((ntohl(*(u32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); + id >>= 16; for (p = *head; p; p = p->next) { struct iphdr *iph2; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 126bb911880..3863c3a4223 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1347,7 +1347,8 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, struct net *net = ctl->extra2; if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { - rtnl_lock(); + if (!rtnl_trylock()) + return restart_syscall(); if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else if (*valp) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cafcc49d099..e2f95059256 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -40,7 +40,6 @@ #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> -#include <net/icmp.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index ded8c44fb84..ecd39454235 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -263,7 +263,6 @@ fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result err = fib_semantic_match(&f->fn_alias, flp, res, - f->fn_key, fz->fz_mask, fz->fz_order); if (err <= 0) goto out; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 2c1623d2768..637b133973b 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -22,8 +22,7 @@ struct fib_alias { /* Exported by fib_semantics.c */ extern int fib_semantic_match(struct list_head *head, const struct flowi *flp, - struct fib_result *res, __be32 zone, __be32 mask, - int prefixlen); + struct fib_result *res, int prefixlen); extern void fib_release_info(struct fib_info *); extern struct fib_info *fib_create_info(struct fib_config *cfg); extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 6080d712082..92d9d97ec5e 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -134,7 +134,7 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { }; static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, - struct nlmsghdr *nlh, struct fib_rule_hdr *frh, + struct fib_rule_hdr *frh, struct nlattr **tb) { struct net *net = sock_net(skb->sk); @@ -209,7 +209,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, } static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, - struct nlmsghdr *nlh, struct fib_rule_hdr *frh) + struct fib_rule_hdr *frh) { struct fib4_rule *rule4 = (struct fib4_rule *) rule; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f831df50090..9b096d6ff3f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -866,8 +866,7 @@ failure: /* Note! fib_semantic_match intentionally uses RCU list functions. */ int fib_semantic_match(struct list_head *head, const struct flowi *flp, - struct fib_result *res, __be32 zone, __be32 mask, - int prefixlen) + struct fib_result *res, int prefixlen) { struct fib_alias *fa; int nh_sel = 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ec0ae490f0b..538d2a9a511 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -986,9 +986,12 @@ fib_find_node(struct trie *t, u32 key) static struct node *trie_rebalance(struct trie *t, struct tnode *tn) { int wasfull; - t_key cindex, key = tn->key; + t_key cindex, key; struct tnode *tp; + preempt_disable(); + key = tn->key; + while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); @@ -1007,6 +1010,7 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) if (IS_TNODE(tn)) tn = (struct tnode *)resize(t, (struct tnode *)tn); + preempt_enable(); return (struct node *)tn; } @@ -1347,8 +1351,7 @@ static int check_leaf(struct trie *t, struct leaf *l, if (l->key != (key & ntohl(mask))) continue; - err = fib_semantic_match(&li->falh, flp, res, - htonl(l->key), mask, plen); + err = fib_semantic_match(&li->falh, flp, res, plen); #ifdef CONFIG_IP_FIB_TRIE_STATS if (err <= 0) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 90d22ae0a41..f8d04c25645 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -139,6 +139,8 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */ __be32 root_server_addr = NONE; /* Address of NFS server */ u8 root_server_path[256] = { 0, }; /* Path to mount as root */ +u32 ic_dev_xid; /* Device under configuration */ + /* vendor class identifier */ static char vendor_class_identifier[253] __initdata; @@ -158,6 +160,9 @@ static char user_dev_name[IFNAMSIZ] __initdata = { 0, }; /* Protocols supported by available interfaces */ static int ic_proto_have_if __initdata = 0; +/* MTU for boot device */ +static int ic_dev_mtu __initdata = 0; + #ifdef IPCONFIG_DYNAMIC static DEFINE_SPINLOCK(ic_recv_lock); static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */ @@ -284,7 +289,7 @@ set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) sin->sin_port = port; } -static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) +static int __init ic_devinet_ioctl(unsigned int cmd, struct ifreq *arg) { int res; @@ -295,6 +300,17 @@ static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) return res; } +static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = dev_ioctl(&init_net, cmd, (struct ifreq __user *) arg); + set_fs(oldfs); + return res; +} + static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) { int res; @@ -319,20 +335,31 @@ static int __init ic_setup_if(void) memset(&ir, 0, sizeof(ir)); strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); set_sockaddr(sin, ic_myaddr, 0); - if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) { + if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err); return -1; } set_sockaddr(sin, ic_netmask, 0); - if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) { + if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) { printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err); return -1; } set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); - if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { + if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err); return -1; } + /* Handle the case where we need non-standard MTU on the boot link (a network + * using jumbo frames, for instance). If we can't set the mtu, don't error + * out, we'll try to muddle along. + */ + if (ic_dev_mtu != 0) { + strcpy(ir.ifr_name, ic_dev->name); + ir.ifr_mtu = ic_dev_mtu; + if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) + printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n", + ic_dev_mtu, err); + } return 0; } @@ -621,6 +648,7 @@ ic_dhcp_init_options(u8 *options) 12, /* Host name */ 15, /* Domain name */ 17, /* Boot path */ + 26, /* MTU */ 40, /* NIS domain name */ }; @@ -796,6 +824,7 @@ static void __init ic_do_bootp_ext(u8 *ext) { u8 servers; int i; + u16 mtu; #ifdef IPCONFIG_DEBUG u8 *c; @@ -835,6 +864,10 @@ static void __init ic_do_bootp_ext(u8 *ext) if (!root_server_path[0]) ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); break; + case 26: /* Interface MTU */ + memcpy(&mtu, ext+1, sizeof(mtu)); + ic_dev_mtu = ntohs(mtu); + break; case 40: /* NIS Domain name (_not_ DNS) */ ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN); break; @@ -932,6 +965,13 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str goto drop_unlock; } + /* Is it a reply for the device we are configuring? */ + if (b->xid != ic_dev_xid) { + if (net_ratelimit()) + printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet \n"); + goto drop_unlock; + } + /* Parse extensions */ if (ext_len >= 4 && !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */ @@ -1115,6 +1155,9 @@ static int __init ic_dynamic(void) get_random_bytes(&timeout, sizeof(timeout)); timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); for (;;) { + /* Track the device we are configuring */ + ic_dev_xid = d->xid; + #ifdef IPCONFIG_BOOTP if (do_bootp && (d->able & IC_BOOTP)) ic_bootp_send_if(d, jiffies - start_jiffies); @@ -1391,6 +1434,8 @@ static int __init ip_auto_config(void) printk(",\n bootserver=%pI4", &ic_servaddr); printk(", rootserver=%pI4", &root_server_addr); printk(", rootpath=%s", root_server_path); + if (ic_dev_mtu) + printk(", mtu=%d", ic_dev_mtu); printk("\n"); #endif /* !SILENT */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c4c60e9f068..28205e5bfa9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -784,8 +784,8 @@ static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; - struct rtable *rth, **rthp; - unsigned long length = 0, samples = 0; + struct rtable *rth, *aux, **rthp; + unsigned long samples = 0; unsigned long sum = 0, sum2 = 0; u64 mult; @@ -795,9 +795,9 @@ static void rt_check_expire(void) goal = (unsigned int)mult; if (goal > rt_hash_mask) goal = rt_hash_mask + 1; - length = 0; for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; + unsigned long length; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; @@ -809,8 +809,10 @@ static void rt_check_expire(void) if (*rthp == NULL) continue; + length = 0; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { + prefetch(rth->u.dst.rt_next); if (rt_is_expired(rth)) { *rthp = rth->u.dst.rt_next; rt_free(rth); @@ -819,33 +821,30 @@ static void rt_check_expire(void) if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ if (time_before_eq(jiffies, rth->u.dst.expires)) { +nofree: tmo >>= 1; rthp = &rth->u.dst.rt_next; /* - * Only bump our length if the hash - * inputs on entries n and n+1 are not - * the same, we only count entries on + * We only count entries on * a chain with equal hash inputs once * so that entries for different QOS * levels, and other non-hash input * attributes don't unfairly skew * the length computation */ - if ((*rthp == NULL) || - !compare_hash_inputs(&(*rthp)->fl, - &rth->fl)) - length += ONE; + for (aux = rt_hash_table[i].chain;;) { + if (aux == rth) { + length += ONE; + break; + } + if (compare_hash_inputs(&aux->fl, &rth->fl)) + break; + aux = aux->u.dst.rt_next; + } continue; } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { - tmo >>= 1; - rthp = &rth->u.dst.rt_next; - if ((*rthp == NULL) || - !compare_hash_inputs(&(*rthp)->fl, - &rth->fl)) - length += ONE; - continue; - } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) + goto nofree; /* Cleanup aged off entries. */ *rthp = rth->u.dst.rt_next; @@ -1068,7 +1067,6 @@ out: return 0; static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) { struct rtable *rth, **rthp; - struct rtable *rthi; unsigned long now; struct rtable *cand, **candp; u32 min_score; @@ -1088,7 +1086,6 @@ restart: } rthp = &rt_hash_table[hash].chain; - rthi = NULL; spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { @@ -1134,17 +1131,6 @@ restart: chain_length++; rthp = &rth->u.dst.rt_next; - - /* - * check to see if the next entry in the chain - * contains the same hash input values as rt. If it does - * This is where we will insert into the list, instead of - * at the head. This groups entries that differ by aspects not - * relvant to the hash function together, which we use to adjust - * our chain length - */ - if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl)) - rthi = rth; } if (cand) { @@ -1205,10 +1191,7 @@ restart: } } - if (rthi) - rt->u.dst.rt_next = rthi->u.dst.rt_next; - else - rt->u.dst.rt_next = rt_hash_table[hash].chain; + rt->u.dst.rt_next = rt_hash_table[hash].chain; #if RT_CACHE_DEBUG >= 2 if (rt->u.dst.rt_next) { @@ -1224,10 +1207,7 @@ restart: * previous writes to rt are comitted to memory * before making rt visible to other CPUS. */ - if (rthi) - rcu_assign_pointer(rthi->u.dst.rt_next, rt); - else - rcu_assign_pointer(rt_hash_table[hash].chain, rt); + rcu_assign_pointer(rt_hash_table[hash].chain, rt); spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1d7f49c6f0c..0fb8b441f1f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1321,6 +1321,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct task_struct *user_recv = NULL; int copied_early = 0; struct sk_buff *skb; + u32 urg_hole = 0; lock_sock(sk); @@ -1532,7 +1533,8 @@ do_prequeue: } } } - if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { + if ((flags & MSG_PEEK) && + (peek_seq - copied - urg_hole != tp->copied_seq)) { if (net_ratelimit()) printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", current->comm, task_pid_nr(current)); @@ -1553,6 +1555,7 @@ do_prequeue: if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { ++*seq; + urg_hole++; offset++; used--; if (!used) @@ -2515,20 +2518,30 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) unsigned int thlen; unsigned int flags; unsigned int mss = 1; + unsigned int hlen; + unsigned int off; int flush = 1; int i; - th = skb_gro_header(skb, sizeof(*th)); - if (unlikely(!th)) - goto out; + off = skb_gro_offset(skb); + hlen = off + sizeof(*th); + th = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + goto out; + } thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; - th = skb_gro_header(skb, thlen); - if (unlikely(!th)) - goto out; + hlen = off + thlen; + if (skb_gro_header_hard(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + goto out; + } skb_gro_pull(skb, thlen); @@ -2541,7 +2554,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) th2 = tcp_hdr(p); - if ((th->source ^ th2->source) | (th->dest ^ th2->dest)) { + if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { NAPI_GRO_CB(p)->same_flow = 0; continue; } @@ -2556,14 +2569,14 @@ found: flush |= flags & TCP_FLAG_CWR; flush |= (flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH); - flush |= (th->ack_seq ^ th2->ack_seq) | (th->window ^ th2->window); - for (i = sizeof(*th); !flush && i < thlen; i += 4) + flush |= th->ack_seq ^ th2->ack_seq; + for (i = sizeof(*th); i < thlen; i += 4) flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); mss = skb_shinfo(p)->gso_size; - flush |= (len > mss) | !len; + flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); if (flush || skb_gro_receive(head, skb)) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 56dcf97a97f..eeb8a92aa41 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -597,16 +597,6 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_grow_window(sk, skb); } -static u32 tcp_rto_min(struct sock *sk) -{ - struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = TCP_RTO_MIN; - - if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) - rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); - return rto_min; -} - /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge |