From 0a64b4b811025ce0386ad84d81504e4ff7985856 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 19 May 2008 13:51:29 -0700 Subject: inet: Rename fragmentation sysctl-related functions/variables. The fragments sysctls also contains some, that are to be visible, but read-only in net namespaces. The naming in net/core/sysctl_net_core.c is - tables, that are to be registered in namespaces have a "ns" word in their names. So rename ones in ipv4/ip_fragment.c and ipv6/reassembly.c to fit this. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cd6ce6ac635..7f102eeb618 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -598,7 +598,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) #ifdef CONFIG_SYSCTL static int zero; -static struct ctl_table ip4_frags_ctl_table[] = { +static struct ctl_table ip4_frags_ns_ctl_table[] = { { .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, .procname = "ipfrag_high_thresh", @@ -644,14 +644,14 @@ static struct ctl_table ip4_frags_ctl_table[] = { { } }; -static int ip4_frags_ctl_register(struct net *net) +static int ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; - table = ip4_frags_ctl_table; + table = ip4_frags_ns_ctl_table; if (net != &init_net) { - table = kmemdup(table, sizeof(ip4_frags_ctl_table), GFP_KERNEL); + table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; @@ -676,7 +676,7 @@ err_alloc: return -ENOMEM; } -static void ip4_frags_ctl_unregister(struct net *net) +static void ip4_frags_ns_ctl_unregister(struct net *net) { struct ctl_table *table; @@ -685,12 +685,12 @@ static void ip4_frags_ctl_unregister(struct net *net) kfree(table); } #else -static inline int ip4_frags_ctl_register(struct net *net) +static inline int ip4_frags_ns_ctl_register(struct net *net) { return 0; } -static inline void ip4_frags_ctl_unregister(struct net *net) +static inline void ip4_frags_ns_ctl_unregister(struct net *net) { } #endif @@ -714,12 +714,12 @@ static int ipv4_frags_init_net(struct net *net) inet_frags_init_net(&net->ipv4.frags); - return ip4_frags_ctl_register(net); + return ip4_frags_ns_ctl_register(net); } static void ipv4_frags_exit_net(struct net *net) { - ip4_frags_ctl_unregister(net); + ip4_frags_ns_ctl_unregister(net); inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); } -- cgit v1.2.3 From 7d291ebb834278e30c211b26fb7076adcb636ad9 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 19 May 2008 13:53:02 -0700 Subject: inet: Register fragmentation some ctls at read-only root. Parts of fragments-related sysctls are read-only, but this is done by cloning all the tables and dropping write-bits from mode. Do the same but with read-only root. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/ip_fragment.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7f102eeb618..be1cb89a8d5 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -624,6 +624,10 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .proc_handler = &proc_dointvec_jiffies, .strategy = &sysctl_jiffies }, + { } +}; + +static struct ctl_table ip4_frags_ctl_table[] = { { .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, .procname = "ipfrag_secret_interval", @@ -658,8 +662,6 @@ static int ip4_frags_ns_ctl_register(struct net *net) table[0].data = &net->ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; table[2].data = &net->ipv4.frags.timeout; - table[3].mode &= ~0222; - table[4].mode &= ~0222; } hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); @@ -684,6 +686,11 @@ static void ip4_frags_ns_ctl_unregister(struct net *net) unregister_net_sysctl_table(net->ipv4.frags_hdr); kfree(table); } + +static void ip4_frags_ctl_register(void) +{ + register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); +} #else static inline int ip4_frags_ns_ctl_register(struct net *net) { @@ -693,6 +700,10 @@ static inline int ip4_frags_ns_ctl_register(struct net *net) static inline void ip4_frags_ns_ctl_unregister(struct net *net) { } + +static inline void ip4_frags_ctl_register(void) +{ +} #endif static int ipv4_frags_init_net(struct net *net) @@ -730,6 +741,7 @@ static struct pernet_operations ip4_frags_ops = { void __init ipfrag_init(void) { + ip4_frags_ctl_register(); register_pernet_subsys(&ip4_frags_ops); ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; -- cgit v1.2.3 From addd68eb6fe6f10017974947c90cc85ab2848cf2 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 May 2008 14:14:22 -0700 Subject: ipgre: Use on-device stats instead of private ones. Just switch from tunnel->stat to tunnel->dev->stats. The ip_tunnel->stat member itself will be removed after I fix its other users (very soon). Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2ada033406d..eede36e5570 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -617,6 +617,8 @@ static int ipgre_rcv(struct sk_buff *skb) read_lock(&ipgre_lock); if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr, key)) != NULL) { + struct net_device_stats *stats = &tunnel->dev->stats; + secpath_reset(skb); skb->protocol = *(__be16*)(h + 2); @@ -641,28 +643,28 @@ static int ipgre_rcv(struct sk_buff *skb) /* Looped back packet, drop it! 
*/ if (skb->rtable->fl.iif == 0) goto drop; - tunnel->stat.multicast++; + stats->multicast++; skb->pkt_type = PACKET_BROADCAST; } #endif if (((flags&GRE_CSUM) && csum) || (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { - tunnel->stat.rx_crc_errors++; - tunnel->stat.rx_errors++; + stats->rx_crc_errors++; + stats->rx_errors++; goto drop; } if (tunnel->parms.i_flags&GRE_SEQ) { if (!(flags&GRE_SEQ) || (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { - tunnel->stat.rx_fifo_errors++; - tunnel->stat.rx_errors++; + stats->rx_fifo_errors++; + stats->rx_errors++; goto drop; } tunnel->i_seqno = seqno + 1; } - tunnel->stat.rx_packets++; - tunnel->stat.rx_bytes += skb->len; + stats->rx_packets++; + stats->rx_bytes += skb->len; skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; @@ -684,7 +686,7 @@ drop_nolock: static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->stat; + struct net_device_stats *stats = &tunnel->dev->stats; struct iphdr *old_iph = ip_hdr(skb); struct iphdr *tiph; u8 tos; @@ -698,7 +700,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) int mtu; if (tunnel->recursion++) { - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } @@ -714,7 +716,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) /* NBMA tunnel */ if (skb->dst == NULL) { - tunnel->stat.tx_fifo_errors++; + stats->tx_fifo_errors++; goto tx_error; } @@ -765,7 +767,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .tos = RT_TOS(tos) } }, .proto = IPPROTO_GRE }; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - tunnel->stat.tx_carrier_errors++; + stats->tx_carrier_errors++; goto tx_error; } } @@ -773,7 +775,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tdev == dev) { ip_rt_put(rt); - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } @@ -1098,11 +1100,6 @@ done: return err; } -static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev) -{ - return &(((struct ip_tunnel*)netdev_priv(dev))->stat); -} - static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) { struct ip_tunnel *tunnel = netdev_priv(dev); @@ -1228,7 +1225,6 @@ static void ipgre_tunnel_setup(struct net_device *dev) dev->uninit = ipgre_tunnel_uninit; dev->destructor = free_netdev; dev->hard_start_xmit = ipgre_tunnel_xmit; - dev->get_stats = ipgre_tunnel_get_stats; dev->do_ioctl = ipgre_tunnel_ioctl; dev->change_mtu = ipgre_tunnel_change_mtu; -- cgit v1.2.3 From 50f59cea075875d84018a5fc62cf2f5e6173a919 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 May 2008 14:15:16 -0700 Subject: ipip: Use on-device stats instead of private ones. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/ipip.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 149111f08e8..26c85c23ca4 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -496,8 +496,8 @@ static int ipip_rcv(struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; - tunnel->stat.rx_packets++; - tunnel->stat.rx_bytes += skb->len; + tunnel->dev->stats.rx_packets++; + tunnel->dev->stats.rx_bytes += skb->len; skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; @@ -520,7 +520,7 @@ static int ipip_rcv(struct sk_buff *skb) static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->stat; + struct net_device_stats *stats = &tunnel->dev->stats; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; @@ -533,7 +533,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) int mtu; if (tunnel->recursion++) { - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } @@ -546,7 +546,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (!dst) { /* NBMA tunnel */ if ((rt = skb->rtable) == NULL) { - tunnel->stat.tx_fifo_errors++; + stats->tx_fifo_errors++; goto tx_error; } if ((dst = rt->rt_gateway) == 0) @@ -561,7 +561,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .tos = RT_TOS(tos) } }, .proto = IPPROTO_IPIP }; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - tunnel->stat.tx_carrier_errors++; + stats->tx_carrier_errors++; goto tx_error_icmp; } } @@ -569,7 +569,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tdev == dev) { ip_rt_put(rt); - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } @@ -579,7 +579,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; if (mtu < 68) { - tunnel->stat.collisions++; + stats->collisions++; ip_rt_put(rt); goto tx_error; } @@ -813,11 +813,6 @@ done: return err; } -static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev) -{ - return &(((struct ip_tunnel*)netdev_priv(dev))->stat); -} - static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) @@ -830,7 +825,6 @@ static void ipip_tunnel_setup(struct net_device *dev) { dev->uninit = ipip_tunnel_uninit; dev->hard_start_xmit = ipip_tunnel_xmit; - dev->get_stats = ipip_tunnel_get_stats; dev->do_ioctl = ipip_tunnel_ioctl; dev->change_mtu = ipip_tunnel_change_mtu; dev->destructor = free_netdev; -- cgit v1.2.3 From 2f4c02d40c0d59f4efc59801ad1498cb0f8550ea Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 May 2008 14:16:14 -0700 Subject: ipmr: Ipip tunnel uses on-device stats. The ipmr uses ipip tunnels for its purposes and updates the tunnels' stats, but the ipip driver is already switched to use on-device ones. Actually, this is a part of the patch #4 from this set. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/ipmr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 11700a4dcd9..65f12005785 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1230,8 +1230,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) if (vif->flags & VIFF_TUNNEL) { ip_encap(skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ - ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++; - ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len; + vif->dev->stats.tx_packets++; + vif->dev->stats.tx_bytes += skb->len; } IPCB(skb)->flags |= IPSKB_FORWARDED; -- cgit v1.2.3 From cf3677ae19c2f62979b39143f5d2f6b3dfb3b3e4 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 May 2008 14:17:33 -0700 Subject: ipmr: Use on-device stats instead of private ones. These devices use the private area of appropriate size for statistics. Turning them to use on-device ones make them "privless" and thus - really small wrt kmalloc cache, they are allocated from. Besides, code looks nicer, because of absence of multi-braced type casts and dereferences. [ Fix build failures -DaveM ] Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 65f12005785..a34da4977c7 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -181,26 +181,20 @@ static int reg_vif_num = -1; static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { read_lock(&mrt_lock); - ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(dev))->tx_packets++; + dev->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); return 0; } -static struct net_device_stats *reg_vif_get_stats(struct net_device *dev) -{ - return (struct net_device_stats*)netdev_priv(dev); -} - static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; dev->flags = IFF_NOARP; dev->hard_start_xmit = reg_vif_xmit; - dev->get_stats = reg_vif_get_stats; dev->destructor = free_netdev; } @@ -209,8 +203,7 @@ static struct net_device *ipmr_reg_vif(void) struct net_device *dev; struct in_device *in_dev; - dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg", - reg_vif_setup); + dev = alloc_netdev(0, "pimreg", reg_vif_setup); if (dev == NULL) return NULL; @@ -1170,8 +1163,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) if (vif->flags & VIFF_REGISTER) { vif->pkt_out++; vif->bytes_out+=skb->len; - ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++; + vif->dev->stats.tx_bytes += skb->len; + vif->dev->stats.tx_packets++; ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); kfree_skb(skb); return; @@ -1487,8 +1480,8 @@ int pim_rcv_v1(struct sk_buff * skb) skb->pkt_type = PACKET_HOST; dst_release(skb->dst); skb->dst = NULL; - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++; + reg_dev->stats.rx_bytes += skb->len; + reg_dev->stats.rx_packets++; nf_reset(skb); netif_rx(skb); dev_put(reg_dev); @@ -1542,8 +1535,8 @@ 
static int pim_rcv(struct sk_buff * skb) skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; dst_release(skb->dst); - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++; + reg_dev->stats.rx_bytes += skb->len; + reg_dev->stats.rx_packets++; skb->dst = NULL; nf_reset(skb); netif_rx(skb); -- cgit v1.2.3 From 560ee653b67074b805f1b661988a72a0e58811a5 Mon Sep 17 00:00:00 2001 From: James Morris Date: Mon, 9 Jun 2008 15:57:24 -0700 Subject: netfilter: ip_tables: add iptables security table for mandatory access control rules The following patch implements a new "security" table for iptables, so that MAC (SELinux etc.) networking rules can be managed separately to standard DAC rules. This is to help with distro integration of the new secmark-based network controls, per various previous discussions. The need for a separate table arises from the fact that existing tools and usage of iptables will likely clash with centralized MAC policy management. The SECMARK and CONNSECMARK targets will still be valid in the mangle table to prevent breakage of existing users. Signed-off-by: James Morris Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 12 +++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/iptable_security.c | 180 ++++++++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+) create mode 100644 net/ipv4/netfilter/iptable_security.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 2767841a8ce..6e251402506 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -365,6 +365,18 @@ config IP_NF_RAW If you want to compile it as a module, say M here and read . If unsure, say `N'. +# security table for MAC policy +config IP_NF_SECURITY + tristate "Security table" + depends on IP_NF_IPTABLES + depends on SECURITY + default m if NETFILTER_ADVANCED=n + help + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. + + If unsure, say N. + # ARP tables config IP_NF_ARPTABLES tristate "ARP tables support" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index d9b92fbf557..3f31291f37c 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o obj-$(CONFIG_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o +obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o # matches obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c new file mode 100644 index 00000000000..2b472ac2263 --- /dev/null +++ b/net/ipv4/netfilter/iptable_security.c @@ -0,0 +1,180 @@ +/* + * "security" table + * + * This is for use by Mandatory Access Control (MAC) security models, + * which need to be able to manage security policy in separate context + * to DAC. + * + * Based on iptable_mangle.c + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team netfilter.org> + * Copyright (C) 2008 Red Hat, Inc., James Morris redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris redhat.com>"); +MODULE_DESCRIPTION("iptables security table, for MAC rules"); + +#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} initial_table __initdata = { + .repl = { + .name = "security", + .valid_hooks = SECURITY_VALID_HOOKS, + .num_entries = 4, + .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + .hook_entry = { + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, + }, + .underflow = { + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, + }, + }, + .entries = { + IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ + IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ + IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ + }, + .term = IPT_ERROR_INIT, /* ERROR */ +}; + +static struct xt_table security_table = { + .name = "security", + .valid_hooks = SECURITY_VALID_HOOKS, + .lock = __RW_LOCK_UNLOCKED(security_table.lock), + .me = THIS_MODULE, + .af = AF_INET, +}; + +static unsigned int +ipt_local_in_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(skb, hook, in, out, + nf_local_in_net(in, out)->ipv4.iptable_security); +} + +static unsigned int +ipt_forward_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(skb, hook, in, out, + nf_forward_net(in, out)->ipv4.iptable_security); +} + +static unsigned int +ipt_local_out_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* Somebody is playing with raw sockets. 
*/ + if (skb->len < sizeof(struct iphdr) + || ip_hdrlen(skb) < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk(KERN_INFO "iptable_security: ignoring short " + "SOCK_RAW packet.\n"); + return NF_ACCEPT; + } + return ipt_do_table(skb, hook, in, out, + nf_local_out_net(in, out)->ipv4.iptable_security); +} + +static struct nf_hook_ops ipt_ops[] __read_mostly = { + { + .hook = ipt_local_in_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_SECURITY, + }, + { + .hook = ipt_forward_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = NF_IP_PRI_SECURITY, + }, + { + .hook = ipt_local_out_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_SECURITY, + }, +}; + +static int __net_init iptable_security_net_init(struct net *net) +{ + net->ipv4.iptable_security = + ipt_register_table(net, &security_table, &initial_table.repl); + + if (IS_ERR(net->ipv4.iptable_security)) + return PTR_ERR(net->ipv4.iptable_security); + + return 0; +} + +static void __net_exit iptable_security_net_exit(struct net *net) +{ + ipt_unregister_table(net->ipv4.iptable_security); +} + +static struct pernet_operations iptable_security_net_ops = { + .init = iptable_security_net_init, + .exit = iptable_security_net_exit, +}; + +static int __init iptable_security_init(void) +{ + int ret; + + ret = register_pernet_subsys(&iptable_security_net_ops); + if (ret < 0) + return ret; + + ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + if (ret < 0) + goto cleanup_table; + + return ret; + +cleanup_table: + unregister_pernet_subsys(&iptable_security_net_ops); + return ret; +} + +static void __exit iptable_security_fini(void) +{ + nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + unregister_pernet_subsys(&iptable_security_net_ops); +} + +module_init(iptable_security_init); +module_exit(iptable_security_fini); -- cgit v1.2.3 From 51091764f26ec36c02e35166f083193a30f426fc Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 9 Jun 2008 15:59:06 -0700 Subject: netfilter: nf_conntrack: add nf_ct_kill() Encapsulate the common if (del_timer(&ct->timeout)) ct->timeout.function((unsigned long)ct) sequence in a new function. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 78ab19accac..0e21a46184f 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -87,9 +87,8 @@ static int icmp_packet(struct nf_conn *ct, means this will only run once even if count hits zero twice (theoretically possible with SMP) */ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - if (atomic_dec_and_test(&ct->proto.icmp.count) - && del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); + if (atomic_dec_and_test(&ct->proto.icmp.count)) + nf_ct_kill(ct); } else { atomic_inc(&ct->proto.icmp.count); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); -- cgit v1.2.3 From 718d4ad98e272daebc258e49dc02f52a6a8de9d3 Mon Sep 17 00:00:00 2001 From: Fabian Hugelshofer Date: Mon, 9 Jun 2008 15:59:40 -0700 Subject: netfilter: nf_conntrack: properly account terminating packets Currently the last packet of a connection isn't accounted when its causing abnormal termination. 
Introduces nf_ct_kill_acct() which increments the accounting counters on conntrack kill. The new function was necessary, because there are calls to nf_ct_kill() which don't need accounting: nf_conntrack_proto_tcp.c line ~847: Kills ct and returns NF_REPEAT. We don't want to count twice. nf_conntrack_proto_tcp.c line ~880: Kills ct and returns NF_DROP. I think we don't want to count dropped packets. nf_conntrack_netlink.c line ~824: As far as I can see ctnetlink_del_conntrack() is used to destroy a conntrack on behalf of the user. There is an sk_buff, but I don't think this is an actual packet. Incrementing counters here is therefore not desired. Signed-off-by: Fabian Hugelshofer Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 0e21a46184f..97791048fa9 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -88,7 +88,7 @@ static int icmp_packet(struct nf_conn *ct, (theoretically possible with SMP) */ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { if (atomic_dec_and_test(&ct->proto.icmp.count)) - nf_ct_kill(ct); + nf_ct_kill_acct(ct, ctinfo, skb); } else { atomic_inc(&ct->proto.icmp.count); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); -- cgit v1.2.3 From e64bda89b8fe81cce9b4a20885d2c204c2d52532 Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Mon, 9 Jun 2008 16:00:45 -0700 Subject: netfilter: {ip,ip6,nfnetlink}_queue: misc cleanups - No need to perform data_len = 0 in the switch command, since data_len is initialized to 0 in the beginning of the ipq_build_packet_message() method. - {ip,ip6}_queue: We can reach nlmsg_failure only from one place; skb is sure to be NULL when getting there; since skb is NULL, there is no need to check this fact and call kfree_skb(). Signed-off-by: Rami Rosen Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 26a37cedcf2..aa33a4a7a71 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -156,7 +156,6 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) case IPQ_COPY_META: case IPQ_COPY_NONE: size = NLMSG_SPACE(sizeof(*pmsg)); - data_len = 0; break; case IPQ_COPY_PACKET: @@ -224,8 +223,6 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) return skb; nlmsg_failure: - if (skb) - kfree_skb(skb); *errp = -EINVAL; printk(KERN_ERR "ip_queue: error creating packet message\n"); return NULL; -- cgit v1.2.3 From 573bf470e693f73a6ac437b17a64a10902ba54bf Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 10 Jun 2008 15:40:04 -0700 Subject: ipv4 addr: Send netlink notification for address label changes Makes people happy who try to keep a list of addresses up to date by listening to notifications. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- net/ipv4/devinet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 79a7ef6209f..61011e1d580 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1013,7 +1013,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) memcpy(old, ifa->ifa_label, IFNAMSIZ); memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (named++ == 0) - continue; + goto skip; dot = strchr(old, ':'); if (dot == NULL) { sprintf(old, ":%d", named); @@ -1024,6 +1024,8 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) } else { strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); } +skip: + rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } -- cgit v1.2.3 From 7d5d5525bd88313e6fd90c0659665aee5114bc2d Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 17 Apr 2008 12:29:53 +0900 Subject: tcp md5sig: Share MD5 Signature option parser between IPv4 and IPv6. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/tcp_input.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 42 +----------------------------------------- 2 files changed, 41 insertions(+), 41 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eba873e9b56..f331e67f232 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3450,6 +3450,43 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, return 1; } +#ifdef CONFIG_TCP_MD5SIG +/* + * Parse MD5 Signature option + */ +u8 *tcp_parse_md5sig_option(struct tcphdr *th) +{ + int length = (th->doff << 2) - sizeof (*th); + u8 *ptr = (u8*)(th + 1); + + /* If the TCP option is too short, we can short cut */ + if (length < TCPOLEN_MD5SIG) + return NULL; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch(opcode) { + case TCPOPT_EOL: + return NULL; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2 || opsize > length) + return NULL; + if (opcode == TCPOPT_MD5SIG) + return ptr; + } + ptr += opsize - 2; + length -= opsize; + } + return NULL; +} +#endif + static inline void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; @@ -5467,6 +5504,9 @@ EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_reordering); EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); EXPORT_SYMBOL(tcp_parse_options); +#ifdef CONFIG_TCP_MD5SIG +EXPORT_SYMBOL(tcp_parse_md5sig_option); +#endif EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); EXPORT_SYMBOL(tcp_initialize_rcv_mss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cd601a866c2..56f55093364 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1134,52 +1134,12 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) struct tcp_md5sig_key *hash_expected; const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); - int length = (th->doff << 2) - sizeof(struct tcphdr); int genhash; - unsigned char *ptr; unsigned char newhash[16]; hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr); + hash_location = tcp_parse_md5sig_option(th); - /* - * If the TCP option length is less than the TCP_MD5SIG - * option length, then we can shortcut - */ - if (length < TCPOLEN_MD5SIG) { - if (hash_expected) - return 1; - else - return 0; - } - - /* Okay, we can't shortcut - we have to grub through the options */ - ptr = (unsigned char *)(th + 1); - while (length > 0) { - int opcode = *ptr++; - 
int opsize; - - switch (opcode) { - case TCPOPT_EOL: - goto done_opts; - case TCPOPT_NOP: - length--; - continue; - default: - opsize = *ptr++; - if (opsize < 2) - goto done_opts; - if (opsize > length) - goto done_opts; - - if (opcode == TCPOPT_MD5SIG) { - hash_location = ptr; - goto done_opts; - } - } - ptr += opsize-2; - length -= opsize; - } -done_opts: /* We've parsed the options - do we have a hash? */ if (!hash_expected && !hash_location) return 0; -- cgit v1.2.3 From 076fb7223357769c39f3ddf900bba6752369c76a Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 17 Apr 2008 12:48:12 +0900 Subject: tcp md5sig: Remove redundant protocol argument. Protocol is always TCP, so remove useless protocol argument. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/tcp_ipv4.c | 20 ++++++++------------ net/ipv4/tcp_output.c | 3 +-- 2 files changed, 9 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 56f55093364..f25445d21bd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -95,8 +95,7 @@ static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr); static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, __be32 daddr, - struct tcphdr *th, int protocol, - unsigned int tcplen); + struct tcphdr *th, unsigned int tcplen); #endif struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { @@ -586,8 +585,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) key, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, - &rep.th, IPPROTO_TCP, - arg.iov[0].iov_len); + &rep.th, arg.iov[0].iov_len); } #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, @@ -680,8 +678,7 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, key, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, - &rep.th, IPPROTO_TCP, - arg.iov[0].iov_len); + &rep.th, arg.iov[0].iov_len); } #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, @@ -1006,7 +1003,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, __be32 daddr, - struct tcphdr *th, int protocol, + struct tcphdr *th, unsigned int tcplen) { struct scatterlist sg[4]; @@ -1039,7 +1036,7 @@ static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, bp->saddr = saddr; bp->daddr = daddr; bp->pad = 0; - bp->protocol = protocol; + bp->protocol = IPPROTO_TCP; bp->len = htons(tcplen); sg_init_table(sg, 4); @@ -1099,7 +1096,7 @@ int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcphdr *th, int protocol, + struct tcphdr *th, unsigned int tcplen) { __be32 saddr, daddr; @@ -1115,7 +1112,7 @@ int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, } return tcp_v4_do_calc_md5_hash(md5_hash, key, saddr, daddr, - th, protocol, tcplen); + th, tcplen); } EXPORT_SYMBOL(tcp_v4_calc_md5_hash); @@ -1166,8 +1163,7 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) genhash = tcp_v4_do_calc_md5_hash(newhash, hash_expected, iph->saddr, iph->daddr, - th, sk->sk_protocol, - skb->len); + th, skb->len); if (genhash || memcmp(hash_location, newhash, 16) != 0) { if (net_ratelimit()) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ad993ecb481..f3ffd674edd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -607,7 +607,6 @@ static int tcp_transmit_skb(struct sock *sk, struct 
sk_buff *skb, int clone_it, md5, sk, NULL, NULL, tcp_hdr(skb), - sk->sk_protocol, skb->len); } #endif @@ -2266,7 +2265,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, tp->af_specific->calc_md5_hash(md5_hash_location, md5, NULL, dst, req, - tcp_hdr(skb), sk->sk_protocol, + tcp_hdr(skb), skb->len); } #endif -- cgit v1.2.3 From 8d26d76dd4a4c87ef037a44a42a0608ffc730199 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 17 Apr 2008 13:19:16 +0900 Subject: tcp md5sig: Share most of hash calcucaltion bits between IPv4 and IPv6. We can share most part of the hash calculation code because the only difference between IPv4 and IPv6 is their pseudo headers. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/tcp.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 52 +++------------------------------------ 2 files changed, 74 insertions(+), 48 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ab66683b804..6efbae0e551 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2459,6 +2459,76 @@ static unsigned long tcp_md5sig_users; static struct tcp_md5sig_pool **tcp_md5sig_pool; static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); +int tcp_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, + int bplen, + struct tcphdr *th, unsigned int tcplen, + struct tcp_md5sig_pool *hp) +{ + struct scatterlist sg[4]; + __u16 data_len; + int block = 0; + __sum16 cksum; + struct hash_desc *desc = &hp->md5_desc; + int err; + unsigned int nbytes = 0; + + sg_init_table(sg, 4); + + /* 1. The TCP pseudo-header */ + sg_set_buf(&sg[block++], &hp->md5_blk, bplen); + nbytes += bplen; + + /* 2. The TCP header, excluding options, and assuming a + * checksum of zero + */ + cksum = th->check; + th->check = 0; + sg_set_buf(&sg[block++], th, sizeof(*th)); + nbytes += sizeof(*th); + + /* 3. The TCP segment data (if any) */ + data_len = tcplen - (th->doff << 2); + if (data_len > 0) { + u8 *data = (u8 *)th + (th->doff << 2); + sg_set_buf(&sg[block++], data, data_len); + nbytes += data_len; + } + + /* 4. 
an independently-specified key or password, known to both + * TCPs and presumably connection-specific + */ + sg_set_buf(&sg[block++], key->key, key->keylen); + nbytes += key->keylen; + + sg_mark_end(&sg[block - 1]); + + /* Now store the hash into the packet */ + err = crypto_hash_init(desc); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING "%s(): hash_init failed\n", __func__); + return -1; + } + err = crypto_hash_update(desc, sg, nbytes); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING "%s(): hash_update failed\n", __func__); + return -1; + } + err = crypto_hash_final(desc, md5_hash); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING "%s(): hash_final failed\n", __func__); + return -1; + } + + /* Reset header */ + th->check = cksum; + + return 0; +} +EXPORT_SYMBOL(tcp_calc_md5_hash); + static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) { int cpu; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f25445d21bd..e331cdbd095 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1006,15 +1006,9 @@ static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, struct tcphdr *th, unsigned int tcplen) { - struct scatterlist sg[4]; - __u16 data_len; - int block = 0; - __sum16 old_checksum; struct tcp_md5sig_pool *hp; struct tcp4_pseudohdr *bp; - struct hash_desc *desc; int err; - unsigned int nbytes = 0; /* * Okay, so RFC2385 is turned on for this connection, @@ -1026,10 +1020,9 @@ static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, goto clear_hash_noput; bp = &hp->md5_blk.ip4; - desc = &hp->md5_desc; /* - * 1. the TCP pseudo-header (in the order: source IP address, + * The TCP pseudo-header (in the order: source IP address, * destination IP address, zero-padded protocol number, and * segment length) */ @@ -1039,50 +1032,13 @@ static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, bp->protocol = IPPROTO_TCP; bp->len = htons(tcplen); - sg_init_table(sg, 4); - - sg_set_buf(&sg[block++], bp, sizeof(*bp)); - nbytes += sizeof(*bp); - - /* 2. the TCP header, excluding options, and assuming a - * checksum of zero/ - */ - old_checksum = th->check; - th->check = 0; - sg_set_buf(&sg[block++], th, sizeof(struct tcphdr)); - nbytes += sizeof(struct tcphdr); - - /* 3. the TCP segment data (if any) */ - data_len = tcplen - (th->doff << 2); - if (data_len > 0) { - unsigned char *data = (unsigned char *)th + (th->doff << 2); - sg_set_buf(&sg[block++], data, data_len); - nbytes += data_len; - } - - /* 4. an independently-specified key or password, known to both - * TCPs and presumably connection-specific - */ - sg_set_buf(&sg[block++], key->key, key->keylen); - nbytes += key->keylen; - - sg_mark_end(&sg[block - 1]); - - /* Now store the Hash into the packet */ - err = crypto_hash_init(desc); - if (err) - goto clear_hash; - err = crypto_hash_update(desc, sg, nbytes); - if (err) - goto clear_hash; - err = crypto_hash_final(desc, md5_hash); + err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp), + th, tcplen, hp); if (err) goto clear_hash; - /* Reset header, and free up the crypto */ + /* Free up the crypto pool */ tcp_put_md5sig_pool(); - th->check = old_checksum; - out: return 0; clear_hash: -- cgit v1.2.3 From 9501f9722922f2e80e1f9dc6682311d65c2b5690 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Fri, 18 Apr 2008 12:45:16 +0900 Subject: tcp md5sig: Let the caller pass appropriate key for tcp_v{4,6}_do_calc_md5_hash(). 
As we do for other socket/timewait-socket specific parameters, let the callers pass appropriate arguments to tcp_v{4,6}_do_calc_md5_hash(). Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/tcp_ipv4.c | 50 ++++++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 30 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e331cdbd095..f7ff2a64a7f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -96,6 +96,12 @@ static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, __be32 daddr, struct tcphdr *th, unsigned int tcplen); +#else +static inline +struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) +{ + return NULL; +} #endif struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { @@ -604,9 +610,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) outside socket context is ugly, certainly. What can I do? */ -static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, - struct sk_buff *skb, u32 seq, u32 ack, - u32 win, u32 ts) +static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, + u32 win, u32 ts, int oif, + struct tcp_md5sig_key *key) { struct tcphdr *th = tcp_hdr(skb); struct { @@ -618,10 +624,6 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, ]; } rep; struct ip_reply_arg arg; -#ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *key; - struct tcp_md5sig_key tw_key; -#endif memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -647,23 +649,6 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, rep.th.window = htons(win); #ifdef CONFIG_TCP_MD5SIG - /* - * The SKB holds an imcoming packet, but may not have a valid ->sk - * pointer. This is especially the case when we're dealing with a - * TIME_WAIT ack, because the sk structure is long gone, and only - * the tcp_timewait_sock remains. So the md5 key is stashed in that - * structure, and we use it in preference. I believe that (twsk || - * skb->sk) holds true, but we program defensively. - */ - if (!twsk && skb->sk) { - key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr); - } else if (twsk && twsk->tw_md5_keylen) { - tw_key.key = twsk->tw_md5_key; - tw_key.keylen = twsk->tw_md5_keylen; - key = &tw_key; - } else - key = NULL; - if (key) { int offset = (ts) ? 
3 : 0; @@ -685,8 +670,8 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; - if (twsk) - arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if; + if (oif) + arg.bound_dev_if = oif; ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); @@ -699,9 +684,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcptw->tw_ts_recent); + tcptw->tw_ts_recent, + tw->tw_bound_dev_if, + tcp_twsk_md5_key(tcptw) + ); inet_twsk_put(tw); } @@ -709,9 +697,11 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) { - tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1, + tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, - req->ts_recent); + req->ts_recent, + 0, + tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr)); } /* -- cgit v1.2.3 From 0b040829952d84bf2a62526f0e24b624e0699447 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 10 Jun 2008 22:46:50 -0700 Subject: net: remove CVS keywords This patch removes CVS keywords that weren't updated for a long time from comments. Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 -- net/ipv4/arp.c | 2 -- net/ipv4/devinet.c | 2 -- net/ipv4/fib_frontend.c | 2 -- net/ipv4/fib_hash.c | 2 -- net/ipv4/fib_semantics.c | 2 -- net/ipv4/fib_trie.c | 2 -- net/ipv4/icmp.c | 2 -- net/ipv4/igmp.c | 2 -- net/ipv4/inet_diag.c | 2 -- net/ipv4/inetpeer.c | 2 -- net/ipv4/ip_forward.c | 2 -- net/ipv4/ip_fragment.c | 2 -- net/ipv4/ip_input.c | 2 -- net/ipv4/ip_options.c | 2 -- net/ipv4/ip_output.c | 2 -- net/ipv4/ip_sockglue.c | 2 -- net/ipv4/ipconfig.c | 2 -- net/ipv4/ipip.c | 2 -- net/ipv4/ipmr.c | 2 -- net/ipv4/ipvs/ip_vs_app.c | 2 -- net/ipv4/ipvs/ip_vs_conn.c | 2 -- net/ipv4/ipvs/ip_vs_core.c | 2 -- net/ipv4/ipvs/ip_vs_ctl.c | 2 -- net/ipv4/ipvs/ip_vs_dh.c | 2 -- net/ipv4/ipvs/ip_vs_est.c | 2 -- net/ipv4/ipvs/ip_vs_ftp.c | 2 -- net/ipv4/ipvs/ip_vs_lblc.c | 2 -- net/ipv4/ipvs/ip_vs_lblcr.c | 2 -- net/ipv4/ipvs/ip_vs_lc.c | 2 -- net/ipv4/ipvs/ip_vs_nq.c | 2 -- net/ipv4/ipvs/ip_vs_proto.c | 2 -- net/ipv4/ipvs/ip_vs_proto_ah.c | 2 -- net/ipv4/ipvs/ip_vs_proto_esp.c | 2 -- net/ipv4/ipvs/ip_vs_proto_tcp.c | 2 -- net/ipv4/ipvs/ip_vs_proto_udp.c | 2 -- net/ipv4/ipvs/ip_vs_rr.c | 2 -- net/ipv4/ipvs/ip_vs_sched.c | 2 -- net/ipv4/ipvs/ip_vs_sed.c | 2 -- net/ipv4/ipvs/ip_vs_sh.c | 2 -- net/ipv4/ipvs/ip_vs_sync.c | 2 -- net/ipv4/ipvs/ip_vs_wlc.c | 2 -- net/ipv4/ipvs/ip_vs_wrr.c | 2 -- net/ipv4/ipvs/ip_vs_xmit.c | 2 -- net/ipv4/proc.c | 2 -- net/ipv4/protocol.c | 2 -- net/ipv4/raw.c | 2 -- net/ipv4/route.c | 2 -- net/ipv4/syncookies.c | 2 -- net/ipv4/sysctl_net_ipv4.c | 2 -- net/ipv4/tcp.c | 2 -- net/ipv4/tcp_diag.c | 2 -- net/ipv4/tcp_input.c | 2 -- net/ipv4/tcp_ipv4.c | 2 -- net/ipv4/tcp_minisocks.c | 2 -- net/ipv4/tcp_output.c | 2 -- net/ipv4/tcp_timer.c | 2 -- net/ipv4/udp.c | 2 -- net/ipv4/udplite.c | 2 -- 59 files changed, 118 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 24eca23c2db..42bd24b64b5 100644 --- a/net/ipv4/af_inet.c +++ 
b/net/ipv4/af_inet.c @@ -5,8 +5,6 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Florian La Roche, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 9b539fa9fe1..20c515a1be2 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,4 @@ /* linux/net/ipv4/arp.c - * - * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $ * * Copyright (C) 1994 by Florian La Roche * diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 61011e1d580..f8c0b0aea93 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,8 +1,6 @@ /* * NET3 IP device support routines. * - * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0b2ac6a3d90..5ad01d63f83 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -5,8 +5,6 @@ * * IPv4 Forwarding Information Base: FIB frontend. * - * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $ - * * Authors: Alexey Kuznetsov, * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 2e2fc3376ac..eeec4bf982b 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -5,8 +5,6 @@ * * IPv4 FIB: lookup engine and maintenance routines. * - * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $ - * * Authors: Alexey Kuznetsov, * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3b83c34019f..9335eba683c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -5,8 +5,6 @@ * * IPv4 Forwarding Information Base: semantics. * - * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $ - * * Authors: Alexey Kuznetsov, * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 4b02d14e7ab..394db9c941a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -22,8 +22,6 @@ * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * - * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $ - * * * Code from fib_hash has been reused which includes the following header: * diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 87397351dda..aa7cf46853b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,8 +3,6 @@ * * Alan Cox, * - * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 2769dc4a4c8..68e84a933e9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,8 +8,6 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. 
* - * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $ - * * Authors: * Alan Cox * diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index da97695e709..c10036e7a46 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1,8 +1,6 @@ /* * inet_diag.c Module for monitoring INET transport protocols sockets. * - * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ - * * Authors: Alexey Kuznetsov, * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index af995198f64..a456ceeac3f 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -3,8 +3,6 @@ * * This source is covered by the GNU GPL, the same as all kernel sources. * - * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $ - * * Authors: Andrey V. Savochkin */ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 4813c39b438..37d36a3f33c 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -5,8 +5,6 @@ * * The IP forwarding functionality. * - * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $ - * * Authors: see ip.c * * Fixes: diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index be1cb89a8d5..91e32140731 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,8 +5,6 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $ - * * Authors: Fred N. van Kempen * Alan Cox * diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index ff77a4a7f9e..7c26428ea67 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,8 +5,6 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Donald Becker, diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 33126ad2cfd..be3f18a7a40 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,8 +5,6 @@ * * The options processing module for ip.c * - * Version: $Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $ - * * Authors: A.N.Kuznetsov * */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e527628f56c..f1278eecf56 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,8 +5,6 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Donald Becker, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e0514e82308..105d92a039b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,8 +5,6 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $ - * * Authors: see ip.c * * Fixes: diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index ed45037ce9b..b88aa9afa42 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,6 +1,4 @@ /* - * $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $ - * * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or * user-supplied information to configure own IP address and routes. * diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 86d8836551b..4c6d2caf920 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,8 +1,6 @@ /* * Linux NET3: IP/IP protocol decoder. 
* - * Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $ - * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 * diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a34da4977c7..300ab0c2919 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,8 +9,6 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $ - * * Fixes: * Michael Chastain : Incorrect size of copying. * Alan Cox : Added the cache manager code diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index 535abe0c45e..1f1897a1a70 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -1,8 +1,6 @@ /* * ip_vs_app.c: Application module support for IPVS * - * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $ - * * Authors: Wensong Zhang * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 65f1ba11275..f8bdae47a77 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -5,8 +5,6 @@ * high-performance and highly available server based on a * cluster of servers. * - * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $ - * * Authors: Wensong Zhang * Peter Kese * Julian Anastasov diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 963981a9d50..bcf6276ba4b 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -5,8 +5,6 @@ * high-performance and highly available server based on a * cluster of servers. * - * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $ - * * Authors: Wensong Zhang * Peter Kese * Julian Anastasov diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 94c5767c8e0..9a5ace0b4dd 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -5,8 +5,6 @@ * high-performance and highly available server based on a * cluster of servers. 
* - * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $ - * * Authors: Wensong Zhang * Peter Kese * Julian Anastasov diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index dcf5d46aaa5..8afc1503ed2 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -1,8 +1,6 @@ /* * IPVS: Destination Hashing scheduling module * - * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang * * Inspired by the consistent hashing scheduler patch from diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index dfa0d713c80..bc04eedd6db 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -1,8 +1,6 @@ /* * ip_vs_est.c: simple rate estimator for IPVS * - * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $ - * * Authors: Wensong Zhang * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c index 59aa166b767..c1c758e4f73 100644 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -1,8 +1,6 @@ /* * ip_vs_ftp.c: IPVS ftp application module * - * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang * * Changes: diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 3888642706a..0efa3db4b18 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -1,8 +1,6 @@ /* * IPVS: Locality-Based Least-Connection scheduling module * - * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index daa260eb21c..8e3bbeb4513 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -1,8 +1,6 @@ /* * IPVS: Locality-Based Least-Connection with Replication scheduler * - * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c index d88fef90a64..ac9f08e065d 100644 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -1,8 +1,6 @@ /* * IPVS: Least-Connection Scheduling module * - * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $ - * * Authors: Wensong Zhang * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c index bc2a9e5f2a7..a46bf258d42 100644 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -1,8 +1,6 @@ /* * IPVS: Never Queue scheduling module * - * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $ - * * Authors: Wensong Zhang * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index 4b1c16cbb16..876714f23d6 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -1,8 +1,6 @@ /* * ip_vs_proto.c: transport protocol load balancing support for IPVS * - * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $ - * * Authors: Wensong Zhang * Julian Anastasov * diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 4bf835e1d86..73e0ea87c1f 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -1,8 +1,6 @@ /* * ip_vs_proto_ah.c: AH IPSec load balancing support 
for IPVS * - * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ - * * Authors: Julian Anastasov , February 2002 * Wensong Zhang * diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index db6a6b7b1a0..21d70c8ffa5 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -1,8 +1,6 @@ /* * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS * - * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ - * * Authors: Julian Anastasov , February 2002 * Wensong Zhang * diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index b83dc14b0a4..d0ea467986a 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -1,8 +1,6 @@ /* * ip_vs_proto_tcp.c: TCP load balancing support for IPVS * - * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ - * * Authors: Wensong Zhang * Julian Anastasov * diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 75771cb3cd6..c6be5d56823 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -1,8 +1,6 @@ /* * ip_vs_proto_udp.c: UDP load balancing support for IPVS * - * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ - * * Authors: Wensong Zhang * Julian Anastasov * diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c index 433f8a94792..c8db12d39e6 100644 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -1,8 +1,6 @@ /* * IPVS: Round-Robin Scheduling module * - * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang * Peter Kese * diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c index 121a32b1b75..b6476730985 100644 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ b/net/ipv4/ipvs/ip_vs_sched.c @@ -5,8 +5,6 @@ * high-performance and highly available server based on a * cluster of servers. * - * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $ - * * Authors: Wensong Zhang * Peter Kese * diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c index dd7c128f9db..2a7d3135818 100644 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -1,8 +1,6 @@ /* * IPVS: Shortest Expected Delay scheduling module * - * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $ - * * Authors: Wensong Zhang * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 1b25b00ef1e..b8fdfac6500 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -1,8 +1,6 @@ /* * IPVS: Source Hashing scheduling module * - * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index eff54efe035..2d4a86f7332 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -5,8 +5,6 @@ * high-performance and highly available server based on a * cluster of servers. 
* - * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $ - * * Authors: Wensong Zhang * * ip_vs_sync: sync connection info from master load balancer to backups diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c index 8a9d913261d..772c3cb4eca 100644 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -1,8 +1,6 @@ /* * IPVS: Weighted Least-Connection Scheduling module * - * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $ - * * Authors: Wensong Zhang * Peter Kese * diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c index 85c680add6d..1d6932d7dc9 100644 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ b/net/ipv4/ipvs/ip_vs_wrr.c @@ -1,8 +1,6 @@ /* * IPVS: Weighted Round-Robin Scheduling module * - * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index f63006caea0..9892d4aca42 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -1,8 +1,6 @@ /* * ip_vs_xmit.c: various packet transmitters for IPVS * - * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $ - * * Authors: Wensong Zhang * Julian Anastasov * diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 552169b41b1..eb5cee279c5 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,8 +7,6 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.45 2001/05/16 16:45:35 davem Exp $ - * * Authors: Fred N. van Kempen, * Gerald J. Heim, * Fred Baumgarten, diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 971ab9356e5..ea50da0649f 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -5,8 +5,6 @@ * * INET protocol dispatch tables. * - * Version: $Id: protocol.c,v 1.14 2001/05/18 02:25:49 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e7e091d365f..1d0c97c8712 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,8 +5,6 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.64 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 96be336064f..fe3a0223728 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,8 +5,6 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Alan Cox, diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 73ba98921d6..6317d3c8dc0 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -8,8 +8,6 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. - * - * $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $ */ #include diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c437f804ee3..90160700320 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,8 +1,6 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.50 2001/10/20 00:00:11 davem Exp $ - * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). 
[MS] */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ab66683b804..ad66b09e0bc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Mark Evans, diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 2fbcc7d1b1a..838d491dfda 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -1,8 +1,6 @@ /* * tcp_diag.c Module for monitoring TCP transport protocols sockets. * - * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ - * * Authors: Alexey Kuznetsov, * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eba873e9b56..b68c3c7d906 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Mark Evans, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cd601a866c2..f2926ae1de5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $ - * * IPv4 specific functions * * diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 019c8c16e5c..1276cab85e3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Mark Evans, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ad993ecb481..b171ac65cca 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Mark Evans, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4de68cf5f2a..e77e7ae0bf2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Mark Evans, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 56fcda3694b..355e6d62d48 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,8 +5,6 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Arnt Gulbrandsen, diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 72ce26b6c4d..4ad16b6d513 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -1,8 +1,6 @@ /* * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828). * - * Version: $Id: udplite.c,v 1.25 2006/10/19 07:22:36 gerrit Exp $ - * * Authors: Gerrit Renker * * Changes: -- cgit v1.2.3 From 7d06b2e053d2d536348e3a0f6bb02982a41bea37 Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Sat, 14 Jun 2008 17:04:49 -0700 Subject: net: change proto destroy method to return void Change struct proto destroy function pointer to return void. Noticed by Al Viro. Signed-off-by: Brian Haley Signed-off-by: David S. 
Miller --- net/ipv4/raw.c | 3 +-- net/ipv4/tcp_ipv4.c | 4 +--- net/ipv4/udp.c | 3 +-- net/ipv4/udp_impl.h | 2 +- 4 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 1d0c97c8712..36035a0c6dc 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -606,12 +606,11 @@ static void raw_close(struct sock *sk, long timeout) sk_common_release(sk); } -static int raw_destroy(struct sock *sk) +static void raw_destroy(struct sock *sk) { lock_sock(sk); ip_flush_pending_frames(sk); release_sock(sk); - return 0; } /* This gets rid of all the nasties in af_inet. -DaveM */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b219a7a7cd0..64b385f6593 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1775,7 +1775,7 @@ static int tcp_v4_init_sock(struct sock *sk) return 0; } -int tcp_v4_destroy_sock(struct sock *sk) +void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1819,8 +1819,6 @@ int tcp_v4_destroy_sock(struct sock *sk) } atomic_dec(&tcp_sockets_allocated); - - return 0; } EXPORT_SYMBOL(tcp_v4_destroy_sock); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 355e6d62d48..eba790dcd16 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1253,12 +1253,11 @@ int udp_rcv(struct sk_buff *skb) return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP); } -int udp_destroy_sock(struct sock *sk) +void udp_destroy_sock(struct sock *sk) { lock_sock(sk); udp_flush_pending_frames(sk); release_sock(sk); - return 0; } /* diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 7288bf7977f..2e9bad2fa1b 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -26,7 +26,7 @@ extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, extern int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); -extern int udp_destroy_sock(struct sock *sk); +extern void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS extern int udp4_seq_show(struct seq_file *seq, void *v); -- cgit v1.2.3 From d6266281f8175e3ad68c28b20a609b278b47ade5 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 16 Jun 2008 17:11:50 -0700 Subject: udp: introduce a udp_hashfn function Currently the chain to store a UDP socket is calculated with simple (x & (UDP_HTABLE_SIZE - 1)). But taking net into account would make this calculation a bit more complex, so moving it into a function would help. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/udp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eba790dcd16..d8f527d1570 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -134,7 +134,7 @@ static inline int __udp_lib_lport_inuse(struct net *net, __u16 num, struct sock *sk; struct hlist_node *node; - sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) + sk_for_each(sk, node, &udptable[udp_hashfn(num)]) if (net_eq(sock_net(sk), net) && sk->sk_hash == num) return 1; return 0; @@ -174,7 +174,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, for (i = 0; i < UDP_HTABLE_SIZE; i++) { int size = 0; - head = &udptable[rover & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[udp_hashfn(rover)]; if (hlist_empty(head)) goto gotit; @@ -211,7 +211,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, gotit: snum = rover; } else { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[udp_hashfn(snum)]; sk_for_each(sk2, node, head) if (sk2->sk_hash == snum && @@ -227,7 +227,7 @@ gotit: inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[udp_hashfn(snum)]; sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } @@ -264,7 +264,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int badness = -1; read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + sk_for_each(sk, node, &udptable[udp_hashfn(hnum)]) { struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && @@ -1068,7 +1068,7 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, int dif; read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + sk = sk_head(&udptable[udp_hashfn(ntohs(uh->dest))]); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (sk) { -- cgit v1.2.3 From e31634931d00081c75e3fb3f3ec51a50dbf108bb Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 16 Jun 2008 17:12:11 -0700 Subject: udp: provide a struct net pointer for __udp[46]_lib_mcast_deliver They both calculate the hash chain, but currently do not have a struct net pointer, so pass one there via additional argument, all the more so their callers already have such. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/udp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d8f527d1570..6b0acb438f3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1059,7 +1059,7 @@ drop: * Note: called only from the BH handler context, * so we don't need to lock the hashes. */ -static int __udp4_lib_mcast_deliver(struct sk_buff *skb, +static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, struct hlist_head udptable[]) @@ -1156,6 +1156,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], struct rtable *rt = (struct rtable*)skb->dst; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; + struct net *net; /* * Validate the packet. 
@@ -1177,10 +1178,12 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], if (udp4_csum_init(skb, uh, proto)) goto csum_error; + net = dev_net(skb->dev); if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable); + return __udp4_lib_mcast_deliver(net, skb, uh, + saddr, daddr, udptable); - sk = __udp4_lib_lookup(dev_net(skb->dev), saddr, uh->source, daddr, + sk = __udp4_lib_lookup(net, saddr, uh->source, daddr, uh->dest, inet_iif(skb), udptable); if (sk != NULL) { -- cgit v1.2.3 From 19c7578fb22b0aef103222cae9b522f03ae489d6 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 16 Jun 2008 17:12:29 -0700 Subject: udp: add struct net argument to udp_hashfn Every caller already has this one. The new argument is currently unused, but this will be fixed shortly. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/udp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6b0acb438f3..11eabf13614 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -134,7 +134,7 @@ static inline int __udp_lib_lport_inuse(struct net *net, __u16 num, struct sock *sk; struct hlist_node *node; - sk_for_each(sk, node, &udptable[udp_hashfn(num)]) + sk_for_each(sk, node, &udptable[udp_hashfn(net, num)]) if (net_eq(sock_net(sk), net) && sk->sk_hash == num) return 1; return 0; @@ -174,7 +174,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, for (i = 0; i < UDP_HTABLE_SIZE; i++) { int size = 0; - head = &udptable[udp_hashfn(rover)]; + head = &udptable[udp_hashfn(net, rover)]; if (hlist_empty(head)) goto gotit; @@ -211,7 +211,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, gotit: snum = rover; } else { - head = &udptable[udp_hashfn(snum)]; + head = &udptable[udp_hashfn(net, snum)]; sk_for_each(sk2, node, head) if (sk2->sk_hash == snum && @@ -227,7 +227,7 @@ gotit: inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - head = &udptable[udp_hashfn(snum)]; + head = &udptable[udp_hashfn(net, snum)]; sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } @@ -264,7 +264,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int badness = -1; read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[udp_hashfn(hnum)]) { + sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) { struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && @@ -1068,7 +1068,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif; read_lock(&udp_hash_lock); - sk = sk_head(&udptable[udp_hashfn(ntohs(uh->dest))]); + sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (sk) { -- cgit v1.2.3 From 7f635ab71eef8da012320c0092b662d6af8c1e69 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 16 Jun 2008 17:12:49 -0700 Subject: inet: add struct net argument to inet_bhashfn Binding to some port in many namespaces may create too long chains in bhash-es, so prepare the hashfn to take struct net into account. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/inet_connection_sock.c | 6 ++++-- net/ipv4/inet_hashtables.c | 11 +++++++---- net/ipv4/inet_timewait_sock.c | 6 ++++-- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 045e799d3e1..4c804b3c287 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -103,7 +103,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) rover = net_random() % remaining + low; do { - head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(net, rover, + hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->ib_net == net && tb->port == rover) @@ -130,7 +131,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) */ snum = rover; } else { - head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(net, snum, + hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->ib_net == net && tb->port == snum) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2023d37b270..dc1b78de899 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -70,7 +70,8 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, static void __inet_put_port(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num, + hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; @@ -95,7 +96,8 @@ EXPORT_SYMBOL(inet_put_port); void __inet_inherit_port(struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num, + table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; @@ -438,7 +440,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -493,7 +496,7 @@ ok: goto out; } - head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)]; tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ce16e9ac24c..06006a5ac8b 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -32,7 +32,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, write_unlock(lock); /* Disassociate with bind bucket. 
*/ - bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); tb = tw->tw_tb; __hlist_del(&tw->tw_bind_node); @@ -81,7 +82,8 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; BUG_TRAP(icsk->icsk_bind_hash); -- cgit v1.2.3 From 2086a65078bd24682bdcf413d9c91d81988b8359 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 16 Jun 2008 17:13:08 -0700 Subject: inet: add struct net argument to inet_lhashfn Listening-on-one-port sockets in many namespaces produce long chains in the listening_hash-es, so prepare the inet_lhashfn to take struct net into account. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index dc1b78de899..4f597b3175e 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -194,7 +194,7 @@ struct sock *__inet_lookup_listener(struct net *net, const struct hlist_head *head; read_lock(&hashinfo->lhash_lock); - head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; + head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; if (!hlist_empty(head)) { const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); -- cgit v1.2.3 From 9f26b3add3783c0af869ea2207871da5dafefffa Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 16 Jun 2008 17:13:27 -0700 Subject: inet: add struct net argument to inet_ehashfn Although this hash takes addresses into account, the ehash chains can also be too long when, for instance, communications via lo occur. So, prepare the inet_hashfn to take struct net into account. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4f597b3175e..eca5899729e 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -227,7 +227,7 @@ struct sock * __inet_lookup_established(struct net *net, /* Optimize here for direct hit, only listening connections can * have wildcards anyways. 
*/ - unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); + unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); @@ -267,13 +267,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, int dif = sk->sk_bound_dev_if; INET_ADDR_COOKIE(acookie, saddr, daddr) const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); + struct net *net = sock_net(sk); + unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); rwlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_node *node; struct inet_timewait_sock *tw; - struct net *net = sock_net(sk); prefetch(head->chain.first); write_lock(lock); -- cgit v1.2.3 From cb61cb9b8b5ef6c2697d84e5015e314626eb2fba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Jun 2008 21:04:56 -0700 Subject: udp: sk_drops handling In commits 33c732c36169d7022ad7d6eb474b0c9be43a2dc1 ([IPV4]: Add raw drops counter) and a92aa318b4b369091fd80433c80e62838db8bc1c ([IPV6]: Add raw drops counter), Wang Chen added raw drops counter for /proc/net/raw & /proc/net/raw6 This patch adds this capability to UDP sockets too (/proc/net/udp & /proc/net/udp6). This means that 'RcvbufErrors' errors found in /proc/net/snmp can be also be examined for each udp socket. # grep Udp: /proc/net/snmp Udp: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors Udp: 23971006 75 899420 16390693 146348 0 # cat /proc/net/udp sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt --- uid timeout inode ref pointer drops 75: 00000000:02CB 00000000:0000 07 00000000:00000000 00:00000000 00000000 --- 0 0 2358 2 ffff81082a538c80 0 111: 00000000:006F 00000000:0000 07 00000000:00000000 00:00000000 00000000 --- 0 0 2286 2 ffff81042dd35c80 146348 In this example, only port 111 (0x006F) was flooded by messages that user program could not read fast enough. 146348 messages were lost. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 7d449468409..925fdf18cf9 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -944,7 +944,7 @@ static int raw_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_printf(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " - "inode drops\n"); + "inode ref pointer drops\n"); else raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); return 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 11eabf13614..3bbf6fb6e4f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1040,8 +1040,10 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { /* Note that an ENOMEM error is charged twice */ - if (rc == -ENOMEM) + if (rc == -ENOMEM) { UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, is_udplite); + atomic_inc(&sk->sk_drops); + } goto drop; } @@ -1629,12 +1631,13 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", bucket, src, srcp, dest, destp, sp->sk_state, atomic_read(&sp->sk_wmem_alloc), atomic_read(&sp->sk_rmem_alloc), 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, len); + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops), len); } int udp4_seq_show(struct seq_file *seq, void *v) @@ -1643,7 +1646,7 @@ int udp4_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-127s\n", " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " - "inode"); + "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; int len; -- cgit v1.2.3 From 0187bdfb05674147774ca79a79942537f3ad54bd Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 19 Jun 2008 16:15:47 -0700 Subject: net: Disable LRO on devices that are forwarding Large Receive Offload (LRO) is only appropriate for packets that are destined for the host, and should be disabled if received packets may be forwarded. It can also confuse the GSO on output. Add dev_disable_lro() function which uses the appropriate ethtool ops to disable LRO if enabled. Add calls to dev_disable_lro() in br_add_if() and functions that enable IPv4 and IPv6 forwarding. Signed-off-by: Ben Hutchings Signed-off-by: David S. 
Miller --- net/ipv4/devinet.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index f8c0b0aea93..9de2514946c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -168,6 +168,8 @@ static struct in_device *inetdev_init(struct net_device *dev) in_dev->dev = dev; if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) goto out_kfree; + if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) + dev_disable_lro(dev); /* Reference in_dev->dev */ dev_hold(dev); /* Account for reference dev->ip_ptr (below) */ @@ -1241,6 +1243,8 @@ static void inet_forward_change(struct net *net) read_lock(&dev_base_lock); for_each_netdev(net, dev) { struct in_device *in_dev; + if (on) + dev_disable_lro(dev); rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) @@ -1248,8 +1252,6 @@ static void inet_forward_change(struct net *net) rcu_read_unlock(); } read_unlock(&dev_base_lock); - - rt_cache_flush(0); } static int devinet_conf_proc(ctl_table *ctl, int write, @@ -1335,10 +1337,19 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, if (write && *valp != val) { struct net *net = ctl->extra2; - if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) - inet_forward_change(net); - else if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) + if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { + rtnl_lock(); + if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { + inet_forward_change(net); + } else if (*valp) { + struct ipv4_devconf *cnf = ctl->extra1; + struct in_device *idev = + container_of(cnf, struct in_device, cnf); + dev_disable_lro(idev->dev); + } + rtnl_unlock(); rt_cache_flush(0); + } } return ret; -- cgit v1.2.3 From 4497b0763cb1afae463f5e144c28b5d806e28b60 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 19 Jun 2008 16:22:28 -0700 Subject: net: Discard and warn about LRO'd skbs received for forwarding Add skb_warn_if_lro() to test whether an skb was received with LRO and warn if so. Change br_forward(), ip_forward() and ip6_forward() to call it) and discard the skb if it returns true. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 37d36a3f33c..da14725916d 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -56,6 +56,9 @@ int ip_forward(struct sk_buff *skb) struct rtable *rt; /* Route we use */ struct ip_options * opt = &(IPCB(skb)->opt); + if (skb_warn_if_lro(skb)) + goto drop; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) goto drop; -- cgit v1.2.3 From 6dbf4bcac98bbc76ef425b3a2b4169f31199f6c7 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 1 Jul 2008 19:29:07 -0700 Subject: icmp: fix units for ratelimit Convert the sysctl values for icmp ratelimit to use milliseconds instead of jiffies which is based on kernel configured HZ. Internal kernel jiffies are not a proper unit for any userspace API. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/sysctl_net_ipv4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 90160700320..14ef202a225 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -793,7 +793,8 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies }, { .ctl_name = NET_IPV4_ICMP_RATEMASK, -- cgit v1.2.3 From 40b215e594b65a3488576c9d24b367548e18902a Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 3 Jul 2008 01:05:41 -0700 Subject: tcp: de-bloat a bit with factoring NET_INC_STATS_BH out There are some places in TCP that select one MIB index to bump snmp statistics like this: if () NET_INC_STATS_BH(); else if () NET_INC_STATS_BH(); ... else NET_INC_STATS_BH(); or in a more tricky but still similar way. On the other hand, this NET_INC_STATS_BH is a camouflaged increment of percpu variable, which is not that small. Factoring those cases out de-bloats 235 bytes on non-preemptible i386 config and drives parts of the code into 80 columns. add/remove: 0/0 grow/shrink: 0/7 up/down: 0/-235 (-235) function old new delta tcp_fastretrans_alert 1437 1424 -13 tcp_dsack_set 137 124 -13 tcp_xmit_retransmit_queue 690 676 -14 tcp_try_undo_recovery 283 265 -18 tcp_sacktag_write_queue 1550 1515 -35 tcp_update_reordering 162 106 -56 tcp_retransmit_timer 990 904 -86 Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 46 ++++++++++++++++++++++++++++++++-------------- net/ipv4/tcp_output.c | 7 +++++-- net/ipv4/tcp_timer.c | 15 +++++++++------ 3 files changed, 46 insertions(+), 22 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index de30e70ff25..d6ea970a151 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -947,17 +947,21 @@ static void tcp_update_reordering(struct sock *sk, const int metric, { struct tcp_sock *tp = tcp_sk(sk); if (metric > tp->reordering) { + int mib_idx; + tp->reordering = min(TCP_MAX_REORDERING, metric); /* This exciting event is worth to be remembered. 
8) */ if (ts) - NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); + mib_idx = LINUX_MIB_TCPTSREORDER; else if (tcp_is_reno(tp)) - NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); + mib_idx = LINUX_MIB_TCPRENOREORDER; else if (tcp_is_fack(tp)) - NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); + mib_idx = LINUX_MIB_TCPFACKREORDER; else - NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); + mib_idx = LINUX_MIB_TCPSACKREORDER; + + NET_INC_STATS_BH(mib_idx); #if FASTRETRANS_DEBUG > 1 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, @@ -1456,18 +1460,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, if (!tcp_is_sackblock_valid(tp, dup_sack, sp[used_sacks].start_seq, sp[used_sacks].end_seq)) { + int mib_idx; + if (dup_sack) { if (!tp->undo_marker) - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); + mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO; else - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); + mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD; } else { /* Don't count olds caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && !after(sp[used_sacks].end_seq, tp->snd_una)) continue; - NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); + mib_idx = LINUX_MIB_TCPSACKDISCARD; } + + NET_INC_STATS_BH(mib_idx); if (i == 0) first_sack_index = -1; continue; @@ -2380,15 +2388,19 @@ static int tcp_try_undo_recovery(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tcp_may_undo(tp)) { + int mib_idx; + /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) - NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); + mib_idx = LINUX_MIB_TCPLOSSUNDO; else - NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); + mib_idx = LINUX_MIB_TCPFULLUNDO; + + NET_INC_STATS_BH(mib_idx); tp->undo_marker = 0; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { @@ -2560,7 +2572,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int fast_rexmit = 0; + int fast_rexmit = 0, mib_idx; if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; @@ -2683,9 +2695,11 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) /* Otherwise enter Recovery state */ if (tcp_is_reno(tp)) - NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY); + mib_idx = LINUX_MIB_TCPRENORECOVERY; else - NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY); + mib_idx = LINUX_MIB_TCPSACKRECOVERY; + + NET_INC_STATS_BH(mib_idx); tp->high_seq = tp->snd_nxt; tp->prior_ssthresh = 0; @@ -3700,10 +3714,14 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + int mib_idx; + if (before(seq, tp->rcv_nxt)) - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT); + mib_idx = LINUX_MIB_TCPDSACKOLDSENT; else - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT); + mib_idx = LINUX_MIB_TCPDSACKOFOSENT; + + NET_INC_STATS_BH(mib_idx); tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8f83ab43270..edef2afe905 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1985,14 +1985,17 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (sacked & 
TCPCB_LOST) { if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { + int mib_idx; + if (tcp_retransmit_skb(sk, skb)) { tp->retransmit_skb_hint = NULL; return; } if (icsk->icsk_ca_state != TCP_CA_Loss) - NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); + mib_idx = LINUX_MIB_TCPFASTRETRANS; else - NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; + NET_INC_STATS_BH(mib_idx); if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3e358cbb124..6a480d1fd8f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -326,24 +326,27 @@ static void tcp_retransmit_timer(struct sock *sk) goto out; if (icsk->icsk_retransmits == 0) { + int mib_idx; + if (icsk->icsk_ca_state == TCP_CA_Disorder || icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) { if (icsk->icsk_ca_state == TCP_CA_Recovery) - NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); + mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; else - NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); + mib_idx = LINUX_MIB_TCPSACKFAILURES; } else { if (icsk->icsk_ca_state == TCP_CA_Recovery) - NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); + mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; else - NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); + mib_idx = LINUX_MIB_TCPRENOFAILURES; } } else if (icsk->icsk_ca_state == TCP_CA_Loss) { - NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); + mib_idx = LINUX_MIB_TCPLOSSFAILURES; } else { - NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); + mib_idx = LINUX_MIB_TCPTIMEOUTS; } + NET_INC_STATS_BH(mib_idx); } if (tcp_use_frto(sk)) { -- cgit v1.2.3 From 03d2f897e9fb3218989baa2139a951ce7f5414bf Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Thu, 3 Jul 2008 12:13:36 +0800 Subject: ipv4: Do cleanup for ip_mr_init Same as ip6_mr_init(), make ip_mr_init() return errno if fails. But do not do error handling in inet_init(), just print a msg. 
Signed-off-by: Wang Chen Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/af_inet.c | 5 +++-- net/ipv4/ipmr.c | 28 ++++++++++++++++++++++++---- 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 42bd24b64b5..dc411335c14 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1479,14 +1479,15 @@ static int __init inet_init(void) * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) - ip_mr_init(); + if (ip_mr_init()) + printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n"); #endif /* * Initialise per-cpu ipv4 mibs */ if (init_ipv4_mibs()) - printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ; + printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ipv4_proc_init(); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 300ab0c2919..438fab9c62a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1878,16 +1878,36 @@ static struct net_protocol pim_protocol = { * Setup for IP multicast routing */ -void __init ip_mr_init(void) +int __init ip_mr_init(void) { + int err; + mrt_cachep = kmem_cache_create("ip_mrt_cache", sizeof(struct mfc_cache), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + if (!mrt_cachep) + return -ENOMEM; + setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0); - register_netdevice_notifier(&ip_mr_notifier); + err = register_netdevice_notifier(&ip_mr_notifier); + if (err) + goto reg_notif_fail; #ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops); - proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops); + err = -ENOMEM; + if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops)) + goto proc_vif_fail; + if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops)) + goto proc_cache_fail; #endif + return 0; +reg_notif_fail: + kmem_cache_destroy(mrt_cachep); +#ifdef CONFIG_PROC_FS +proc_vif_fail: + unregister_netdevice_notifier(&ip_mr_notifier); +proc_cache_fail: + proc_net_remove(&init_net, "ip_mr_vif"); +#endif + return err; } -- cgit v1.2.3 From 76e6ebfb40a2455c18234dcb0f9df37533215461 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 5 Jul 2008 19:00:44 -0700 Subject: netns: add namespace parameter to rt_cache_flush Signed-off-by: Denis V. Lunev Signed-off-by: David S. 
Miller --- net/ipv4/arp.c | 2 +- net/ipv4/devinet.c | 8 +++++--- net/ipv4/fib_frontend.c | 17 +++++++++-------- net/ipv4/fib_hash.c | 6 +++--- net/ipv4/fib_rules.c | 2 +- net/ipv4/fib_trie.c | 6 +++--- net/ipv4/route.c | 8 ++++---- 7 files changed, 26 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 20c515a1be2..29df75a6bcc 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1197,7 +1197,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); - rt_cache_flush(0); + rt_cache_flush(dev_net(dev), 0); break; default: break; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 9de2514946c..2e667e2f90d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1348,7 +1348,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, dev_disable_lro(idev->dev); } rtnl_unlock(); - rt_cache_flush(0); + rt_cache_flush(net, 0); } } @@ -1362,9 +1362,10 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write, int *valp = ctl->data; int val = *valp; int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + struct net *net = ctl->extra2; if (write && *valp != val) - rt_cache_flush(0); + rt_cache_flush(net, 0); return ret; } @@ -1375,9 +1376,10 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, { int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp, newval, newlen); + struct net *net = table->extra2; if (ret == 1) - rt_cache_flush(0); + rt_cache_flush(net, 0); return ret; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 5ad01d63f83..65c1503f8cc 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -144,7 +144,7 @@ static void fib_flush(struct net *net) } if (flushed) - rt_cache_flush(-1); + rt_cache_flush(net, -1); } /* @@ -897,21 +897,22 @@ static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down_dev(dev, force)) fib_flush(dev_net(dev)); - rt_cache_flush(0); + rt_cache_flush(dev_net(dev), 0); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; + struct net_device *dev = ifa->ifa_dev->dev; switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(ifa->ifa_dev->dev); + fib_sync_up(dev); #endif - rt_cache_flush(-1); + rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: fib_del_ifaddr(ifa); @@ -919,9 +920,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. Disable IP. 
*/ - fib_disable_ip(ifa->ifa_dev->dev, 1); + fib_disable_ip(dev, 1); } else { - rt_cache_flush(-1); + rt_cache_flush(dev_net(dev), -1); } break; } @@ -949,14 +950,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif - rt_cache_flush(-1); + rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: fib_disable_ip(dev, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: - rt_cache_flush(0); + rt_cache_flush(dev_net(dev), 0); break; } return NOTIFY_DONE; diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index eeec4bf982b..c8cac6c7f88 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -472,7 +472,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); return 0; @@ -532,7 +532,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) if (new_f) fz->fz_nent++; - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -614,7 +614,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) write_unlock_bh(&fib_hash_lock); if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); fn_free_alias(fa, f); if (kill_fn) { fn_free_node(f); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 1fb56876be5..bc05de41308 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -260,7 +260,7 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) static void fib4_rule_flush_cache(void) { - rt_cache_flush(-1); + rt_cache_flush(&init_net, -1); } static struct fib_rules_ops fib4_rules_ops_template = { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 394db9c941a..d16ae4623be 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1271,7 +1271,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); @@ -1316,7 +1316,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) list_add_tail_rcu(&new_fa->fa_list, (fa ? 
&fa->fa_list : fa_head)); - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, 0); succeeded: @@ -1664,7 +1664,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) trie_leaf_remove(t, l); if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fe3a0223728..cedc366505b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -791,7 +791,7 @@ static void rt_cache_invalidate(void) * delay < 0 : invalidate cache (fast : entries will be deleted later) * delay >= 0 : invalidate & flush cache (can be long) */ -void rt_cache_flush(int delay) +void rt_cache_flush(struct net *net, int delay) { rt_cache_invalidate(); if (delay >= 0) @@ -2825,7 +2825,7 @@ done: void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(0); + rt_cache_flush(dev_net(in_dev->dev), 0); } #ifdef CONFIG_SYSCTL @@ -2837,7 +2837,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, { if (write) { proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - rt_cache_flush(flush_delay); + rt_cache_flush(&init_net, flush_delay); return 0; } @@ -2857,7 +2857,7 @@ static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, return -EINVAL; if (get_user(delay, (int __user *)newval)) return -EFAULT; - rt_cache_flush(delay); + rt_cache_flush(&init_net, delay); return 0; } -- cgit v1.2.3 From ae299fc051aa68ca6ef1807c37bb92d9b6ff817c Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 5 Jul 2008 19:01:28 -0700 Subject: net: add fib_rules_ops to flush_cache method This is required to pass namespace context into rt_cache_flush called from ->flush_cache. Signed-off-by: Denis V. Lunev Signed-off-by: David S. Miller --- net/ipv4/fib_rules.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index bc05de41308..6080d712082 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -258,9 +258,9 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) + nla_total_size(4); /* flow */ } -static void fib4_rule_flush_cache(void) +static void fib4_rule_flush_cache(struct fib_rules_ops *ops) { - rt_cache_flush(&init_net, -1); + rt_cache_flush(ops->fro_net, -1); } static struct fib_rules_ops fib4_rules_ops_template = { -- cgit v1.2.3 From 639e104facec20f64f2eb940851ae45e5f255e6b Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 5 Jul 2008 19:02:06 -0700 Subject: ipv4: remove static flush_delay variable flush delay is used as an external storage for net.ipv4.route.flush sysctl entry. It is write-only. The ctl_table->data for this entry is used once. Fix this case to point to the stack to remove global variable. Do this to avoid additional variable on struct net in the next patch. Possible race (as it was before) accessing this local variable is removed using flush_mutex. Signed-off-by: Denis V. Lunev Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cedc366505b..790de32cd7d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2829,14 +2829,20 @@ void ip_rt_multicast_event(struct in_device *in_dev) } #ifdef CONFIG_SYSCTL -static int flush_delay; - static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { + int flush_delay; + static DEFINE_MUTEX(flush_mutex); + + mutex_lock(&flush_mutex); + ctl->data = &flush_delay; proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + ctl->data = NULL; + mutex_unlock(&flush_mutex); + rt_cache_flush(&init_net, flush_delay); return 0; } @@ -2865,7 +2871,6 @@ ctl_table ipv4_route_table[] = { { .ctl_name = NET_IPV4_ROUTE_FLUSH, .procname = "flush", - .data = &flush_delay, .maxlen = sizeof(int), .mode = 0200, .proc_handler = &ipv4_sysctl_rtcache_flush, -- cgit v1.2.3 From 39a23e75087ce815abbddbd565b9a2e567ac47da Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 5 Jul 2008 19:02:33 -0700 Subject: netns: register net.ipv4.route.flush in each namespace Signed-off-by: Denis V. Lunev Signed-off-by: David S. Miller --- net/ipv4/route.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 790de32cd7d..6fe799de9b9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2835,6 +2835,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, { if (write) { int flush_delay; + struct net *net; static DEFINE_MUTEX(flush_mutex); mutex_lock(&flush_mutex); @@ -2843,7 +2844,8 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, ctl->data = NULL; mutex_unlock(&flush_mutex); - rt_cache_flush(&init_net, flush_delay); + net = (struct net *)ctl->extra1; + rt_cache_flush(net, flush_delay); return 0; } @@ -2859,23 +2861,17 @@ static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, size_t newlen) { int delay; + struct net *net; if (newlen != sizeof(int)) return -EINVAL; if (get_user(delay, (int __user *)newval)) return -EFAULT; - rt_cache_flush(&init_net, delay); + net = (struct net *)table->extra1; + rt_cache_flush(net, delay); return 0; } ctl_table ipv4_route_table[] = { - { - .ctl_name = NET_IPV4_ROUTE_FLUSH, - .procname = "flush", - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = &ipv4_sysctl_rtcache_flush, - .strategy = &ipv4_sysctl_rtcache_flush_strategy, - }, { .ctl_name = NET_IPV4_ROUTE_GC_THRESH, .procname = "gc_thresh", @@ -3014,6 +3010,66 @@ ctl_table ipv4_route_table[] = { }, { .ctl_name = 0 } }; + +static __net_initdata struct ctl_path ipv4_route_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "route", .ctl_name = NET_IPV4_ROUTE, }, + { }, +}; + + +static struct ctl_table ipv4_route_flush_table[] = { + { + .ctl_name = NET_IPV4_ROUTE_FLUSH, + .procname = "flush", + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &ipv4_sysctl_rtcache_flush, + .strategy = &ipv4_sysctl_rtcache_flush_strategy, + }, + { .ctl_name = 0 }, +}; + +static __net_init int sysctl_route_net_init(struct net *net) +{ + struct ctl_table *tbl; + + tbl = ipv4_route_flush_table; + if (net != &init_net) { + tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); + if (tbl == NULL) + goto err_dup; + } + tbl[0].extra1 = 
net; + + net->ipv4.route_hdr = + register_net_sysctl_table(net, ipv4_route_path, tbl); + if (net->ipv4.route_hdr == NULL) + goto err_reg; + return 0; + +err_reg: + if (tbl != ipv4_route_flush_table) + kfree(tbl); +err_dup: + return -ENOMEM; +} + +static __net_exit void sysctl_route_net_exit(struct net *net) +{ + struct ctl_table *tbl; + + tbl = net->ipv4.route_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.route_hdr); + BUG_ON(tbl == ipv4_route_flush_table); + kfree(tbl); +} + +static __net_initdata struct pernet_operations sysctl_route_ops = { + .init = sysctl_route_net_init, + .exit = sysctl_route_net_exit, +}; #endif #ifdef CONFIG_NET_CLS_ROUTE @@ -3090,6 +3146,9 @@ int __init ip_rt_init(void) #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); +#ifdef CONFIG_SYSCTL + register_pernet_subsys(&sysctl_route_ops); +#endif return rc; } -- cgit v1.2.3 From 9f5e97e53675caeda48e9988122a30470f4d309d Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 5 Jul 2008 19:02:59 -0700 Subject: netns: make rt_secret_rebuild timer per namespace Signed-off-by: Denis V. Lunev Signed-off-by: David S. Miller --- net/ipv4/route.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6fe799de9b9..f99d9dbb772 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -132,7 +132,6 @@ static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; static void rt_worker_func(struct work_struct *work); static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); -static struct timer_list rt_secret_timer; /* * Interface to generic destination cache. @@ -801,10 +800,11 @@ void rt_cache_flush(struct net *net, int delay) /* * We change rt_genid and let gc do the cleanup */ -static void rt_secret_rebuild(unsigned long dummy) +static void rt_secret_rebuild(unsigned long __net) { + struct net *net = (struct net *)__net; rt_cache_invalidate(); - mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval); + mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); } /* @@ -3072,6 +3072,31 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { }; #endif + +static __net_init int rt_secret_timer_init(struct net *net) +{ + net->ipv4.rt_secret_timer.function = rt_secret_rebuild; + net->ipv4.rt_secret_timer.data = (unsigned long)net; + init_timer_deferrable(&net->ipv4.rt_secret_timer); + + net->ipv4.rt_secret_timer.expires = + jiffies + net_random() % ip_rt_secret_interval + + ip_rt_secret_interval; + add_timer(&net->ipv4.rt_secret_timer); + return 0; +} + +static __net_exit void rt_secret_timer_exit(struct net *net) +{ + del_timer_sync(&net->ipv4.rt_secret_timer); +} + +static __net_initdata struct pernet_operations rt_secret_timer_ops = { + .init = rt_secret_timer_init, + .exit = rt_secret_timer_exit, +}; + + #ifdef CONFIG_NET_CLS_ROUTE struct ip_rt_acct *ip_rt_acct __read_mostly; #endif /* CONFIG_NET_CLS_ROUTE */ @@ -3124,19 +3149,14 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - rt_secret_timer.function = rt_secret_rebuild; - rt_secret_timer.data = 0; - init_timer_deferrable(&rt_secret_timer); - /* All the timers, started at system startup tend to synchronize. Perturb it a bit. 
*/ schedule_delayed_work(&expires_work, net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + - ip_rt_secret_interval; - add_timer(&rt_secret_timer); + if (register_pernet_subsys(&rt_secret_timer_ops)) + printk(KERN_ERR "Unable to setup rt_secret_timer\n"); if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); -- cgit v1.2.3 From 86c657f6b5bbf2f3ec2213eca528998134a9b344 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 5 Jul 2008 19:03:31 -0700 Subject: netns: add struct net parameter to rt_cache_invalidate Signed-off-by: Denis V. Lunev Signed-off-by: David S. Miller --- net/ipv4/route.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f99d9dbb772..9725223ffe9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -778,7 +778,7 @@ static void rt_worker_func(struct work_struct *work) * many times (2^24) without giving recent rt_genid. * Jenkins hash is strong enough that litle changes of rt_genid are OK. */ -static void rt_cache_invalidate(void) +static void rt_cache_invalidate(struct net *net) { unsigned char shuffle; @@ -792,7 +792,7 @@ static void rt_cache_invalidate(void) */ void rt_cache_flush(struct net *net, int delay) { - rt_cache_invalidate(); + rt_cache_invalidate(net); if (delay >= 0) rt_do_flush(!in_softirq()); } @@ -803,7 +803,7 @@ void rt_cache_flush(struct net *net, int delay) static void rt_secret_rebuild(unsigned long __net) { struct net *net = (struct net *)__net; - rt_cache_invalidate(); + rt_cache_invalidate(net); mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); } -- cgit v1.2.3 From b00180defdeeac8e07e3dc02e53e7395d42bbd19 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 5 Jul 2008 19:04:09 -0700 Subject: ipv4: pass current value of rt_genid into rt_hash Basically, there is no difference to atomic_read internally or pass it as a parameter as rt_hash is inline. Signed-off-by: Denis V. Lunev Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9725223ffe9..e4e37edbad6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -256,11 +256,12 @@ static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) \ (__raw_get_cpu_var(rt_cache_stat).field++) -static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx) +static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, + int genid) { return jhash_3words((__force u32)(__be32)(daddr), (__force u32)(__be32)(saddr), - idx, atomic_read(&rt_genid)) + idx, genid) & rt_hash_mask; } @@ -1180,7 +1181,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, for (i = 0; i < 2; i++) { for (k = 0; k < 2; k++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]); + unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], + atomic_read(&rt_genid)); rthp=&rt_hash_table[hash].chain; @@ -1295,7 +1297,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) } else if ((rt->rt_flags & RTCF_REDIRECTED) || rt->u.dst.expires) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, - rt->fl.oif); + rt->fl.oif, + atomic_read(&rt_genid)); #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ipv4_negative_advice: redirect to " NIPQUAD_FMT "/%02x dropped\n", @@ -1444,7 +1447,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, for (k = 0; k < 2; k++) { for (i = 0; i < 2; i++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]); + unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], + atomic_read(&rt_genid)); rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -1709,7 +1713,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, RT_CACHE_STAT_INC(in_slow_mc); in_dev_put(in_dev); - hash = rt_hash(daddr, saddr, dev->ifindex); + hash = rt_hash(daddr, saddr, dev->ifindex, atomic_read(&rt_genid)); return rt_intern_hash(hash, rth, &skb->rtable); e_nobufs: @@ -1870,7 +1874,7 @@ static int ip_mkroute_input(struct sk_buff *skb, return err; /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl->iif); + hash = rt_hash(daddr, saddr, fl->iif, atomic_read(&rt_genid)); return rt_intern_hash(hash, rth, &skb->rtable); } @@ -2026,7 +2030,7 @@ local_input: rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; - hash = rt_hash(daddr, saddr, fl.iif); + hash = rt_hash(daddr, saddr, fl.iif, atomic_read(&rt_genid)); err = rt_intern_hash(hash, rth, &skb->rtable); goto done; @@ -2077,7 +2081,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, net = dev_net(dev); tos &= IPTOS_RT_MASK; - hash = rt_hash(daddr, saddr, iif); + hash = rt_hash(daddr, saddr, iif, atomic_read(&rt_genid)); rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -2266,7 +2270,8 @@ static int ip_mkroute_output(struct rtable **rp, int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); unsigned hash; if (err == 0) { - hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif); + hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, + atomic_read(&rt_genid)); err = rt_intern_hash(hash, rth, rp); } @@ -2478,7 +2483,8 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, unsigned hash; struct rtable *rth; - hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif); + hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, + atomic_read(&rt_genid)); 
rcu_read_lock_bh(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; -- cgit v1.2.3 From e84f84f276473dcc673f360e8ff3203148bdf0e2 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 5 Jul 2008 19:04:32 -0700 Subject: netns: place rt_genid into struct net Signed-off-by: Denis V. Lunev Signed-off-by: David S. Miller --- net/ipv4/route.c | 76 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 33 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e4e37edbad6..67c3ed772c2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -250,7 +250,6 @@ static inline void rt_hash_lock_init(void) static struct rt_hash_bucket *rt_hash_table __read_mostly; static unsigned rt_hash_mask __read_mostly; static unsigned int rt_hash_log __read_mostly; -static atomic_t rt_genid __read_mostly; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) \ @@ -265,6 +264,11 @@ static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, & rt_hash_mask; } +static inline int rt_genid(struct net *net) +{ + return atomic_read(&net->ipv4.rt_genid); +} + #ifdef CONFIG_PROC_FS struct rt_cache_iter_state { struct seq_net_private p; @@ -334,7 +338,7 @@ static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) struct rt_cache_iter_state *st = seq->private; if (*pos) return rt_cache_get_idx(seq, *pos - 1); - st->genid = atomic_read(&rt_genid); + st->genid = rt_genid(seq_file_net(seq)); return SEQ_START_TOKEN; } @@ -681,6 +685,11 @@ static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); } +static inline int rt_is_expired(struct rtable *rth) +{ + return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); +} + /* * Perform a full scan of hash table and free all entries. * Can be called by a softirq or a process. 
@@ -736,7 +745,7 @@ static void rt_check_expire(void) continue; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { - if (rth->rt_genid != atomic_read(&rt_genid)) { + if (rt_is_expired(rth)) { *rthp = rth->u.dst.rt_next; rt_free(rth); continue; @@ -784,7 +793,7 @@ static void rt_cache_invalidate(struct net *net) unsigned char shuffle; get_random_bytes(&shuffle, sizeof(shuffle)); - atomic_add(shuffle + 1U, &rt_genid); + atomic_add(shuffle + 1U, &net->ipv4.rt_genid); } /* @@ -881,7 +890,7 @@ static int rt_garbage_collect(struct dst_ops *ops) rthp = &rt_hash_table[k].chain; spin_lock_bh(rt_hash_lock_addr(k)); while ((rth = *rthp) != NULL) { - if (rth->rt_genid == atomic_read(&rt_genid) && + if (!rt_is_expired(rth) && !rt_may_expire(rth, tmo, expire)) { tmo >>= 1; rthp = &rth->u.dst.rt_next; @@ -963,7 +972,7 @@ restart: spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { - if (rth->rt_genid != atomic_read(&rt_genid)) { + if (rt_is_expired(rth)) { *rthp = rth->u.dst.rt_next; rt_free(rth); continue; @@ -1139,7 +1148,7 @@ static void rt_del(unsigned hash, struct rtable *rt) spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); while ((aux = *rthp) != NULL) { - if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) { + if (aux == rt || rt_is_expired(aux)) { *rthp = aux->u.dst.rt_next; rt_free(aux); continue; @@ -1182,7 +1191,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, for (i = 0; i < 2; i++) { for (k = 0; k < 2; k++) { unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - atomic_read(&rt_genid)); + rt_genid(net)); rthp=&rt_hash_table[hash].chain; @@ -1194,7 +1203,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rth->fl.fl4_src != skeys[i] || rth->fl.oif != ikeys[k] || rth->fl.iif != 0 || - rth->rt_genid != atomic_read(&rt_genid) || + rt_is_expired(rth) || !net_eq(dev_net(rth->u.dst.dev), net)) { rthp = &rth->u.dst.rt_next; continue; @@ -1233,7 +1242,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rt->u.dst.neighbour = NULL; rt->u.dst.hh = NULL; rt->u.dst.xfrm = NULL; - rt->rt_genid = atomic_read(&rt_genid); + rt->rt_genid = rt_genid(net); rt->rt_flags |= RTCF_REDIRECTED; /* Gateway is different ... 
*/ @@ -1298,7 +1307,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) rt->u.dst.expires) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, rt->fl.oif, - atomic_read(&rt_genid)); + rt_genid(dev_net(dst->dev))); #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ipv4_negative_advice: redirect to " NIPQUAD_FMT "/%02x dropped\n", @@ -1448,7 +1457,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, for (k = 0; k < 2; k++) { for (i = 0; i < 2; i++) { unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - atomic_read(&rt_genid)); + rt_genid(net)); rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -1463,7 +1472,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, rth->fl.iif != 0 || dst_metric_locked(&rth->u.dst, RTAX_MTU) || !net_eq(dev_net(rth->u.dst.dev), net) || - rth->rt_genid != atomic_read(&rt_genid)) + !rt_is_expired(rth)) continue; if (new_mtu < 68 || new_mtu >= old_mtu) { @@ -1698,7 +1707,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; if (our) { @@ -1713,7 +1722,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, RT_CACHE_STAT_INC(in_slow_mc); in_dev_put(in_dev); - hash = rt_hash(daddr, saddr, dev->ifindex, atomic_read(&rt_genid)); + hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); return rt_intern_hash(hash, rth, &skb->rtable); e_nobufs: @@ -1839,7 +1848,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); rt_set_nexthop(rth, res, itag); @@ -1874,7 +1883,8 @@ static int ip_mkroute_input(struct sk_buff *skb, return err; /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl->iif, atomic_read(&rt_genid)); + hash = rt_hash(daddr, saddr, fl->iif, + rt_genid(dev_net(rth->u.dst.dev))); return rt_intern_hash(hash, rth, &skb->rtable); } @@ -2000,7 +2010,7 @@ local_input: goto e_nobufs; rth->u.dst.output= ip_rt_bug; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(net); atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; @@ -2030,7 +2040,7 @@ local_input: rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; - hash = rt_hash(daddr, saddr, fl.iif, atomic_read(&rt_genid)); + hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); err = rt_intern_hash(hash, rth, &skb->rtable); goto done; @@ -2081,7 +2091,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, net = dev_net(dev); tos &= IPTOS_RT_MASK; - hash = rt_hash(daddr, saddr, iif, atomic_read(&rt_genid)); + hash = rt_hash(daddr, saddr, iif, rt_genid(net)); rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -2093,7 +2103,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, (rth->fl.fl4_tos ^ tos)) == 0 && rth->fl.mark == skb->mark && net_eq(dev_net(rth->u.dst.dev), net) && - rth->rt_genid == atomic_read(&rt_genid)) { + !rt_is_expired(rth)) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); @@ -2221,7 +2231,7 @@ static int __mkroute_output(struct rtable **result, rth->rt_spec_dst= fl->fl4_src; rth->u.dst.output=ip_output; - rth->rt_genid = atomic_read(&rt_genid); + 
rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); @@ -2271,7 +2281,7 @@ static int ip_mkroute_output(struct rtable **rp, unsigned hash; if (err == 0) { hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, - atomic_read(&rt_genid)); + rt_genid(dev_net(dev_out))); err = rt_intern_hash(hash, rth, rp); } @@ -2483,8 +2493,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, unsigned hash; struct rtable *rth; - hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, - atomic_read(&rt_genid)); + hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); rcu_read_lock_bh(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -2497,7 +2506,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->u.dst.dev), net) && - rth->rt_genid == atomic_read(&rt_genid)) { + !rt_is_expired(rth)) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); @@ -2528,7 +2537,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { }; -static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp) +static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) { struct rtable *ort = *rp; struct rtable *rt = (struct rtable *) @@ -2552,7 +2561,7 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp) rt->idev = ort->idev; if (rt->idev) in_dev_hold(rt->idev); - rt->rt_genid = atomic_read(&rt_genid); + rt->rt_genid = rt_genid(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_dst = ort->rt_dst; @@ -2588,7 +2597,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags ? XFRM_LOOKUP_WAIT : 0); if (err == -EREMOTE) - err = ipv4_dst_blackhole(rp, flp); + err = ipv4_dst_blackhole(net, rp, flp); return err; } @@ -2807,7 +2816,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) rt = rcu_dereference(rt->u.dst.rt_next), idx++) { if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) continue; - if (rt->rt_genid != atomic_read(&rt_genid)) + if (rt_is_expired(rt)) continue; skb->dst = dst_clone(&rt->u.dst); if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, @@ -3081,6 +3090,10 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { static __net_init int rt_secret_timer_init(struct net *net) { + atomic_set(&net->ipv4.rt_genid, + (int) ((num_physpages ^ (num_physpages>>8)) ^ + (jiffies ^ (jiffies >> 7)))); + net->ipv4.rt_secret_timer.function = rt_secret_rebuild; net->ipv4.rt_secret_timer.data = (unsigned long)net; init_timer_deferrable(&net->ipv4.rt_secret_timer); @@ -3121,9 +3134,6 @@ int __init ip_rt_init(void) { int rc = 0; - atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^ - (jiffies ^ (jiffies >> 7)))); - #ifdef CONFIG_NET_CLS_ROUTE ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); if (!ip_rt_acct) -- cgit v1.2.3 From 32cb5b4e035e3d7b52f1e9de87920645a00e5234 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 5 Jul 2008 19:06:12 -0700 Subject: netns: selective flush of rt_cache dst cache is marked as expired on the per/namespace basis by previous path. Right now we have to implement selective cache shrinking. This procedure has been ported from older OpenVz codebase. Signed-off-by: Denis V. Lunev Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 67c3ed772c2..113cd2512ba 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -699,6 +699,7 @@ static void rt_do_flush(int process_context) { unsigned int i; struct rtable *rth, *next; + struct rtable * tail; for (i = 0; i <= rt_hash_mask; i++) { if (process_context && need_resched()) @@ -708,11 +709,39 @@ static void rt_do_flush(int process_context) continue; spin_lock_bh(rt_hash_lock_addr(i)); +#ifdef CONFIG_NET_NS + { + struct rtable ** prev, * p; + + rth = rt_hash_table[i].chain; + + /* defer releasing the head of the list after spin_unlock */ + for (tail = rth; tail; tail = tail->u.dst.rt_next) + if (!rt_is_expired(tail)) + break; + if (rth != tail) + rt_hash_table[i].chain = tail; + + /* call rt_free on entries after the tail requiring flush */ + prev = &rt_hash_table[i].chain; + for (p = *prev; p; p = next) { + next = p->u.dst.rt_next; + if (!rt_is_expired(p)) { + prev = &p->u.dst.rt_next; + } else { + *prev = next; + rt_free(p); + } + } + } +#else rth = rt_hash_table[i].chain; rt_hash_table[i].chain = NULL; + tail = NULL; +#endif spin_unlock_bh(rt_hash_lock_addr(i)); - for (; rth; rth = next) { + for (; rth != tail; rth = next) { next = rth->u.dst.rt_next; rt_free(rth); } -- cgit v1.2.3 From 629ca23c331ec75ac87b016debbb3c4d2fe62650 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Sat, 5 Jul 2008 21:18:07 -0700 Subject: MIB: add struct net to UDP_INC_STATS_USER Nothing special - all the places already have a struct sock at hands, so use the sock_net() net. Signed-off-by: Pavel Emelyanov Acked-by: Denis V. Lunev Signed-off-by: David S. Miller --- net/ipv4/udp.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3bbf6fb6e4f..c97da5394d7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -526,7 +526,8 @@ out: up->len = 0; up->pending = 0; if (!err) - UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -725,7 +726,8 @@ out: * seems like overkill. */ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - UDP_INC_STATS_USER(UDP_MIB_SNDBUFERRORS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); } return err; @@ -888,7 +890,8 @@ try_again: goto out_free; if (!peeked) - UDP_INC_STATS_USER(UDP_MIB_INDATAGRAMS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); sock_recv_timestamp(msg, sk, skb); @@ -917,7 +920,7 @@ out: csum_copy_err: lock_sock(sk); if (!skb_kill_datagram(sk, skb, flags)) - UDP_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); release_sock(sk); if (noblock) -- cgit v1.2.3 From 0283328e2360bbf3081e97f1b720dd4c4dbae111 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Sat, 5 Jul 2008 21:18:48 -0700 Subject: MIB: add struct net to UDP_INC_STATS_BH Two special cases here - one is rxrpc - I put init_net there explicitly, since we haven't touched this part yet. The second place is in __udp4_lib_rcv - we already have a struct net there, but I have to move its initialization above to make it ready at the "drop" label. Signed-off-by: Pavel Emelyanov Acked-by: Denis V. Lunev Signed-off-by: David S. 
Miller --- net/ipv4/udp.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c97da5394d7..7187121e922 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -991,7 +991,8 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) ret = (*up->encap_rcv)(sk, skb); if (ret <= 0) { - UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); return -ret; } @@ -1044,7 +1045,8 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) { - UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, is_udplite); + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); atomic_inc(&sk->sk_drops); } goto drop; @@ -1053,7 +1055,7 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) return 0; drop: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); return -1; } @@ -1161,7 +1163,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], struct rtable *rt = (struct rtable*)skb->dst; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; - struct net *net; + struct net *net = dev_net(skb->dev); /* * Validate the packet. @@ -1183,7 +1185,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], if (udp4_csum_init(skb, uh, proto)) goto csum_error; - net = dev_net(skb->dev); if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable); @@ -1217,7 +1218,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], if (udp_lib_checksum_complete(skb)) goto csum_error; - UDP_INC_STATS_BH(UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* @@ -1251,7 +1252,7 @@ csum_error: ntohs(uh->dest), ulen); drop: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); return 0; } @@ -1458,7 +1459,8 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL && udp_lib_checksum_complete(skb)) { - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite); + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_INERRORS, is_lite); __skb_unlink(skb, rcvq); kfree_skb(skb); } -- cgit v1.2.3 From b11c16beb92112885edccc79e17d39c5d218f441 Mon Sep 17 00:00:00 2001 From: Russ Dill Date: Tue, 8 Jul 2008 02:35:27 -0700 Subject: netfilter: Get rid of refrences to no longer existant Fast NAT. Get rid of refrences to no longer existant Fast NAT. IP_ROUTE_NAT support was removed in August of 2004, but references to Fast NAT were left in a couple of config options. Signed-off-by: Russ Dill Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 6e251402506..f23e60c93ef 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -213,8 +213,7 @@ config IP_NF_TARGET_NETMAP help NETMAP is an implementation of static 1:1 NAT mapping of network addresses. 
It maps the network address part, while keeping the host - address part intact. It is similar to Fast NAT, except that - Netfilter's connection tracking doesn't work well with Fast NAT. + address part intact. To compile it as a module, choose M here. If unsure, say N. -- cgit v1.2.3 From 81c684d12ddc05bba4953e36e9cdd5939dde344b Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Tue, 8 Jul 2008 03:05:28 -0700 Subject: ipv4: remove flush_mutex from ipv4_sysctl_rtcache_flush It is possible to avoid locking at all in ipv4_sysctl_rtcache_flush by defining local ctl_table on the stack. The patch is based on the suggestion from Eric W. Biederman. Signed-off-by: Denis V. Lunev Signed-off-by: David S. Miller --- net/ipv4/route.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 113cd2512ba..79c1e74263a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2873,22 +2873,20 @@ void ip_rt_multicast_event(struct in_device *in_dev) } #ifdef CONFIG_SYSCTL -static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, +static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { int flush_delay; + ctl_table ctl; struct net *net; - static DEFINE_MUTEX(flush_mutex); - mutex_lock(&flush_mutex); - ctl->data = &flush_delay; - proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - ctl->data = NULL; - mutex_unlock(&flush_mutex); + memcpy(&ctl, __ctl, sizeof(ctl)); + ctl.data = &flush_delay; + proc_dointvec(&ctl, write, filp, buffer, lenp, ppos); - net = (struct net *)ctl->extra1; + net = (struct net *)__ctl->extra1; rt_cache_flush(net, flush_delay); return 0; } -- cgit v1.2.3 From d607032db0ccd7274bee348df3214f6f52b24816 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Mon, 14 Jul 2008 20:55:26 -0700 Subject: ipv4: Check return of dev_set_allmulti allmulti might overflow. Commit: "netdevice: Fix promiscuity and allmulti overflow" in net-next makes dev_set_promiscuity/allmulti return error number if overflow happened. Here, we check the positive increment for allmulti to get error return. PS: For unwinding tunnel creating, we let ipip->ioctl() to handle it. Signed-off-by: Wang Chen Signed-off-by: David S. 
Miller --- net/ipv4/ipmr.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 438fab9c62a..2f4d8afd067 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -118,6 +118,31 @@ static struct timer_list ipmr_expire_timer; /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ +static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) +{ + dev_close(dev); + + dev = __dev_get_by_name(&init_net, "tunl0"); + if (dev) { + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + dev->do_ioctl(dev, &ifr, SIOCDELTUNNEL); + set_fs(oldfs); + } +} + static struct net_device *ipmr_new_tunnel(struct vifctl *v) { @@ -389,6 +414,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) struct vif_device *v = &vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; + int err; /* Is vif busy ? */ if (VIF_EXISTS(vifi)) @@ -406,18 +432,31 @@ static int vif_add(struct vifctl *vifc, int mrtsock) dev = ipmr_reg_vif(); if (!dev) return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + unregister_netdevice(dev); + return err; + } break; #endif case VIFF_TUNNEL: dev = ipmr_new_tunnel(vifc); if (!dev) return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + ipmr_del_tunnel(dev, vifc); + return err; + } break; case 0: dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr); if (!dev) return -EADDRNOTAVAIL; dev_put(dev); + err = dev_set_allmulti(dev, 1); + if (err) + return err; break; default: return -EINVAL; @@ -426,7 +465,6 @@ static int vif_add(struct vifctl *vifc, int mrtsock) if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) return -EADDRNOTAVAIL; IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; - dev_set_allmulti(dev, +1); ip_rt_multicast_event(in_dev); /* -- cgit v1.2.3 From 7dc00c82cbb0119cf4663f65bbaa2cc55f961db2 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Mon, 14 Jul 2008 20:56:34 -0700 Subject: ipv4: Fix ipmr unregister device oops An oops happens during device unregister. The following oops happened when I add two tunnels, which use a same device, and then delete one tunnel. Obviously deleting tunnel "A" causes device unregister, which send a notification, and after receiving notification, ipmr do unregister again for tunnel "B" which also use same device. That is wrong. After receiving notification, ipmr only needs to decrease reference count and don't do duplicated unregister. Fortunately, IPv6 side doesn't add tunnel in ip6mr, so it's clean. This patch fixs: - unregister device oops - using after dev_put() Here is the oops: === Jul 11 15:39:29 wangchen kernel: ------------[ cut here ]------------ Jul 11 15:39:29 wangchen kernel: kernel BUG at net/core/dev.c:3651! 
Jul 11 15:39:29 wangchen kernel: invalid opcode: 0000 [#1] Jul 11 15:39:29 wangchen kernel: Modules linked in: ipip tunnel4 nfsd lockd nfs_acl auth_rpcgss sunrpc exportfs ipv6 snd_pcm_oss snd_mixer_oss snd_seq snd_seq_device af_packet binfmt_misc button battery ac loop dm_mod usbhid ff_memless pcmcia firmware_class ohci1394 8139too mii ieee1394 yenta_socket rsrc_nonstatic pcmcia_core ide_cd_mod cdrom snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm i2c_i801 snd_timer snd i2c_core soundcore snd_page_alloc rng_core shpchp ehci_hcd uhci_hcd pci_hotplug intel_agp agpgart usbcore ext3 jbd ata_piix ahci libata dock edd fan thermal processor thermal_sys piix sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table] Jul 11 15:39:29 wangchen kernel: Jul 11 15:39:29 wangchen kernel: Pid: 4102, comm: mroute Not tainted (2.6.26-rc9-default #69) Jul 11 15:39:29 wangchen kernel: EIP: 0060:[] EFLAGS: 00010202 CPU: 0 Jul 11 15:39:29 wangchen kernel: EIP is at rollback_registered+0x61/0xe3 Jul 11 15:39:29 wangchen kernel: EAX: 00000001 EBX: ecba6000 ECX: 00000000 EDX: ffffffff Jul 11 15:39:29 wangchen kernel: ESI: 00000001 EDI: ecba6000 EBP: c03de2e8 ESP: ed8e7c3c Jul 11 15:39:29 wangchen kernel: DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 Jul 11 15:39:29 wangchen kernel: Process mroute (pid: 4102, ti=ed8e6000 task=ed41e830 task.ti=ed8e6000) Jul 11 15:39:29 wangchen kernel: Stack: ecba6000 c024641c 00000028 c0284e1a 00000001 c03de2e8 ecba6000 eecff360 Jul 11 15:39:29 wangchen kernel: c0284e4c c03536f4 fffffff8 00000000 c029a819 ecba6000 00000006 ecba6000 Jul 11 15:39:29 wangchen kernel: 00000000 ecba6000 c03de2c0 c012841b ffffffff 00000000 c024639f ecba6000 Jul 11 15:39:29 wangchen kernel: Call Trace: Jul 11 15:39:29 wangchen kernel: [] unregister_netdevice+0x2f/0x51 Jul 11 15:39:29 wangchen kernel: [] vif_delete+0xaf/0xc3 Jul 11 15:39:29 wangchen kernel: [] ipmr_device_event+0x1e/0x30 Jul 11 15:39:29 wangchen kernel: [] notifier_call_chain+0x2a/0x47 Jul 11 15:39:29 wangchen kernel: [] raw_notifier_call_chain+0x9/0xc Jul 11 15:39:29 wangchen kernel: [] rollback_registered+0x95/0xe3 Jul 11 15:39:29 wangchen kernel: [] unregister_netdevice+0x2f/0x51 Jul 11 15:39:29 wangchen kernel: [] vif_delete+0xaf/0xc3 Jul 11 15:39:29 wangchen kernel: [] ip_mroute_setsockopt+0x47a/0x801 Jul 11 15:39:29 wangchen kernel: [] do_get_write_access+0x2df/0x313 [jbd] Jul 11 15:39:29 wangchen kernel: [] __find_get_block_slow+0xda/0xe4 Jul 11 15:39:29 wangchen kernel: [] __find_get_block+0xf8/0x122 Jul 11 15:39:29 wangchen kernel: [] __find_get_block+0xf8/0x122 Jul 11 15:39:29 wangchen kernel: [] journal_cancel_revoke+0xda/0x110 [jbd] Jul 11 15:39:29 wangchen kernel: [] ip_setsockopt+0xa9/0x9ee Jul 11 15:39:29 wangchen kernel: [] journal_cancel_revoke+0xda/0x110 [jbd] Jul 11 15:39:29 wangchen kernel: [] do_get_write_access+0x2df/0x313 [jbd] Jul 11 15:39:29 wangchen kernel: [] __ext3_get_inode_loc+0xcf/0x271 [ext3] Jul 11 15:39:29 wangchen kernel: [] __ext3_journal_dirty_metadata+0x13/0x32 [ext3] Jul 11 15:39:29 wangchen kernel: [] __wake_up+0xf/0x15 Jul 11 15:39:29 wangchen kernel: [] journal_stop+0x1bd/0x1c6 [jbd] Jul 11 15:39:29 wangchen kernel: [] __ext3_journal_stop+0x19/0x34 [ext3] Jul 11 15:39:29 wangchen kernel: [] get_page_from_freelist+0x94/0x369 Jul 11 15:39:29 wangchen kernel: [] filemap_fault+0x1ac/0x2fe Jul 11 15:39:29 wangchen kernel: [] security_sk_alloc+0xd/0xf Jul 11 15:39:29 wangchen kernel: [] sk_prot_alloc+0x36/0x78 Jul 11 15:39:29 wangchen kernel: [] sk_alloc+0x3a/0x40 Jul 11 15:39:29 wangchen kernel: [] 
raw_hash_sk+0x46/0x4e Jul 11 15:39:29 wangchen kernel: [] d_alloc+0x1b/0x157 Jul 11 15:39:29 wangchen kernel: [] sock_common_setsockopt+0x12/0x16 Jul 11 15:39:29 wangchen kernel: [] sys_setsockopt+0x6f/0x8e Jul 11 15:39:29 wangchen kernel: [] sys_socketcall+0x15c/0x19e Jul 11 15:39:29 wangchen kernel: [] sysenter_past_esp+0x6a/0x99 Jul 11 15:39:29 wangchen kernel: [] unix_poll+0x69/0x78 Jul 11 15:39:29 wangchen kernel: ======================= Jul 11 15:39:29 wangchen kernel: Code: 83 e0 01 00 00 85 c0 75 1f 53 53 68 12 81 31 c0 e8 3c 30 ed ff ba 3f 0e 00 00 b8 b9 7f 31 c0 83 c4 0c 5b e9 f5 26 ed ff 48 74 04 <0f> 0b eb fe 89 d8 e8 21 ff ff ff 89 d8 e8 62 ea ff ff c7 83 e0 Jul 11 15:39:29 wangchen kernel: EIP: [] rollback_registered+0x61/0xe3 SS:ESP 0068:ed8e7c3c Jul 11 15:39:29 wangchen kernel: ---[ end trace c311acf85d169786 ]--- === Signed-off-by: Wang Chen Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2f4d8afd067..c9ab47b966b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -184,6 +184,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) if (dev_open(dev)) goto failure; + dev_hold(dev); } } return dev; @@ -250,6 +251,8 @@ static struct net_device *ipmr_reg_vif(void) if (dev_open(dev)) goto failure; + dev_hold(dev); + return dev; failure: @@ -264,9 +267,10 @@ failure: /* * Delete a VIF entry + * @notify: Set to 1, if the caller is a notifier_call */ -static int vif_delete(int vifi) +static int vif_delete(int vifi, int notify) { struct vif_device *v; struct net_device *dev; @@ -309,7 +313,7 @@ static int vif_delete(int vifi) ip_rt_multicast_event(in_dev); } - if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) + if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) unregister_netdevice(dev); dev_put(dev); @@ -435,6 +439,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); + dev_put(dev); return err; } break; @@ -446,6 +451,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) err = dev_set_allmulti(dev, 1); if (err) { ipmr_del_tunnel(dev, vifc); + dev_put(dev); return err; } break; @@ -453,10 +459,11 @@ static int vif_add(struct vifctl *vifc, int mrtsock) dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr); if (!dev) return -EADDRNOTAVAIL; - dev_put(dev); err = dev_set_allmulti(dev, 1); - if (err) + if (err) { + dev_put(dev); return err; + } break; default: return -EINVAL; @@ -487,7 +494,6 @@ static int vif_add(struct vifctl *vifc, int mrtsock) /* And finish update writing critical data */ write_lock_bh(&mrt_lock); - dev_hold(dev); v->dev=dev; #ifdef CONFIG_IP_PIMSM if (v->flags&VIFF_REGISTER) @@ -834,7 +840,7 @@ static void mroute_clean_tables(struct sock *sk) */ for (i=0; idev==dev) - vif_delete(ct); + vif_delete(ct, 1); } return NOTIFY_DONE; } -- cgit v1.2.3 From 0388b0042624714e6f8db8cc7994101a0a02d392 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 14 Jul 2008 23:00:43 -0700 Subject: icmp: add struct net argument to icmp_out_count This routine deals with ICMP statistics, but doesn't have a struct net at hands, so add one. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/icmp.c | 2 +- net/ipv4/ip_output.c | 3 ++- net/ipv4/raw.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index aa7cf46853b..1ffe7add492 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -296,7 +296,7 @@ out: /* * Maintain the counters used in the SNMP statistics for outgoing ICMP */ -void icmp_out_count(unsigned char type) +void icmp_out_count(struct net *net, unsigned char type) { ICMPMSGOUT_INC_STATS(type); ICMP_INC_STATS(ICMP_MIB_OUTMSGS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f1278eecf56..f003186b93b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1211,6 +1211,7 @@ int ip_push_pending_frames(struct sock *sk) struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options *opt = NULL; struct rtable *rt = (struct rtable *)inet->cork.dst; struct iphdr *iph; @@ -1280,7 +1281,7 @@ int ip_push_pending_frames(struct sock *sk) skb->dst = dst_clone(&rt->u.dst); if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(((struct icmphdr *) + icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); /* Netfilter gets whole the not fragmented skb. */ diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 925fdf18cf9..7f39ea443ec 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -320,6 +320,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct iphdr *iph; struct sk_buff *skb; unsigned int iphlen; @@ -368,7 +369,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(((struct icmphdr *) + icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, -- cgit v1.2.3 From fd54d716b1f6a3551ec17a4bb34027727b2db09a Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 14 Jul 2008 23:01:40 -0700 Subject: inet: toss struct net initialization around Some places, that deal with ICMP statistics already have where to get a struct net from, but use it directly, without declaring a separate variable on the stack. Since I will need this net soon, I declare a struct net on the stack and use it in the existing places in a separate patch not to spoil the future ones. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 4 +--- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/udp.c | 3 ++- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1ffe7add492..56d6b943345 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -973,6 +973,7 @@ int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; struct rtable *rt = skb->rtable; + struct net *net = dev_net(rt->u.dst.dev); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { int nh; @@ -1027,9 +1028,6 @@ int icmp_rcv(struct sk_buff *skb) */ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - struct net *net; - - net = dev_net(rt->u.dst.dev); /* * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be * silently ignored (we let user decide with a sysctl). 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4300bcf2cea..ca41b77f3f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -343,13 +343,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) struct sock *sk; __u32 seq; int err; + struct net *net = dev_net(skb->dev); if (skb->len < (iph->ihl << 2) + 8) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; } - sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest, + sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(skb)); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7187121e922..9342cfda3d0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -354,8 +354,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[]) struct sock *sk; int harderr; int err; + struct net *net = dev_net(skb->dev); - sk = __udp4_lib_lookup(dev_net(skb->dev), iph->daddr, uh->dest, + sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, udptable); if (sk == NULL) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); -- cgit v1.2.3 From 75c939bb4d6da790f758a2a3dcc7432f6d8778ee Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 14 Jul 2008 23:02:35 -0700 Subject: mib: add struct net to ICMP_INC_STATS Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 56d6b943345..e94de411cfa 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -299,7 +299,7 @@ out: void icmp_out_count(struct net *net, unsigned char type) { ICMPMSGOUT_INC_STATS(type); - ICMP_INC_STATS(ICMP_MIB_OUTMSGS); + ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS); } /* -- cgit v1.2.3 From dcfc23cac103b54dbc00a6f52f47656ad5c75844 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 14 Jul 2008 23:03:00 -0700 Subject: mib: add struct net to ICMP_INC_STATS_BH Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/icmp.c | 10 +++++----- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/udp.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e94de411cfa..56a7bbc7950 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -763,7 +763,7 @@ static void icmp_unreach(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); goto out; } @@ -803,7 +803,7 @@ static void icmp_redirect(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); goto out; } @@ -874,7 +874,7 @@ static void icmp_timestamp(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(dev_net(skb->dst->dev), ICMP_MIB_INERRORS); goto out; } @@ -994,7 +994,7 @@ int icmp_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - ICMP_INC_STATS_BH(ICMP_MIB_INMSGS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); switch (skb->ip_summed) { case CHECKSUM_COMPLETE: @@ -1053,7 +1053,7 @@ drop: kfree_skb(skb); return 0; error: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); goto drop; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ca41b77f3f3..0fefd440941 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -346,14 +346,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) struct net *net = dev_net(skb->dev); if (skb->len < (iph->ihl << 2) + 8) { - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(skb)); if (!sk) { - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9342cfda3d0..fdd4faa1f12 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -359,7 +359,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[]) sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, udptable); if (sk == NULL) { - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; /* No socket for error */ } -- cgit v1.2.3 From 903fc1964e746b8d8e2971ea20c89b7aeab8bd9a Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 14 Jul 2008 23:03:35 -0700 Subject: mib: add struct net to ICMPMSGOUT_INC_STATS Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 56a7bbc7950..6f4c65b898b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -298,7 +298,7 @@ out: */ void icmp_out_count(struct net *net, unsigned char type) { - ICMPMSGOUT_INC_STATS(type); + ICMPMSGOUT_INC_STATS(net, type); ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS); } -- cgit v1.2.3 From f66ac03d497c162c70cd0ccc802ce1777073cdf3 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 14 Jul 2008 23:04:00 -0700 Subject: mib: add struct net to ICMPMSGIN_INC_STATS_BH Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 6f4c65b898b..ea60ad41008 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1012,7 +1012,7 @@ int icmp_rcv(struct sk_buff *skb) icmph = icmp_hdr(skb); - ICMPMSGIN_INC_STATS_BH(icmph->type); + ICMPMSGIN_INC_STATS_BH(net, icmph->type); /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * -- cgit v1.2.3 From d56400504a40a4aa197af629300d76544169e821 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 16 Jul 2008 11:13:35 +0000 Subject: ipvs: Initialize mcast addr at compile time There's no need to do it at runtime, the values are constant. Signed-off-by: Sven Wegener Acked-by: Simon Horman --- net/ipv4/ipvs/ip_vs_sync.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 2d4a86f7332..8c900dfe832 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -139,7 +139,11 @@ char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; /* multicast addr */ -static struct sockaddr_in mcast_addr; +static struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = __constant_htons(IP_VS_SYNC_PORT), + .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), +}; static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) @@ -862,11 +866,6 @@ static int sync_thread(void *startup) /* set the maximum length of sync message */ set_sync_mesg_maxlen(state); - /* set up multicast address */ - mcast_addr.sin_family = AF_INET; - mcast_addr.sin_port = htons(IP_VS_SYNC_PORT); - mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP); - add_wait_queue(&sync_wait, &wait); set_sync_pid(state, task_pid_nr(current)); -- cgit v1.2.3 From e6dd731c75cba986a485924f908e6e05b088ea9e Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 16 Jul 2008 11:13:43 +0000 Subject: ipvs: Use ERR_PTR for returning errors from make_receive_sock() and make_send_sock() The additional information we now return to the caller is currently not used, but will be used to return errors to user space. 
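The ERR_PTR convention that this patch introduces encodes a negative errno value directly in the returned pointer, so the constructor no longer collapses every failure into NULL and the caller can recover the exact error. A minimal sketch of the pattern is shown below; the function names are invented for illustration and error handling is reduced to the essentials, so this is not the actual ip_vs_sync.c code.

#include <linux/err.h>          /* ERR_PTR, IS_ERR, PTR_ERR */
#include <linux/in.h>           /* IPPROTO_UDP */
#include <linux/net.h>          /* struct socket, sock_create_kern, sock_release */

/* Hypothetical constructor: return a live socket, or the errno encoded as a pointer. */
static struct socket *make_example_sock(void)
{
	struct socket *sock;
	int result;

	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
	if (result < 0)
		return ERR_PTR(result);         /* e.g. ERR_PTR(-ENOMEM) */

	return sock;
}

/* Hypothetical caller: the real error code can be handed back to user space. */
static int example_start(void)
{
	struct socket *sock = make_example_sock();

	if (IS_ERR(sock))
		return PTR_ERR(sock);

	/* ... use the socket ... */
	sock_release(sock);
	return 0;
}
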
Signed-off-by: Sven Wegener Acked-by: Simon Horman --- net/ipv4/ipvs/ip_vs_sync.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 8c900dfe832..60b96823c9a 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -27,6 +27,7 @@ #include #include /* for ip_mc_join_group */ #include +#include #include #include @@ -576,14 +577,17 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) static struct socket * make_send_sock(void) { struct socket *sock; + int result; /* First create a socket */ - if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { IP_VS_ERR("Error during creation of socket; terminating\n"); - return NULL; + return ERR_PTR(result); } - if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) { + result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); + if (result < 0) { IP_VS_ERR("Error setting outbound mcast interface\n"); goto error; } @@ -591,14 +595,15 @@ static struct socket * make_send_sock(void) set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); - if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) { + result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); + if (result < 0) { IP_VS_ERR("Error binding address of the mcast interface\n"); goto error; } - if (sock->ops->connect(sock, - (struct sockaddr*)&mcast_addr, - sizeof(struct sockaddr), 0) < 0) { + result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr), 0); + if (result < 0) { IP_VS_ERR("Error connecting to the multicast addr\n"); goto error; } @@ -607,7 +612,7 @@ static struct socket * make_send_sock(void) error: sock_release(sock); - return NULL; + return ERR_PTR(result); } @@ -617,27 +622,30 @@ static struct socket * make_send_sock(void) static struct socket * make_receive_sock(void) { struct socket *sock; + int result; /* First create a socket */ - if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { IP_VS_ERR("Error during creation of socket; terminating\n"); - return NULL; + return ERR_PTR(result); } /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = 1; - if (sock->ops->bind(sock, - (struct sockaddr*)&mcast_addr, - sizeof(struct sockaddr)) < 0) { + result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr)); + if (result < 0) { IP_VS_ERR("Error binding to the multicast addr\n"); goto error; } /* join the multicast group */ - if (join_mcast_group(sock->sk, - (struct in_addr*)&mcast_addr.sin_addr, - ip_vs_backup_mcast_ifn) < 0) { + result = join_mcast_group(sock->sk, + (struct in_addr *) &mcast_addr.sin_addr, + ip_vs_backup_mcast_ifn); + if (result < 0) { IP_VS_ERR("Error joining to the multicast group\n"); goto error; } @@ -646,7 +654,7 @@ static struct socket * make_receive_sock(void) error: sock_release(sock); - return NULL; + return ERR_PTR(result); } @@ -719,7 +727,7 @@ static void sync_master_loop(void) /* create the sending multicast socket */ sock = make_send_sock(); - if (!sock) + if (IS_ERR(sock)) return; IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " @@ -772,7 +780,7 @@ static void sync_backup_loop(void) /* create the receiving multicast socket */ sock = make_receive_sock(); - if (!sock) 
+ if (IS_ERR(sock)) goto out; IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " -- cgit v1.2.3 From 998e7a76804b7a273a0460c2cdd5a51fa9856717 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 16 Jul 2008 11:13:50 +0000 Subject: ipvs: Use kthread_run() instead of doing a double-fork via kernel_thread() This also moves the setup code out of the daemons, so that we're able to return proper error codes to user space. The current code will return success to user space when the daemon is started with an invald mcast interface. With these changes we get an appropriate "No such device" error. We longer need our own completion to be sure the daemons are actually running, because they no longer contain code that can fail and kthread_run() takes care of the rest. Signed-off-by: Sven Wegener Acked-by: Simon Horman --- net/ipv4/ipvs/ip_vs_sync.c | 369 +++++++++++++++++---------------------------- 1 file changed, 136 insertions(+), 233 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 60b96823c9a..550563a5660 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -28,10 +28,10 @@ #include /* for ip_mc_join_group */ #include #include +#include #include #include -#include /* for get_fs and set_fs */ #include @@ -67,8 +67,8 @@ struct ip_vs_sync_conn_options { }; struct ip_vs_sync_thread_data { - struct completion *startup; - int state; + struct socket *sock; + char *buf; }; #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) @@ -139,6 +139,10 @@ volatile int ip_vs_backup_syncid = 0; char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +/* sync daemon tasks */ +static struct task_struct *sync_master_thread; +static struct task_struct *sync_backup_thread; + /* multicast addr */ static struct sockaddr_in mcast_addr = { .sin_family = AF_INET, @@ -147,14 +151,7 @@ static struct sockaddr_in mcast_addr = { }; -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) -{ - spin_lock(&ip_vs_sync_lock); - list_add_tail(&sb->list, &ip_vs_sync_queue); - spin_unlock(&ip_vs_sync_lock); -} - -static inline struct ip_vs_sync_buff * sb_dequeue(void) +static inline struct ip_vs_sync_buff *sb_dequeue(void) { struct ip_vs_sync_buff *sb; @@ -198,6 +195,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) kfree(sb); } +static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +{ + spin_lock(&ip_vs_sync_lock); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) + list_add_tail(&sb->list, &ip_vs_sync_queue); + else + ip_vs_sync_buff_release(sb); + spin_unlock(&ip_vs_sync_lock); +} + /* * Get the current sync buffer if it has been created for more * than the specified time or the specified time is zero. 
@@ -712,43 +719,28 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) } -static DECLARE_WAIT_QUEUE_HEAD(sync_wait); -static pid_t sync_master_pid = 0; -static pid_t sync_backup_pid = 0; - -static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait); -static int stop_master_sync = 0; -static int stop_backup_sync = 0; - -static void sync_master_loop(void) +static int sync_thread_master(void *data) { - struct socket *sock; + struct ip_vs_sync_thread_data *tinfo = data; struct ip_vs_sync_buff *sb; - /* create the sending multicast socket */ - sock = make_send_sock(); - if (IS_ERR(sock)) - return; - IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " "syncid = %d\n", ip_vs_master_mcast_ifn, ip_vs_master_syncid); - for (;;) { - while ((sb=sb_dequeue())) { - ip_vs_send_sync_msg(sock, sb->mesg); + while (!kthread_should_stop()) { + while ((sb = sb_dequeue())) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); ip_vs_sync_buff_release(sb); } /* check if entries stay in curr_sb for 2 seconds */ - if ((sb = get_curr_sync_buff(2*HZ))) { - ip_vs_send_sync_msg(sock, sb->mesg); + sb = get_curr_sync_buff(2 * HZ); + if (sb) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); ip_vs_sync_buff_release(sb); } - if (stop_master_sync) - break; - msleep_interruptible(1000); } @@ -763,262 +755,173 @@ static void sync_master_loop(void) } /* release the sending multicast socket */ - sock_release(sock); + sock_release(tinfo->sock); + kfree(tinfo); + + return 0; } -static void sync_backup_loop(void) +static int sync_thread_backup(void *data) { - struct socket *sock; - char *buf; + struct ip_vs_sync_thread_data *tinfo = data; int len; - if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) { - IP_VS_ERR("sync_backup_loop: kmalloc error\n"); - return; - } - - /* create the receiving multicast socket */ - sock = make_receive_sock(); - if (IS_ERR(sock)) - goto out; - IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " "syncid = %d\n", ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); - for (;;) { - /* do you have data now? */ - while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) { - if ((len = - ip_vs_receive(sock, buf, - sync_recv_mesg_maxlen)) <= 0) { + while (!kthread_should_stop()) { + /* do we have data now? 
*/ + while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { + len = ip_vs_receive(tinfo->sock, tinfo->buf, + sync_recv_mesg_maxlen); + if (len <= 0) { IP_VS_ERR("receiving message error\n"); break; } - /* disable bottom half, because it accessed the data + + /* disable bottom half, because it accesses the data shared by softirq while getting/creating conns */ local_bh_disable(); - ip_vs_process_message(buf, len); + ip_vs_process_message(tinfo->buf, len); local_bh_enable(); } - if (stop_backup_sync) - break; - msleep_interruptible(1000); } /* release the sending multicast socket */ - sock_release(sock); + sock_release(tinfo->sock); + kfree(tinfo->buf); + kfree(tinfo); - out: - kfree(buf); + return 0; } -static void set_sync_pid(int sync_state, pid_t sync_pid) -{ - if (sync_state == IP_VS_STATE_MASTER) - sync_master_pid = sync_pid; - else if (sync_state == IP_VS_STATE_BACKUP) - sync_backup_pid = sync_pid; -} - -static void set_stop_sync(int sync_state, int set) +int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) { - if (sync_state == IP_VS_STATE_MASTER) - stop_master_sync = set; - else if (sync_state == IP_VS_STATE_BACKUP) - stop_backup_sync = set; - else { - stop_master_sync = set; - stop_backup_sync = set; - } -} + struct ip_vs_sync_thread_data *tinfo; + struct task_struct **realtask, *task; + struct socket *sock; + char *name, *buf = NULL; + int (*threadfn)(void *data); + int result = -ENOMEM; -static int sync_thread(void *startup) -{ - DECLARE_WAITQUEUE(wait, current); - mm_segment_t oldmm; - int state; - const char *name; - struct ip_vs_sync_thread_data *tinfo = startup; + IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", + sizeof(struct ip_vs_sync_conn)); - /* increase the module use count */ - ip_vs_use_count_inc(); + if (state == IP_VS_STATE_MASTER) { + if (sync_master_thread) + return -EEXIST; - if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) { - state = IP_VS_STATE_MASTER; + strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, + sizeof(ip_vs_master_mcast_ifn)); + ip_vs_master_syncid = syncid; + realtask = &sync_master_thread; name = "ipvs_syncmaster"; - } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) { - state = IP_VS_STATE_BACKUP; + threadfn = sync_thread_master; + sock = make_send_sock(); + } else if (state == IP_VS_STATE_BACKUP) { + if (sync_backup_thread) + return -EEXIST; + + strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, + sizeof(ip_vs_backup_mcast_ifn)); + ip_vs_backup_syncid = syncid; + realtask = &sync_backup_thread; name = "ipvs_syncbackup"; + threadfn = sync_thread_backup; + sock = make_receive_sock(); } else { - IP_VS_BUG(); - ip_vs_use_count_dec(); return -EINVAL; } - daemonize(name); - - oldmm = get_fs(); - set_fs(KERNEL_DS); - - /* Block all signals */ - spin_lock_irq(¤t->sighand->siglock); - siginitsetinv(¤t->blocked, 0); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto out; + } - /* set the maximum length of sync message */ set_sync_mesg_maxlen(state); + if (state == IP_VS_STATE_BACKUP) { + buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); + if (!buf) + goto outsocket; + } - add_wait_queue(&sync_wait, &wait); - - set_sync_pid(state, task_pid_nr(current)); - complete(tinfo->startup); - - /* - * once we call the completion queue above, we should - * null out that reference, since its allocated on the - * stack of the creating kernel thread - */ - tinfo->startup = NULL; - - /* 
processing master/backup loop here */ - if (state == IP_VS_STATE_MASTER) - sync_master_loop(); - else if (state == IP_VS_STATE_BACKUP) - sync_backup_loop(); - else IP_VS_BUG(); - - remove_wait_queue(&sync_wait, &wait); - - /* thread exits */ - - /* - * If we weren't explicitly stopped, then we - * exited in error, and should undo our state - */ - if ((!stop_master_sync) && (!stop_backup_sync)) - ip_vs_sync_state -= tinfo->state; + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outbuf; - set_sync_pid(state, 0); - IP_VS_INFO("sync thread stopped!\n"); + tinfo->sock = sock; + tinfo->buf = buf; - set_fs(oldmm); + task = kthread_run(threadfn, tinfo, name); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } - /* decrease the module use count */ - ip_vs_use_count_dec(); + /* mark as active */ + *realtask = task; + ip_vs_sync_state |= state; - set_stop_sync(state, 0); - wake_up(&stop_sync_wait); + /* increase the module use count */ + ip_vs_use_count_inc(); - /* - * we need to free the structure that was allocated - * for us in start_sync_thread - */ - kfree(tinfo); return 0; -} - - -static int fork_sync_thread(void *startup) -{ - pid_t pid; - - /* fork the sync thread here, then the parent process of the - sync thread is the init process after this thread exits. */ - repeat: - if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) { - IP_VS_ERR("could not create sync_thread due to %d... " - "retrying.\n", pid); - msleep_interruptible(1000); - goto repeat; - } - return 0; +outtinfo: + kfree(tinfo); +outbuf: + kfree(buf); +outsocket: + sock_release(sock); +out: + return result; } -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +int stop_sync_thread(int state) { - DECLARE_COMPLETION_ONSTACK(startup); - pid_t pid; - struct ip_vs_sync_thread_data *tinfo; - - if ((state == IP_VS_STATE_MASTER && sync_master_pid) || - (state == IP_VS_STATE_BACKUP && sync_backup_pid)) - return -EEXIST; - - /* - * Note that tinfo will be freed in sync_thread on exit - */ - tinfo = kmalloc(sizeof(struct ip_vs_sync_thread_data), GFP_KERNEL); - if (!tinfo) - return -ENOMEM; - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n", - sizeof(struct ip_vs_sync_conn)); - ip_vs_sync_state |= state; if (state == IP_VS_STATE_MASTER) { - strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, - sizeof(ip_vs_master_mcast_ifn)); - ip_vs_master_syncid = syncid; - } else { - strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, - sizeof(ip_vs_backup_mcast_ifn)); - ip_vs_backup_syncid = syncid; - } - - tinfo->state = state; - tinfo->startup = &startup; + if (!sync_master_thread) + return -ESRCH; - repeat: - if ((pid = kernel_thread(fork_sync_thread, tinfo, 0)) < 0) { - IP_VS_ERR("could not create fork_sync_thread due to %d... " - "retrying.\n", pid); - msleep_interruptible(1000); - goto repeat; - } + IP_VS_INFO("stopping master sync thread %d ...\n", + task_pid_nr(sync_master_thread)); - wait_for_completion(&startup); - - return 0; -} - - -int stop_sync_thread(int state) -{ - DECLARE_WAITQUEUE(wait, current); + /* + * The lock synchronizes with sb_queue_tail(), so that we don't + * add sync buffers to the queue, when we are already in + * progress of stopping the master sync daemon. 
+ */ - if ((state == IP_VS_STATE_MASTER && !sync_master_pid) || - (state == IP_VS_STATE_BACKUP && !sync_backup_pid)) - return -ESRCH; + spin_lock(&ip_vs_sync_lock); + ip_vs_sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock(&ip_vs_sync_lock); + kthread_stop(sync_master_thread); + sync_master_thread = NULL; + } else if (state == IP_VS_STATE_BACKUP) { + if (!sync_backup_thread) + return -ESRCH; + + IP_VS_INFO("stopping backup sync thread %d ...\n", + task_pid_nr(sync_backup_thread)); + + ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; + kthread_stop(sync_backup_thread); + sync_backup_thread = NULL; + } else { + return -EINVAL; + } - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - IP_VS_INFO("stopping sync thread %d ...\n", - (state == IP_VS_STATE_MASTER) ? - sync_master_pid : sync_backup_pid); - - __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&stop_sync_wait, &wait); - set_stop_sync(state, 1); - ip_vs_sync_state -= state; - wake_up(&sync_wait); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&stop_sync_wait, &wait); - - /* Note: no need to reap the sync thread, because its parent - process is the init process */ - - if ((state == IP_VS_STATE_MASTER && stop_master_sync) || - (state == IP_VS_STATE_BACKUP && stop_backup_sync)) - IP_VS_BUG(); + /* decrease the module use count */ + ip_vs_use_count_dec(); return 0; } -- cgit v1.2.3 From ba6fd85021dec97d58373d9aea4bea8fc24258be Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 16 Jul 2008 11:13:56 +0000 Subject: ipvs: Put backup thread on mcast socket wait queue Instead of doing an endless loop with sleeping for one second, we now put the backup thread onto the mcast socket wait queue and it gets woken up as soon as we have data to process. Signed-off-by: Sven Wegener Acked-by: Simon Horman --- net/ipv4/ipvs/ip_vs_sync.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 550563a5660..cf5ed758ea1 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -772,6 +773,10 @@ static int sync_thread_backup(void *data) ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); while (!kthread_should_stop()) { + wait_event_interruptible(*tinfo->sock->sk->sk_sleep, + !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) + || kthread_should_stop()); + /* do we have data now? */ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { len = ip_vs_receive(tinfo->sock, tinfo->buf, @@ -787,8 +792,6 @@ static int sync_thread_backup(void *data) ip_vs_process_message(tinfo->buf, len); local_bh_enable(); } - - msleep_interruptible(1000); } /* release the sending multicast socket */ -- cgit v1.2.3 From 375c6bbabf210ded3151481ca1ac7f730903512b Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 16 Jul 2008 11:14:03 +0000 Subject: ipvs: Use schedule_timeout_interruptible() instead of msleep_interruptible() So that kthread_stop() can wake up the thread and we don't have to wait one second in the worst case for the daemon to actually stop. 
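For reference, a minimal sketch of the kthread pattern that the sync-daemon patches in this series converge on: kthread_run() to start the worker, kthread_should_stop() as the loop condition, an interruptible sleep that kthread_stop() can cut short, and kthread_stop() plus clearing the task pointer on shutdown. The demo_* names below are illustrative only and do not appear in the patches.

#include <linux/kthread.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/wait.h>

static struct task_struct *demo_sync_task;
static DECLARE_WAIT_QUEUE_HEAD(demo_wait);
static int demo_have_work;

/* Producer side: post work and wake the worker thread. */
static void demo_post_work(void)
{
        demo_have_work = 1;
        wake_up(&demo_wait);
}

/* Worker: sleep until there is work or kthread_stop() is called. */
static int demo_sync_thread(void *data)
{
        while (!kthread_should_stop()) {
                wait_event_interruptible(demo_wait,
                                         demo_have_work || kthread_should_stop());
                if (demo_have_work) {
                        /* drain the queue / send sync messages here */
                        demo_have_work = 0;
                }
        }
        return 0;                       /* value returned by kthread_stop() */
}

static int demo_start(void)
{
        struct task_struct *task;

        if (demo_sync_task)
                return -EEXIST;
        task = kthread_run(demo_sync_thread, NULL, "demo_sync");
        if (IS_ERR(task))
                return PTR_ERR(task);
        demo_sync_task = task;
        return 0;
}

static int demo_stop(void)
{
        if (!demo_sync_task)
                return -ESRCH;
        kthread_stop(demo_sync_task);   /* wakes the thread and waits for it to exit */
        demo_sync_task = NULL;
        return 0;
}

kthread_stop() sets the should-stop flag and wakes the target task, so a worker blocked in wait_event_interruptible() (or in schedule_timeout_interruptible()) reacts to the stop request immediately; per the commit messages above, that is why the follow-up patches drop msleep_interruptible(), which in the worst case would sleep a full second before the daemon could exit.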
Signed-off-by: Sven Wegener Acked-by: Simon Horman --- net/ipv4/ipvs/ip_vs_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index cf5ed758ea1..45e9bd96c28 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -742,7 +742,7 @@ static int sync_thread_master(void *data) ip_vs_sync_buff_release(sb); } - msleep_interruptible(1000); + schedule_timeout_interruptible(HZ); } /* clean up the sync_buff queue */ -- cgit v1.2.3 From 9d3a0de7dc3c1c4456db5ceb92c445bef0a47681 Mon Sep 17 00:00:00 2001 From: "Rumen G. Bogdanovski" Date: Wed, 16 Jul 2008 20:04:23 -0700 Subject: ipvs: More reliable synchronization on connection close This patch enhances the synchronization of the closing connections between the master and the backup director. It prevents the closed connections to expire with the 15 min timeout of the ESTABLISHED state on the backup and makes them expire as they would do on the master with much shorter timeouts. Signed-off-by: Rumen G. Bogdanovski Acked-by: Simon Horman Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index bcf6276ba4b..a7879eafc3b 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -991,7 +991,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, == sysctl_ip_vs_sync_threshold[0])) || ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE))))) + (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || + (cp->state == IP_VS_TCP_S_TIME_WAIT))))) ip_vs_sync_conn(cp); cp->old_state = cp->state; -- cgit v1.2.3 From 70efce27fc3d54271519244dc5e47da4ed711dd4 Mon Sep 17 00:00:00 2001 From: Will Newton Date: Wed, 16 Jul 2008 20:13:43 -0700 Subject: net/ipv4/tcp.c: Fix use of PULLHUP instead of POLLHUP in comments. Change PULLHUP to POLLHUP in tcp_poll comments and clean up another comment for grammar and coding style. Signed-off-by: Will Newton Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 56a133c6145..631133ebb20 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -344,8 +344,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events - by poll logic and correct handling of state changes - made by another threads is impossible in any case. + * by poll logic and correct handling of state changes + * made by other threads is impossible in any case. */ mask = 0; @@ -371,10 +371,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP * if and only if shutdown has been made in both directions. * Actually, it is interesting to look how Solaris and DUX - * solve this dilemma. I would prefer, if PULLHUP were maskable, + * solve this dilemma. I would prefer, if POLLHUP were maskable, * then we could set it on SND_SHUTDOWN. BTW examples given * in Stevens' books assume exactly this behaviour, it explains - * why PULLHUP is incompatible with POLLOUT. --ANK + * why POLLHUP is incompatible with POLLOUT. --ANK * * NOTE. Check for TCP_CLOSE is added. 
The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK -- cgit v1.2.3 From 84a3aa000eacbaf841d745b07ef3a3280899056b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:19:08 -0700 Subject: ipv4: prepare net initialization for IP accounting Some places, that deal with IP statistics already have where to get a struct net from, but use it directly, without declaring a separate variable on the stack. So, save this net on the stack for future IP_XXX_STATS macros. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 3 ++- net/ipv4/ip_fragment.c | 6 +++--- net/ipv4/udp.c | 4 +++- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5bbf0005151..8338e106665 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -338,9 +338,10 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, .uli_u = { .ports = { .sport = inet_sk(sk)->sport, .dport = ireq->rmt_port } } }; + struct net *net = sock_net(sk); security_req_classify_flow(req, &fl); - if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) { + if (ip_route_output_flow(net, &rt, &fl, sk, 0)) { IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fbd5804b5d8..e23be5b36e1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -187,8 +187,10 @@ static void ip_evictor(struct net *net) static void ip_expire(unsigned long arg) { struct ipq *qp; + struct net *net; qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + net = container_of(qp->q.net, struct net, ipv4.frags); spin_lock(&qp->q.lock); @@ -202,9 +204,7 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; - struct net *net; - net = container_of(qp->q.net, struct net, ipv4.frags); /* Send an ICMP "Fragment Reassembly Timeout" message. */ if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) { icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); @@ -570,9 +570,9 @@ int ip_defrag(struct sk_buff *skb, u32 user) struct ipq *qp; struct net *net; + net = skb->dev ? dev_net(skb->dev) : dev_net(skb->dst->dev); IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); - net = skb->dev ? dev_net(skb->dev) : dev_net(skb->dst->dev); /* Start by cleaning up the memory. */ if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) ip_evictor(net); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fdd4faa1f12..048ef57edc1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -656,8 +656,10 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, .uli_u = { .ports = { .sport = inet->sport, .dport = dport } } }; + struct net *net = sock_net(sk); + security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); + err = ip_route_output_flow(net, &rt, &fl, sk, 1); if (err) { if (err == -ENETUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); -- cgit v1.2.3 From 5e38e270444f2629de7a706b5a9ca1b333d14517 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:19:49 -0700 Subject: mib: add net to IP_INC_STATS All the callers already have either the net itself, or the place where to get it from. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 30 +++++++++++++++--------------- net/ipv4/raw.c | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index da14725916d..7f78a5a7e1e 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -88,7 +88,7 @@ int ip_forward(struct sk_buff *skb) if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(dst_mtu(&rt->u.dst))); goto drop; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f003186b93b..465544f6281 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -182,9 +182,9 @@ static inline int ip_finish_output2(struct sk_buff *skb) unsigned int hh_len = LL_RESERVED_SPACE(dev); if (rt->rt_type == RTN_MULTICAST) - IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTMCASTPKTS); else if (rt->rt_type == RTN_BROADCAST) - IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTBCASTPKTS); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -244,7 +244,7 @@ int ip_mc_output(struct sk_buff *skb) /* * If the indicated interface is up and running, send the packet. */ - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTREQUESTS); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -298,7 +298,7 @@ int ip_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTREQUESTS); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -389,7 +389,7 @@ packet_routed: return ip_local_out(skb); no_route: - IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -451,7 +451,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) iph = ip_hdr(skb); if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(ip_skb_dst_mtu(skb))); kfree_skb(skb); @@ -542,7 +542,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = output(skb); if (!err) - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -552,7 +552,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) } if (err == 0) { - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return 0; } @@ -561,7 +561,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) kfree_skb(frag); frag = skb; } - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -673,15 +673,15 @@ slow_path: if (err) goto fail; - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -1047,7 +1047,7 @@ 
alloc_new_skb: error: inet->cork.length -= length; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1189,7 +1189,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, error: inet->cork.length -= size; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1298,7 +1298,7 @@ out: return err; error: - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); goto out; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 7f39ea443ec..cd975743bcd 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -385,7 +385,7 @@ error_fault: err = -EFAULT; kfree_skb(skb); error: - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); return err; } -- cgit v1.2.3 From 7c73a6faffae0bfae70639113aecf06af666e714 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:20:11 -0700 Subject: mib: add net to IP_INC_STATS_BH Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/datagram.c | 2 +- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_forward.c | 4 ++-- net/ipv4/ip_fragment.c | 17 ++++++++++------- net/ipv4/ip_input.c | 30 ++++++++++++++++-------------- net/ipv4/ipmr.c | 4 ++-- net/ipv4/route.c | 3 ++- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 2 +- 9 files changed, 37 insertions(+), 31 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 0c0c73f368c..5e6c5a0f3fd 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -52,7 +52,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->sport, usin->sin_port, sk, 1); if (err) { if (err == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return err; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8338e106665..bb81c958b74 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -342,12 +342,12 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, security_req_classify_flow(req, &fl); if (ip_route_output_flow(net, &rt, &fl, sk, 0)) { - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { ip_rt_put(rt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } return &rt->u.dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 7f78a5a7e1e..450016b89a1 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -42,7 +42,7 @@ static int ip_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -123,7 +123,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... 
*/ - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e23be5b36e1..65364c06ada 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -199,8 +199,8 @@ static void ip_expire(unsigned long arg) ipq_kill(qp); - IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; @@ -261,7 +261,10 @@ static inline int ip_frag_too_far(struct ipq *qp) rc = qp->q.fragments && (end - start) > max; if (rc) { - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + struct net *net; + + net = container_of(qp->q.net, struct net, ipv4.frags); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); } return rc; @@ -545,7 +548,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); - IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; return 0; @@ -560,7 +563,7 @@ out_oversize: "Oversized IP packet from " NIPQUAD_FMT ".\n", NIPQUAD(qp->saddr)); out_fail: - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS); return err; } @@ -571,7 +574,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) struct net *net; net = skb->dev ? dev_net(skb->dev) : dev_net(skb->dst->dev); - IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); /* Start by cleaning up the memory. 
*/ if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) @@ -590,7 +593,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) return ret; } - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7c26428ea67..043f640df4b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -230,16 +230,16 @@ static int ip_local_deliver_finish(struct sk_buff *skb) protocol = -ret; goto resubmit; } - IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } } else - IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); kfree_skb(skb); } } @@ -281,7 +281,7 @@ static inline int ip_rcv_options(struct sk_buff *skb) --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } @@ -290,7 +290,7 @@ static inline int ip_rcv_options(struct sk_buff *skb) opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } @@ -334,9 +334,11 @@ static int ip_rcv_finish(struct sk_buff *skb) skb->dev); if (unlikely(err)) { if (err == -EHOSTUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INADDRERRORS); else if (err == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INNOROUTES); goto drop; } } @@ -357,9 +359,9 @@ static int ip_rcv_finish(struct sk_buff *skb) rt = skb->rtable; if (rt->rt_type == RTN_MULTICAST) - IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); + IP_INC_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCASTPKTS); else if (rt->rt_type == RTN_BROADCAST) - IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS); + IP_INC_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCASTPKTS); return dst_input(skb); @@ -382,10 +384,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (skb->pkt_type == PACKET_OTHERHOST) goto drop; - IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INRECEIVES); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto out; } @@ -418,7 +420,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; @@ -428,7 +430,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, * Note this now means skb->len holds ntohs(iph->tot_len). 
*/ if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } @@ -439,7 +441,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, ip_rcv_finish); inhdr_error: - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c9ab47b966b..033c712c3a5 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1178,7 +1178,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -1241,7 +1241,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) to blackhole. */ - IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 79c1e74263a..e4ab0ac94f9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1429,7 +1429,8 @@ static int ip_error(struct sk_buff *skb) break; case ENETUNREACH: code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); + IP_INC_STATS_BH(dev_net(rt->u.dst.dev), + IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0fefd440941..7797d528701 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -175,7 +175,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->sport, usin->sin_port, sk, 1); if (tmp < 0) { if (tmp == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return tmp; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 048ef57edc1..1560d11ba6e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -662,7 +662,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, err = ip_route_output_flow(net, &rt, &fl, sk, 1); if (err) { if (err == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); goto out; } -- cgit v1.2.3 From c5346fe396f5e22bbfb3ec037c43891c3c57d3e6 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:20:33 -0700 Subject: mib: add net to IP_ADD_STATS_BH Very simple - only ip_evictor (fragments) requires such. This patch ends up the IP_XXX_STATS patching. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 65364c06ada..38d38f05801 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -178,7 +178,7 @@ static void ip_evictor(struct net *net) evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); if (evicted) - IP_ADD_STATS_BH(IPSTATS_MIB_REASMFAILS, evicted); + IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); } /* -- cgit v1.2.3 From a86b1e3019455283a677c2485cfeda2dc36df3eb Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:20:58 -0700 Subject: inet: prepare struct net for TCP MIB accounting This is the same as the first patch in the set, but preparing the net for TCP_XXX_STATS - save the struct net on the stack where required and possible. 
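The shape of the change is the same at every call site in this series: look the namespace up once, keep it in a local struct net pointer, and hand it to each statistics macro. Below is a minimal sketch of the calling convention after the IP_*_STATS conversion; example_drop_no_route() and its include choices are illustrative, not code from the patches.

#include <linux/netdevice.h>    /* dev_net() */
#include <linux/skbuff.h>
#include <net/sock.h>           /* sock_net() */
#include <net/ip.h>             /* IP_INC_STATS_BH(), IPSTATS_MIB_* */

/* Illustrative only: charge a routing failure to the right namespace. */
static void example_drop_no_route(struct sock *sk, struct sk_buff *skb)
{
        /* Save the net on the stack once instead of recomputing it per use. */
        struct net *net = skb->dev ? dev_net(skb->dev) : sock_net(sk);

        IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
}

The later patches apply the same convention to the TCP_*_STATS and NET_*_STATS macros, so by the end of the series every MIB macro is told explicitly which namespace to charge.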
Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7797d528701..db3bf9be076 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -544,6 +544,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif + struct net *net; /* Never send a reset in response to a reset. */ if (th->rst) @@ -594,7 +595,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) sizeof(struct tcphdr), IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; - ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb, + net = dev_net(skb->dst->dev); + ip_send_reply(net->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); @@ -619,6 +621,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; + struct net *net = dev_net(skb->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -668,7 +671,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; - ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb, + ip_send_reply(net->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); @@ -1505,6 +1508,7 @@ int tcp_v4_rcv(struct sk_buff *skb) struct tcphdr *th; struct sock *sk; int ret; + struct net *net = dev_net(skb->dev); if (skb->pkt_type != PACKET_HOST) goto discard_it; @@ -1539,7 +1543,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, + sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); if (!sk) goto no_tcp_socket; -- cgit v1.2.3 From a9c19329eccdb145a08a4a2e969d7b40c54c9bcc Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:21:42 -0700 Subject: tcp: add net to tcp_mib_init This one sets TCP MIBs after zeroing them, and thus requires the net. The existing single caller can use init_net (temporarily). Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index dc411335c14..95a966dd191 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1363,7 +1363,7 @@ static int __init init_ipv4_mibs(void) sizeof(struct udp_mib)) < 0) goto err_udplite_mib; - tcp_mib_init(); + tcp_mib_init(&init_net); return 0; -- cgit v1.2.3 From 81cc8a75d944fa39fc333c2c329c8e8b3c62cada Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:22:04 -0700 Subject: mib: add net to TCP_INC_STATS Fortunately (almost) all the TCP code has a sock to get the net from :) Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_output.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 631133ebb20..aa79198fb02 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1668,12 +1668,12 @@ void tcp_set_state(struct sock *sk, int state) switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) - TCP_INC_STATS(TCP_MIB_CURRESTAB); + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) - TCP_INC_STATS(TCP_MIB_ESTABRESETS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash && diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index edef2afe905..c3b0da82698 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -618,7 +618,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_event_data_sent(tp, skb, sk); if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) - TCP_INC_STATS(TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (likely(err <= 0)) @@ -1910,7 +1910,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (err == 0) { /* Update global TCP statistics. */ - TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); tp->total_retrans++; @@ -2132,7 +2132,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); - TCP_INC_STATS(TCP_MIB_OUTRSTS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); } /* WARNING: This routine must only be called when we have already sent @@ -2258,7 +2258,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, ); th->doff = (tcp_header_size >> 2); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ @@ -2367,7 +2367,7 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, -- cgit v1.2.3 From 63231bddf6514778792d3784f63822473d250fc0 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:22:25 -0700 Subject: mib: add net to TCP_INC_STATS_BH Same as before - the sock is always there to get the net from, but there are also some places with the net already saved on the stack. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_ipv4.c | 14 +++++++------- net/ipv4/tcp_minisocks.c | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index aa79198fb02..f742f84026b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2663,7 +2663,7 @@ EXPORT_SYMBOL(__tcp_put_md5sig_pool); void tcp_done(struct sock *sk) { if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6ea970a151..01e8004911b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4795,7 +4795,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_data_snd_check(sk); return 0; } else { /* Header too small */ - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; } } else { @@ -4934,7 +4934,7 @@ slow_path: tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); tcp_reset(sk); return 1; @@ -4957,7 +4957,7 @@ step5: return 0; csum_error: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); discard: __kfree_skb(skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index db3bf9be076..e876312b950 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -599,8 +599,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) ip_send_reply(net->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); } /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states @@ -674,7 +674,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ip_send_reply(net->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -1494,7 +1494,7 @@ discard: return 0; csum_err: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -1514,7 +1514,7 @@ int tcp_v4_rcv(struct sk_buff *skb) goto discard_it; /* Count it even if it's bad */ - TCP_INC_STATS_BH(TCP_MIB_INSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; @@ -1590,7 +1590,7 @@ no_tcp_socket: if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { bad_packet: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); } else { tcp_v4_send_reset(NULL, skb); } @@ -1611,7 +1611,7 @@ do_time_wait: } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); inet_twsk_put(inet_twsk(sk)); goto discard_it; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ea68a478fad..8b02b103996 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -480,7 +480,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, 
req); - TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; } @@ -630,7 +630,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } -- cgit v1.2.3 From 74688e487a407a33d42879957b478601aca616b8 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:22:46 -0700 Subject: mib: add net to TCP_DEC_STATS Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f742f84026b..525dcf53415 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1682,7 +1682,7 @@ void tcp_set_state(struct sock *sk, int state) /* fall through */ default: if (oldstate==TCP_ESTABLISHED) - TCP_DEC_STATS(TCP_MIB_CURRESTAB); + TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed -- cgit v1.2.3 From 5c52ba170f8167511bdb65b981f4582100c40675 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:28:10 -0700 Subject: sock: add net to prot->enter_memory_pressure callback The tcp_enter_memory_pressure calls NET_INC_STATS, but doesn't have where to get the net from. I decided to add a sk argument, not the net itself, only to factor all the required sock_net(sk) calls inside the enter_memory_pressure callback itself. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 525dcf53415..bc8559a6f7e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -316,7 +316,7 @@ int tcp_memory_pressure __read_mostly; EXPORT_SYMBOL(tcp_memory_pressure); -void tcp_enter_memory_pressure(void) +void tcp_enter_memory_pressure(struct sock *sk) { if (!tcp_memory_pressure) { NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES); @@ -649,7 +649,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) } __kfree_skb(skb); } else { - sk->sk_prot->enter_memory_pressure(); + sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; -- cgit v1.2.3 From ca12a1a443a51298afcca627ad0bcbd8ad1dcddc Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:28:42 -0700 Subject: inet: prepare net on the stack for NET accounting macros Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 29df75a6bcc..aab98b8a994 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -421,8 +421,9 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) struct rtable *rt; int flag = 0; /*unsigned long now; */ + struct net *net = dev_net(dev); - if (ip_route_output_key(dev_net(dev), &rt, &fl) < 0) + if (ip_route_output_key(net, &rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); -- cgit v1.2.3 From 1ed834655a0d42ecd80ff051e681e2ea44747b6c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:29:51 -0700 Subject: tcp: replace tcp_sock argument with sock in some places These places have a tcp_sock, but we'd prefer the sock itself to get net from it. 
Fortunately, tcp_sk macro is just a type cast, so this replace is really cheap. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 01e8004911b..f50d8433f04 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1169,10 +1169,11 @@ static void tcp_mark_lost_retrans(struct sock *sk) tp->lost_retrans_low = new_low_seq; } -static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, +static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) { + struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); int dup_sack = 0; @@ -1434,7 +1435,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, tcp_highest_sack_reset(sk); } - found_dup_sack = tcp_check_dsack(tp, ack_skb, sp_wire, + found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) flag |= FLAG_DSACKING_ACK; @@ -3711,8 +3712,10 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, return 0; } -static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { int mib_idx; @@ -3731,10 +3734,12 @@ static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) } } -static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) { + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->rx_opt.dsack) - tcp_dsack_set(tp, seq, end_seq); + tcp_dsack_set(sk, seq, end_seq); else tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } @@ -3753,7 +3758,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) end_seq = tp->rcv_nxt; - tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq); + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); } } @@ -3906,7 +3911,7 @@ static void tcp_ofo_queue(struct sock *sk) __u32 dsack = dsack_high; if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) dsack_high = TCP_SKB_CB(skb)->end_seq; - tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack); + tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { @@ -4035,7 +4040,7 @@ queue_and_out: if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an immediate ack. */ NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); - tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: tcp_enter_quickack_mode(sk); @@ -4057,7 +4062,7 @@ drop: tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. @@ -4122,12 +4127,12 @@ drop: if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. 
*/ __kfree_skb(skb); - tcp_dsack_set(tp, seq, end_seq); + tcp_dsack_set(sk, seq, end_seq); goto add_sack; } if (after(seq, TCP_SKB_CB(skb1)->seq)) { /* Partial overlap. */ - tcp_dsack_set(tp, seq, + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); } else { skb1 = skb1->prev; @@ -4140,12 +4145,12 @@ drop: (struct sk_buff *)&tp->out_of_order_queue && after(end_seq, TCP_SKB_CB(skb1)->seq)) { if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, end_seq); break; } __skb_unlink(skb1, &tp->out_of_order_queue); - tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); __kfree_skb(skb1); } -- cgit v1.2.3 From 4e6734447dbc7a0a85e09616821c0782d9fb1141 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:30:14 -0700 Subject: mib: add net to NET_INC_STATS Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_output.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc8559a6f7e..85f08291e92 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -319,7 +319,7 @@ EXPORT_SYMBOL(tcp_memory_pressure); void tcp_enter_memory_pressure(struct sock *sk) { if (!tcp_memory_pressure) { - NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); tcp_memory_pressure = 1; } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c3b0da82698..176f0702b8a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2119,7 +2119,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { - NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); return; } @@ -2130,7 +2130,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) - NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); } -- cgit v1.2.3 From de0744af1fe2d0a3d428f6af0f2fe1f6179b1a9c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:31:16 -0700 Subject: mib: add net to NET_INC_STATS_BH Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/arp.c | 2 +- net/ipv4/inet_hashtables.c | 4 +-- net/ipv4/syncookies.c | 6 ++--- net/ipv4/tcp.c | 6 +++-- net/ipv4/tcp_input.c | 65 +++++++++++++++++++++++----------------------- net/ipv4/tcp_ipv4.c | 12 ++++----- net/ipv4/tcp_minisocks.c | 6 ++--- net/ipv4/tcp_output.c | 4 +-- net/ipv4/tcp_timer.c | 12 ++++----- 9 files changed, 60 insertions(+), 57 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index aab98b8a994..b043eda60b0 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -426,7 +426,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) if (ip_route_output_key(net, &rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { - NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); + NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index eca5899729e..115f53722d2 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -312,11 +312,11 @@ unique: if (twp) { *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule(tw, death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index fdde2ae07e2..51bc24d3b8a 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -173,7 +173,7 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) ; *mssp = msstab[mssind] + 1; - NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), @@ -269,11 +269,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) || (mss = cookie_check(skb, cookie)) == 0) { - NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } - NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 85f08291e92..9e0e45c3780 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1871,7 +1871,8 @@ adjudge_to_death: if (tp->linger2 < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); @@ -1893,7 +1894,8 @@ adjudge_to_death: "sockets\n"); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPABORTONMEMORY); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f50d8433f04..fac49a6e161 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -961,7 +961,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, else mib_idx = LINUX_MIB_TCPSACKREORDER; - NET_INC_STATS_BH(mib_idx); + NET_INC_STATS_BH(sock_net(sk), mib_idx); #if FASTRETRANS_DEBUG > 1 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, @@ -1157,7 +1157,7 @@ static void tcp_mark_lost_retrans(struct sock 
*sk) tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; } - NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); } else { if (before(ack_seq, new_low_seq)) new_low_seq = ack_seq; @@ -1181,7 +1181,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { dup_sack = 1; tcp_dsack_seen(tp); - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); @@ -1190,7 +1190,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, !before(start_seq_0, start_seq_1)) { dup_sack = 1; tcp_dsack_seen(tp); - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPDSACKOFORECV); } } @@ -1476,7 +1477,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, mib_idx = LINUX_MIB_TCPSACKDISCARD; } - NET_INC_STATS_BH(mib_idx); + NET_INC_STATS_BH(sock_net(sk), mib_idx); if (i == 0) first_sack_index = -1; continue; @@ -1969,7 +1970,7 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag) { if (flag & FLAG_SACK_RENEGING) { struct inet_connection_sock *icsk = inet_csk(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); icsk->icsk_retransmits++; @@ -2401,7 +2402,7 @@ static int tcp_try_undo_recovery(struct sock *sk) else mib_idx = LINUX_MIB_TCPFULLUNDO; - NET_INC_STATS_BH(mib_idx); + NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->undo_marker = 0; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { @@ -2424,7 +2425,7 @@ static void tcp_try_undo_dsack(struct sock *sk) DBGUNDO(sk, "D-SACK"); tcp_undo_cwr(sk, 1); tp->undo_marker = 0; - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); } } @@ -2447,7 +2448,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) DBGUNDO(sk, "Hoe"); tcp_undo_cwr(sk, 0); - NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. * If the first packet was delayed, the rest @@ -2476,7 +2477,7 @@ static int tcp_try_undo_loss(struct sock *sk) DBGUNDO(sk, "partial loss"); tp->lost_out = 0; tcp_undo_cwr(sk, 1); - NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; if (tcp_is_sack(tp)) @@ -2595,7 +2596,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); - NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); } /* D. Check consistency of the current state. 
*/ @@ -2700,7 +2701,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) else mib_idx = LINUX_MIB_TCPSACKRECOVERY; - NET_INC_STATS_BH(mib_idx); + NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->high_seq = tp->snd_nxt; tp->prior_ssthresh = 0; @@ -3211,7 +3212,7 @@ static int tcp_process_frto(struct sock *sk, int flag) } tp->frto_counter = 0; tp->undo_marker = 0; - NET_INC_STATS_BH(LINUX_MIB_TCPSPURIOUSRTOS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); } return 0; } @@ -3264,12 +3265,12 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tcp_ca_event(sk, CA_EVENT_FAST_ACK); - NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else - NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS); flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); @@ -3724,7 +3725,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) else mib_idx = LINUX_MIB_TCPDSACKOFOSENT; - NET_INC_STATS_BH(mib_idx); + NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; @@ -3750,7 +3751,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); if (tcp_is_sack(tp) && sysctl_tcp_dsack) { @@ -4039,7 +4040,7 @@ queue_and_out: if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an immediate ack. */ - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: @@ -4181,7 +4182,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct sk_buff *next = skb->next; __skb_unlink(skb, list); __kfree_skb(skb); - NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); skb = next; continue; } @@ -4249,7 +4250,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct sk_buff *next = skb->next; __skb_unlink(skb, list); __kfree_skb(skb); - NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); skb = next; if (skb == tail || tcp_hdr(skb)->syn || @@ -4312,7 +4313,7 @@ static int tcp_prune_ofo_queue(struct sock *sk) int res = 0; if (!skb_queue_empty(&tp->out_of_order_queue)) { - NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); __skb_queue_purge(&tp->out_of_order_queue); /* Reset SACK state. A conforming SACK implementation will @@ -4341,7 +4342,7 @@ static int tcp_prune_queue(struct sock *sk) SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); - NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); @@ -4370,7 +4371,7 @@ static int tcp_prune_queue(struct sock *sk) * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. */ - NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. 
*/ tp->pred_flags = 0; @@ -4837,7 +4838,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); } if (copied_early) tcp_cleanup_rbuf(sk, skb->len); @@ -4860,7 +4861,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; - NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ __skb_pull(skb, tcp_header_len); @@ -4904,7 +4905,7 @@ slow_path: if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); goto discard; } @@ -4940,7 +4941,7 @@ slow_path: if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); tcp_reset(sk); return 1; } @@ -4996,7 +4997,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, tcp_time_stamp)) { - NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; } @@ -5280,7 +5281,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); goto discard; } @@ -5309,7 +5310,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Check for a SYN in window. */ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); tcp_reset(sk); return 1; } @@ -5391,7 +5392,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { tcp_done(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; } @@ -5451,7 +5452,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk); return 1; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e876312b950..29adc668ad5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -366,7 +366,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) * servers this needs to be solved differently. 
*/ if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; @@ -375,7 +375,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -422,7 +422,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) BUG_TRAP(!req->sk); if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -1251,7 +1251,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { - NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } } @@ -1365,9 +1365,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return newsk; exit_overflow: - NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit: - NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); dst_release(dst); return NULL; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8b02b103996..204c4216266 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -244,7 +244,7 @@ kill: } if (paws_reject) - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. @@ -611,7 +611,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(skb, req); if (paws_reject) - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); return NULL; } @@ -695,7 +695,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, } embryonic_reset: - NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_reset(sk, skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 176f0702b8a..36a19707f67 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1995,7 +1995,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) mib_idx = LINUX_MIB_TCPFASTRETRANS; else mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; - NET_INC_STATS_BH(mib_idx); + NET_INC_STATS_BH(sock_net(sk), mib_idx); if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, @@ -2065,7 +2065,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) inet_csk(sk)->icsk_rto, TCP_RTO_MAX); - NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFORWARDRETRANS); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6a480d1fd8f..328e0cf42b3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -48,7 +48,7 @@ static void tcp_write_err(struct sock *sk) sk->sk_error_report(sk); tcp_done(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } /* Do not allow orphaned sockets to eat all our resources. 
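[Editor's note, not part of the patch: a minimal sketch of the calling convention the hunks above establish. Every NET_INC_STATS_BH caller now names the struct net that owns the counter, resolved from whatever object the code path has in hand. The macro internals are not shown in this series; the definition sketched below is an assumption about its eventual shape, not text from these patches.]

	/* illustrative kernel-context sketch only */
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);	/* from a full socket */
	NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);	/* from a timewait bucket */
	NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);		/* when a struct net is already in scope */

	/* assumed shape of the macro once the whole series lands (hypothetical here):
	 *	#define NET_INC_STATS_BH(net, field) \
	 *		SNMP_INC_STATS_BH((net)->mib.net_statistics, field)
	 */

[End of editor's note; the tcp_timer.c diff continues below.]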
@@ -89,7 +89,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } return 0; @@ -179,7 +179,7 @@ static void tcp_delack_timer(unsigned long data) if (sock_owned_by_user(sk)) { /* Try again later. */ icsk->icsk_ack.blocked = 1; - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out_unlock; } @@ -198,7 +198,7 @@ static void tcp_delack_timer(unsigned long data) if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; - NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk->sk_backlog_rcv(sk, skb); @@ -218,7 +218,7 @@ static void tcp_delack_timer(unsigned long data) icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_send_ack(sk); - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); } TCP_CHECK_TIMER(sk); @@ -346,7 +346,7 @@ static void tcp_retransmit_timer(struct sock *sk) } else { mib_idx = LINUX_MIB_TCPTIMEOUTS; } - NET_INC_STATS_BH(mib_idx); + NET_INC_STATS_BH(sock_net(sk), mib_idx); } if (tcp_use_frto(sk)) { -- cgit v1.2.3 From 6f67c817fcfd94f5ca0f14b114b7fa25c0210c8b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:31:39 -0700 Subject: mib: add net to NET_INC_STATS_USER Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9e0e45c3780..dec0b868111 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1153,7 +1153,7 @@ static void tcp_prequeue_process(struct sock *sk) struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); /* RX process wants to run with disabled BHs, though it is not * necessary */ @@ -1793,13 +1793,13 @@ void tcp_close(struct sock *sk, long timeout) */ if (data_was_unread) { /* Unread data was tossed, zap the connection. */ - NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_KERNEL); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); - NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. -- cgit v1.2.3 From f2bf415cfed703de5ba94d25cdb160920c01fb00 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:32:25 -0700 Subject: mib: add net to NET_ADD_STATS_BH This one is tricky. The thing is that this macro is only used when killing tw buckets, but since this killer is promiscuous wrt to which net each particular tw belongs to, I have to use it only when NET_NS is off. When the net namespaces are on, I use the INET_INC_STATS_BH for each bucket. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/inet_timewait_sock.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 06006a5ac8b..75c2def8f9a 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -160,6 +160,9 @@ rescan: __inet_twsk_del_dead_node(tw); spin_unlock(&twdr->death_lock); __inet_twsk_kill(tw, twdr->hashinfo); +#ifdef CONFIG_NET_NS + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); +#endif inet_twsk_put(tw); killed++; spin_lock(&twdr->death_lock); @@ -178,8 +181,9 @@ rescan: } twdr->tw_count -= killed; - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); - +#ifndef CONFIG_NET_NS + NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed); +#endif return ret; } @@ -372,6 +376,9 @@ void inet_twdr_twcal_tick(unsigned long data) &twdr->twcal_row[slot]) { __inet_twsk_del_dead_node(tw); __inet_twsk_kill(tw, twdr->hashinfo); +#ifdef CONFIG_NET_NS + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); +#endif inet_twsk_put(tw); killed++; } @@ -395,7 +402,9 @@ void inet_twdr_twcal_tick(unsigned long data) out: if ((twdr->tw_count -= killed) == 0) del_timer(&twdr->tw_timer); - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); +#ifndef CONFIG_NET_NS + NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed); +#endif spin_unlock(&twdr->death_lock); } -- cgit v1.2.3 From ed88098e25d77bef3b2ad8c9d8e2ebf454d9ccbf Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:32:45 -0700 Subject: mib: add net to NET_ADD_STATS_USER Done with NET_XXX_STATS macros :) To be continued... Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dec0b868111..eec8cf7c024 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1475,7 +1475,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Restore normal policy in scheduler __ */ if ((chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; } @@ -1486,7 +1486,7 @@ do_prequeue: tcp_prequeue_process(sk); if ((chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } @@ -1601,7 +1601,7 @@ skip_copy: tcp_prequeue_process(sk); if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } -- cgit v1.2.3 From 9b4661bd6e5437508e0920608f3213c23212cd1b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:01:44 -0700 Subject: ipv4: add pernet mib operations These ones are currently empty, but stuff from init_ipv4_mibs will sequentially migrate there. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 95a966dd191..b4b77aa0795 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -110,6 +110,7 @@ #include #include #include +#include #ifdef CONFIG_IP_MROUTE #include #endif @@ -1339,6 +1340,20 @@ static struct net_protocol icmp_protocol = { .netns_ok = 1, }; +static __net_init int ipv4_mib_init_net(struct net *net) +{ + return 0; +} + +static __net_exit void ipv4_mib_exit_net(struct net *net) +{ +} + +static __net_initdata struct pernet_operations ipv4_mib_ops = { + .init = ipv4_mib_init_net, + .exit = ipv4_mib_exit_net, +}; + static int __init init_ipv4_mibs(void) { if (snmp_mib_init((void **)net_statistics, @@ -1365,8 +1380,13 @@ static int __init init_ipv4_mibs(void) tcp_mib_init(&init_net); + if (register_pernet_subsys(&ipv4_mib_ops)) + goto err_net; + return 0; +err_net: + snmp_mib_free((void **)udplite_statistics); err_udplite_mib: snmp_mib_free((void **)udp_statistics); err_udp_mib: -- cgit v1.2.3 From 57ef42d59d1c1d79be59fc3c6380ae14234e38c3 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:02:08 -0700 Subject: mib: put tcp statistics on struct net Proc temporary uses stats from init_net. BTW, TCP_XXX_STATS are beautiful (w/o do { } while (0) facing) again :) Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 16 +++++++++------- net/ipv4/proc.c | 4 ++-- net/ipv4/tcp.c | 3 --- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b4b77aa0795..c1a3e986f8b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1342,11 +1342,20 @@ static struct net_protocol icmp_protocol = { static __net_init int ipv4_mib_init_net(struct net *net) { + if (snmp_mib_init((void **)net->mib.tcp_statistics, + sizeof(struct tcp_mib)) < 0) + goto err_tcp_mib; + + tcp_mib_init(net); return 0; + +err_tcp_mib: + return -ENOMEM; } static __net_exit void ipv4_mib_exit_net(struct net *net) { + snmp_mib_free((void **)net->mib.tcp_statistics); } static __net_initdata struct pernet_operations ipv4_mib_ops = { @@ -1368,9 +1377,6 @@ static int __init init_ipv4_mibs(void) if (snmp_mib_init((void **)icmpmsg_statistics, sizeof(struct icmpmsg_mib)) < 0) goto err_icmpmsg_mib; - if (snmp_mib_init((void **)tcp_statistics, - sizeof(struct tcp_mib)) < 0) - goto err_tcp_mib; if (snmp_mib_init((void **)udp_statistics, sizeof(struct udp_mib)) < 0) goto err_udp_mib; @@ -1378,8 +1384,6 @@ static int __init init_ipv4_mibs(void) sizeof(struct udp_mib)) < 0) goto err_udplite_mib; - tcp_mib_init(&init_net); - if (register_pernet_subsys(&ipv4_mib_ops)) goto err_net; @@ -1390,8 +1394,6 @@ err_net: err_udplite_mib: snmp_mib_free((void **)udp_statistics); err_udp_mib: - snmp_mib_free((void **)tcp_statistics); -err_tcp_mib: snmp_mib_free((void **)icmpmsg_statistics); err_icmpmsg_mib: snmp_mib_free((void **)icmp_statistics); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index eb5cee279c5..19e1d8e257d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -359,11 +359,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - snmp_fold_field((void **)tcp_statistics, + snmp_fold_field((void **)init_net.mib.tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - snmp_fold_field((void 
**)tcp_statistics, + snmp_fold_field((void **)init_net.mib.tcp_statistics, snmp4_tcp_list[i].entry)); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eec8cf7c024..827e6132af5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -277,8 +277,6 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; -DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly; - atomic_t tcp_orphan_count = ATOMIC_INIT(0); EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -2802,4 +2800,3 @@ EXPORT_SYMBOL(tcp_splice_read); EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); -EXPORT_SYMBOL(tcp_statistics); -- cgit v1.2.3 From a20f5799ca7ceb24d63c74b6fdad4b0c0ee91f4f Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:02:42 -0700 Subject: mib: put ip statistics on struct net Similar to tcp one. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 11 ++++++----- net/ipv4/ip_input.c | 8 -------- net/ipv4/proc.c | 4 ++-- 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c1a3e986f8b..3090a9307c4 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1345,16 +1345,22 @@ static __net_init int ipv4_mib_init_net(struct net *net) if (snmp_mib_init((void **)net->mib.tcp_statistics, sizeof(struct tcp_mib)) < 0) goto err_tcp_mib; + if (snmp_mib_init((void **)net->mib.ip_statistics, + sizeof(struct ipstats_mib)) < 0) + goto err_ip_mib; tcp_mib_init(net); return 0; +err_ip_mib: + snmp_mib_free((void **)net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } static __net_exit void ipv4_mib_exit_net(struct net *net) { + snmp_mib_free((void **)net->mib.ip_statistics); snmp_mib_free((void **)net->mib.tcp_statistics); } @@ -1368,9 +1374,6 @@ static int __init init_ipv4_mibs(void) if (snmp_mib_init((void **)net_statistics, sizeof(struct linux_mib)) < 0) goto err_net_mib; - if (snmp_mib_init((void **)ip_statistics, - sizeof(struct ipstats_mib)) < 0) - goto err_ip_mib; if (snmp_mib_init((void **)icmp_statistics, sizeof(struct icmp_mib)) < 0) goto err_icmp_mib; @@ -1398,8 +1401,6 @@ err_udp_mib: err_icmpmsg_mib: snmp_mib_free((void **)icmp_statistics); err_icmp_mib: - snmp_mib_free((void **)ip_statistics); -err_ip_mib: snmp_mib_free((void **)net_statistics); err_net_mib: return -ENOMEM; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 043f640df4b..e0bed56c51f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -144,12 +144,6 @@ #include #include -/* - * SNMP management statistics - */ - -DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly; - /* * Process Router Attention IP option */ @@ -447,5 +441,3 @@ drop: out: return NET_RX_DROP; } - -EXPORT_SYMBOL(ip_statistics); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 19e1d8e257d..2698bb2ce98 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -344,7 +344,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)ip_statistics, + snmp_fold_field((void **)init_net.mib.ip_statistics, snmp4_ipstats_list[i].entry)); icmp_put(seq); /* RFC 2011 compatibility */ @@ -431,7 +431,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)ip_statistics, + snmp_fold_field((void **)init_net.mib.ip_statistics, 
snmp4_ipextstats_list[i].entry)); seq_putc(seq, '\n'); -- cgit v1.2.3 From 61a7e26028b94805fd686a6dc9dbd9941f8f19b0 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:03:08 -0700 Subject: mib: put net statistics on struct net Similar to ip and tcp ones :) Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 14 ++++++-------- net/ipv4/proc.c | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 3090a9307c4..776ed3199b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -115,8 +115,6 @@ #include #endif -DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; - extern void ip_mc_drop_socket(struct sock *sk); /* The inetsw table contains everything that inet_create needs to @@ -1348,10 +1346,15 @@ static __net_init int ipv4_mib_init_net(struct net *net) if (snmp_mib_init((void **)net->mib.ip_statistics, sizeof(struct ipstats_mib)) < 0) goto err_ip_mib; + if (snmp_mib_init((void **)net->mib.net_statistics, + sizeof(struct linux_mib)) < 0) + goto err_net_mib; tcp_mib_init(net); return 0; +err_net_mib: + snmp_mib_free((void **)net->mib.ip_statistics); err_ip_mib: snmp_mib_free((void **)net->mib.tcp_statistics); err_tcp_mib: @@ -1360,6 +1363,7 @@ err_tcp_mib: static __net_exit void ipv4_mib_exit_net(struct net *net) { + snmp_mib_free((void **)net->mib.net_statistics); snmp_mib_free((void **)net->mib.ip_statistics); snmp_mib_free((void **)net->mib.tcp_statistics); } @@ -1371,9 +1375,6 @@ static __net_initdata struct pernet_operations ipv4_mib_ops = { static int __init init_ipv4_mibs(void) { - if (snmp_mib_init((void **)net_statistics, - sizeof(struct linux_mib)) < 0) - goto err_net_mib; if (snmp_mib_init((void **)icmp_statistics, sizeof(struct icmp_mib)) < 0) goto err_icmp_mib; @@ -1401,8 +1402,6 @@ err_udp_mib: err_icmpmsg_mib: snmp_mib_free((void **)icmp_statistics); err_icmp_mib: - snmp_mib_free((void **)net_statistics); -err_net_mib: return -ENOMEM; } @@ -1582,5 +1581,4 @@ EXPORT_SYMBOL(inet_sock_destruct); EXPORT_SYMBOL(inet_stream_connect); EXPORT_SYMBOL(inet_stream_ops); EXPORT_SYMBOL(inet_unregister_protosw); -EXPORT_SYMBOL(net_statistics); EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2698bb2ce98..ef38b1dccfd 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -421,7 +421,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net_statistics, + snmp_fold_field((void **)init_net.mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); -- cgit v1.2.3 From 2f275f91a438abd8eec5321798d66a4ffe6869fa Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:03:27 -0700 Subject: mib: put udp statistics on struct net Similar to... ouch, I repeat myself. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 11 ++++++----- net/ipv4/proc.c | 2 +- net/ipv4/udp.c | 3 --- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 776ed3199b5..1f418164ebf 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1349,10 +1349,15 @@ static __net_init int ipv4_mib_init_net(struct net *net) if (snmp_mib_init((void **)net->mib.net_statistics, sizeof(struct linux_mib)) < 0) goto err_net_mib; + if (snmp_mib_init((void **)net->mib.udp_statistics, + sizeof(struct udp_mib)) < 0) + goto err_udp_mib; tcp_mib_init(net); return 0; +err_udp_mib: + snmp_mib_free((void **)net->mib.net_statistics); err_net_mib: snmp_mib_free((void **)net->mib.ip_statistics); err_ip_mib: @@ -1363,6 +1368,7 @@ err_tcp_mib: static __net_exit void ipv4_mib_exit_net(struct net *net) { + snmp_mib_free((void **)net->mib.udp_statistics); snmp_mib_free((void **)net->mib.net_statistics); snmp_mib_free((void **)net->mib.ip_statistics); snmp_mib_free((void **)net->mib.tcp_statistics); @@ -1381,9 +1387,6 @@ static int __init init_ipv4_mibs(void) if (snmp_mib_init((void **)icmpmsg_statistics, sizeof(struct icmpmsg_mib)) < 0) goto err_icmpmsg_mib; - if (snmp_mib_init((void **)udp_statistics, - sizeof(struct udp_mib)) < 0) - goto err_udp_mib; if (snmp_mib_init((void **)udplite_statistics, sizeof(struct udp_mib)) < 0) goto err_udplite_mib; @@ -1396,8 +1399,6 @@ static int __init init_ipv4_mibs(void) err_net: snmp_mib_free((void **)udplite_statistics); err_udplite_mib: - snmp_mib_free((void **)udp_statistics); -err_udp_mib: snmp_mib_free((void **)icmpmsg_statistics); err_icmpmsg_mib: snmp_mib_free((void **)icmp_statistics); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ef38b1dccfd..869085c8e43 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -374,7 +374,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)udp_statistics, + snmp_fold_field((void **)init_net.mib.udp_statistics, snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1560d11ba6e..a751770947a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -108,9 +108,6 @@ * Snmp MIB for the UDP layer */ -DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; -EXPORT_SYMBOL(udp_statistics); - DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; EXPORT_SYMBOL(udp_stats_in6); -- cgit v1.2.3 From 386019d3514b3ed9de8d0b05b67e638a7048375b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:03:45 -0700 Subject: mib: put udplite statistics on struct net Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 11 ++++++----- net/ipv4/proc.c | 2 +- net/ipv4/udplite.c | 1 - 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1f418164ebf..bf1f200c657 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1352,10 +1352,15 @@ static __net_init int ipv4_mib_init_net(struct net *net) if (snmp_mib_init((void **)net->mib.udp_statistics, sizeof(struct udp_mib)) < 0) goto err_udp_mib; + if (snmp_mib_init((void **)net->mib.udplite_statistics, + sizeof(struct udp_mib)) < 0) + goto err_udplite_mib; tcp_mib_init(net); return 0; +err_udplite_mib: + snmp_mib_free((void **)net->mib.udp_statistics); err_udp_mib: snmp_mib_free((void **)net->mib.net_statistics); err_net_mib: @@ -1368,6 +1373,7 @@ err_tcp_mib: static __net_exit void ipv4_mib_exit_net(struct net *net) { + snmp_mib_free((void **)net->mib.udplite_statistics); snmp_mib_free((void **)net->mib.udp_statistics); snmp_mib_free((void **)net->mib.net_statistics); snmp_mib_free((void **)net->mib.ip_statistics); @@ -1387,9 +1393,6 @@ static int __init init_ipv4_mibs(void) if (snmp_mib_init((void **)icmpmsg_statistics, sizeof(struct icmpmsg_mib)) < 0) goto err_icmpmsg_mib; - if (snmp_mib_init((void **)udplite_statistics, - sizeof(struct udp_mib)) < 0) - goto err_udplite_mib; if (register_pernet_subsys(&ipv4_mib_ops)) goto err_net; @@ -1397,8 +1400,6 @@ static int __init init_ipv4_mibs(void) return 0; err_net: - snmp_mib_free((void **)udplite_statistics); -err_udplite_mib: snmp_mib_free((void **)icmpmsg_statistics); err_icmpmsg_mib: snmp_mib_free((void **)icmp_statistics); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 869085c8e43..765418334b3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -385,7 +385,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)udplite_statistics, + snmp_fold_field((void **)init_net.mib.udplite_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 4ad16b6d513..3c807964da9 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -11,7 +11,6 @@ * 2 of the License, or (at your option) any later version. */ #include "udp_impl.h" -DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics) __read_mostly; struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; -- cgit v1.2.3 From b60538a0d737609213e4b758881913498d3ff0b4 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:04:02 -0700 Subject: mib: put icmp statistics on struct net Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 11 ++++++----- net/ipv4/icmp.c | 2 -- net/ipv4/proc.c | 8 ++++---- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index bf1f200c657..5c72bb3adad 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1355,10 +1355,15 @@ static __net_init int ipv4_mib_init_net(struct net *net) if (snmp_mib_init((void **)net->mib.udplite_statistics, sizeof(struct udp_mib)) < 0) goto err_udplite_mib; + if (snmp_mib_init((void **)net->mib.icmp_statistics, + sizeof(struct icmp_mib)) < 0) + goto err_icmp_mib; tcp_mib_init(net); return 0; +err_icmp_mib: + snmp_mib_free((void **)net->mib.udplite_statistics); err_udplite_mib: snmp_mib_free((void **)net->mib.udp_statistics); err_udp_mib: @@ -1373,6 +1378,7 @@ err_tcp_mib: static __net_exit void ipv4_mib_exit_net(struct net *net) { + snmp_mib_free((void **)net->mib.icmp_statistics); snmp_mib_free((void **)net->mib.udplite_statistics); snmp_mib_free((void **)net->mib.udp_statistics); snmp_mib_free((void **)net->mib.net_statistics); @@ -1387,9 +1393,6 @@ static __net_initdata struct pernet_operations ipv4_mib_ops = { static int __init init_ipv4_mibs(void) { - if (snmp_mib_init((void **)icmp_statistics, - sizeof(struct icmp_mib)) < 0) - goto err_icmp_mib; if (snmp_mib_init((void **)icmpmsg_statistics, sizeof(struct icmpmsg_mib)) < 0) goto err_icmpmsg_mib; @@ -1402,8 +1405,6 @@ static int __init init_ipv4_mibs(void) err_net: snmp_mib_free((void **)icmpmsg_statistics); err_icmpmsg_mib: - snmp_mib_free((void **)icmp_statistics); -err_icmp_mib: return -ENOMEM; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index ea60ad41008..33f958902d9 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -114,7 +114,6 @@ struct icmp_bxm { /* * Statistics */ -DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly; DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics) __read_mostly; /* An array of errno for error messages from dest unreach. 
*/ @@ -1213,5 +1212,4 @@ int __init icmp_init(void) EXPORT_SYMBOL(icmp_err_convert); EXPORT_SYMBOL(icmp_send); -EXPORT_SYMBOL(icmp_statistics); EXPORT_SYMBOL(xrlim_allow); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 765418334b3..6b43cce0558 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -311,15 +311,15 @@ static void icmp_put(struct seq_file *seq) for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu", - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INERRORS)); + snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_INERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", snmp_fold_field((void **) icmpmsg_statistics, icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTERRORS)); + snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_OUTERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", snmp_fold_field((void **) icmpmsg_statistics, -- cgit v1.2.3 From 923c6586b0dc0a00df07a1608185437145a0c68b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:04:22 -0700 Subject: mib: put icmpmsg statistics on struct net Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 12 ++++++------ net/ipv4/icmp.c | 5 ----- net/ipv4/proc.c | 10 +++++----- 3 files changed, 11 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5c72bb3adad..36ff6dc769c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1358,10 +1358,15 @@ static __net_init int ipv4_mib_init_net(struct net *net) if (snmp_mib_init((void **)net->mib.icmp_statistics, sizeof(struct icmp_mib)) < 0) goto err_icmp_mib; + if (snmp_mib_init((void **)net->mib.icmpmsg_statistics, + sizeof(struct icmpmsg_mib)) < 0) + goto err_icmpmsg_mib; tcp_mib_init(net); return 0; +err_icmpmsg_mib: + snmp_mib_free((void **)net->mib.icmp_statistics); err_icmp_mib: snmp_mib_free((void **)net->mib.udplite_statistics); err_udplite_mib: @@ -1378,6 +1383,7 @@ err_tcp_mib: static __net_exit void ipv4_mib_exit_net(struct net *net) { + snmp_mib_free((void **)net->mib.icmpmsg_statistics); snmp_mib_free((void **)net->mib.icmp_statistics); snmp_mib_free((void **)net->mib.udplite_statistics); snmp_mib_free((void **)net->mib.udp_statistics); @@ -1393,18 +1399,12 @@ static __net_initdata struct pernet_operations ipv4_mib_ops = { static int __init init_ipv4_mibs(void) { - if (snmp_mib_init((void **)icmpmsg_statistics, - sizeof(struct icmpmsg_mib)) < 0) - goto err_icmpmsg_mib; - if (register_pernet_subsys(&ipv4_mib_ops)) goto err_net; return 0; err_net: - snmp_mib_free((void **)icmpmsg_statistics); -err_icmpmsg_mib: return -ENOMEM; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 33f958902d9..860558633b2 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -111,11 +111,6 @@ struct icmp_bxm { unsigned char optbuf[40]; }; -/* - * Statistics - */ -DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics) __read_mostly; - /* An array of errno for error messages from dest unreach. 
*/ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6b43cce0558..e11144b0134 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -270,7 +270,7 @@ static void icmpmsg_put(struct seq_file *seq) count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { - if (snmp_fold_field((void **) icmpmsg_statistics, i)) + if (snmp_fold_field((void **) init_net.mib.icmpmsg_statistics, i)) out[count++] = i; if (count < PERLINE) continue; @@ -282,7 +282,7 @@ static void icmpmsg_put(struct seq_file *seq) seq_printf(seq, "\nIcmpMsg: "); for (j = 0; j < PERLINE; ++j) seq_printf(seq, " %lu", - snmp_fold_field((void **) icmpmsg_statistics, + snmp_fold_field((void **) init_net.mib.icmpmsg_statistics, out[j])); seq_putc(seq, '\n'); } @@ -294,7 +294,7 @@ static void icmpmsg_put(struct seq_file *seq) seq_printf(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", snmp_fold_field((void **) - icmpmsg_statistics, out[j])); + init_net.mib.icmpmsg_statistics, out[j])); } #undef PERLINE @@ -315,14 +315,14 @@ static void icmp_put(struct seq_file *seq) snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_INERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) icmpmsg_statistics, + snmp_fold_field((void **) init_net.mib.icmpmsg_statistics, icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_OUTMSGS), snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_OUTERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) icmpmsg_statistics, + snmp_fold_field((void **) init_net.mib.icmpmsg_statistics, icmpmibmap[i].index | 0x100)); } -- cgit v1.2.3 From d89cbbb1e69a3bfea38038fb058bd51013902ef3 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:04:51 -0700 Subject: ipv4: clean the init_ipv4_mibs error paths After moving all the stuff outside this function it looks a bit ugly - make it look better. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 36ff6dc769c..dd919d84285 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1399,13 +1399,7 @@ static __net_initdata struct pernet_operations ipv4_mib_ops = { static int __init init_ipv4_mibs(void) { - if (register_pernet_subsys(&ipv4_mib_ops)) - goto err_net; - - return 0; - -err_net: - return -ENOMEM; + return register_pernet_subsys(&ipv4_mib_ops); } static int ipv4_proc_init(void); -- cgit v1.2.3 From 7b7a9dfdf6ccda647f54ea5fa3bd0ac17a189eeb Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:05:17 -0700 Subject: proc: create /proc/net/netstat file in each net Now all the shown in it statistics is netnsizated, time to show it in appropriate net. The appropriate net init/exit ops already exist - they make the sockstat file per net - so just extend them. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/proc.c | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index e11144b0134..df8c4affde0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -413,6 +413,7 @@ static const struct file_operations snmp_seq_fops = { static int netstat_seq_show(struct seq_file *seq, void *v) { int i; + struct net *net = seq->private; seq_puts(seq, "TcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) @@ -421,7 +422,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)init_net.mib.net_statistics, + snmp_fold_field((void **)net->mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); @@ -431,7 +432,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)init_net.mib.ip_statistics, + snmp_fold_field((void **)net->mib.ip_statistics, snmp4_ipextstats_list[i].entry)); seq_putc(seq, '\n'); @@ -440,7 +441,32 @@ static int netstat_seq_show(struct seq_file *seq, void *v) static int netstat_seq_open(struct inode *inode, struct file *file) { - return single_open(file, netstat_seq_show, NULL); + int err; + struct net *net; + + err = -ENXIO; + net = get_proc_net(inode); + if (net == NULL) + goto err_net; + + err = single_open(file, netstat_seq_show, net); + if (err < 0) + goto err_open; + + return 0; + +err_open: + put_net(net); +err_net: + return err; +} + +static int netstat_seq_release(struct inode *inode, struct file *file) +{ + struct net *net = ((struct seq_file *)file->private_data)->private; + + put_net(net); + return single_release(inode, file); } static const struct file_operations netstat_seq_fops = { @@ -448,18 +474,26 @@ static const struct file_operations netstat_seq_fops = { .open = netstat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = netstat_seq_release, }; static __net_init int ip_proc_init_net(struct net *net) { if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops)) return -ENOMEM; + if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops)) + goto out_netstat; + return 0; + +out_netstat: + proc_net_remove(net, "sockstat"); + return -ENOMEM; } static __net_exit void ip_proc_exit_net(struct net *net) { + proc_net_remove(net, "netstat"); proc_net_remove(net, "sockstat"); } @@ -475,16 +509,11 @@ int __init ip_misc_proc_init(void) if (register_pernet_subsys(&ip_proc_ops)) goto out_pernet; - if (!proc_net_fops_create(&init_net, "netstat", S_IRUGO, &netstat_seq_fops)) - goto out_netstat; - if (!proc_net_fops_create(&init_net, "snmp", S_IRUGO, &snmp_seq_fops)) goto out_snmp; out: return rc; out_snmp: - proc_net_remove(&init_net, "netstat"); -out_netstat: unregister_pernet_subsys(&ip_proc_ops); out_pernet: rc = -ENOMEM; -- cgit v1.2.3 From 229bf0cbaa054f1502ab4ee219f05985d2c838d1 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:06:04 -0700 Subject: proc: create /proc/net/snmp file in each net All the statistics shown in this file have been made per-net already. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/proc.c | 70 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 20 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index df8c4affde0..367b81f6f57 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -266,11 +266,12 @@ static void icmpmsg_put(struct seq_file *seq) int j, i, count; static int out[PERLINE]; + struct net *net = seq->private; count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { - if (snmp_fold_field((void **) init_net.mib.icmpmsg_statistics, i)) + if (snmp_fold_field((void **) net->mib.icmpmsg_statistics, i)) out[count++] = i; if (count < PERLINE) continue; @@ -282,7 +283,7 @@ static void icmpmsg_put(struct seq_file *seq) seq_printf(seq, "\nIcmpMsg: "); for (j = 0; j < PERLINE; ++j) seq_printf(seq, " %lu", - snmp_fold_field((void **) init_net.mib.icmpmsg_statistics, + snmp_fold_field((void **) net->mib.icmpmsg_statistics, out[j])); seq_putc(seq, '\n'); } @@ -294,7 +295,7 @@ static void icmpmsg_put(struct seq_file *seq) seq_printf(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", snmp_fold_field((void **) - init_net.mib.icmpmsg_statistics, out[j])); + net->mib.icmpmsg_statistics, out[j])); } #undef PERLINE @@ -303,6 +304,7 @@ static void icmpmsg_put(struct seq_file *seq) static void icmp_put(struct seq_file *seq) { int i; + struct net *net = seq->private; seq_puts(seq, "\nIcmp: InMsgs InErrors"); for (i=0; icmpmibmap[i].name != NULL; i++) @@ -311,18 +313,18 @@ static void icmp_put(struct seq_file *seq) for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu", - snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_INERRORS)); + snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) init_net.mib.icmpmsg_statistics, + snmp_fold_field((void **) net->mib.icmpmsg_statistics, icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", - snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field((void **) init_net.mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) init_net.mib.icmpmsg_statistics, + snmp_fold_field((void **) net->mib.icmpmsg_statistics, icmpmibmap[i].index | 0x100)); } @@ -332,6 +334,7 @@ static void icmp_put(struct seq_file *seq) static int snmp_seq_show(struct seq_file *seq, void *v) { int i; + struct net *net = seq->private; seq_puts(seq, "Ip: Forwarding DefaultTTL"); @@ -344,7 +347,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)init_net.mib.ip_statistics, + snmp_fold_field((void **)net->mib.ip_statistics, snmp4_ipstats_list[i].entry)); icmp_put(seq); /* RFC 2011 compatibility */ @@ -359,11 +362,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - snmp_fold_field((void **)init_net.mib.tcp_statistics, + snmp_fold_field((void 
**)net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - snmp_fold_field((void **)init_net.mib.tcp_statistics, + snmp_fold_field((void **)net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); } @@ -374,7 +377,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)init_net.mib.udp_statistics, + snmp_fold_field((void **)net->mib.udp_statistics, snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ @@ -385,7 +388,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)init_net.mib.udplite_statistics, + snmp_fold_field((void **)net->mib.udplite_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); @@ -394,7 +397,32 @@ static int snmp_seq_show(struct seq_file *seq, void *v) static int snmp_seq_open(struct inode *inode, struct file *file) { - return single_open(file, snmp_seq_show, NULL); + int err; + struct net *net; + + err = -ENXIO; + net = get_proc_net(inode); + if (net == NULL) + goto err_net; + + err = single_open(file, snmp_seq_show, net); + if (err < 0) + goto err_open; + + return 0; + +err_open: + put_net(net); +err_net: + return err; +} + +static int snmp_seq_release(struct inode *inode, struct file *file) +{ + struct net *net = ((struct seq_file *)file->private_data)->private; + + put_net(net); + return single_release(inode, file); } static const struct file_operations snmp_seq_fops = { @@ -402,7 +430,7 @@ static const struct file_operations snmp_seq_fops = { .open = snmp_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = snmp_seq_release, }; @@ -483,9 +511,13 @@ static __net_init int ip_proc_init_net(struct net *net) return -ENOMEM; if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops)) goto out_netstat; + if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops)) + goto out_snmp; return 0; +out_snmp: + proc_net_remove(net, "netstat"); out_netstat: proc_net_remove(net, "sockstat"); return -ENOMEM; @@ -493,6 +525,7 @@ out_netstat: static __net_exit void ip_proc_exit_net(struct net *net) { + proc_net_remove(net, "snmp"); proc_net_remove(net, "netstat"); proc_net_remove(net, "sockstat"); } @@ -509,12 +542,9 @@ int __init ip_misc_proc_init(void) if (register_pernet_subsys(&ip_proc_ops)) goto out_pernet; - if (!proc_net_fops_create(&init_net, "snmp", S_IRUGO, &snmp_seq_fops)) - goto out_snmp; out: return rc; -out_snmp: - unregister_pernet_subsys(&ip_proc_ops); + out_pernet: rc = -ENOMEM; goto out; -- cgit v1.2.3 From 8e3461d01bdbc3dbf993448ed9ad4acaaeb6495d Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:06:26 -0700 Subject: proc: show per-net ip_devconf.forwarding in /proc/net/snmp This one has become per-net long ago, but the appropriate file is per-net only now. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 367b81f6f57..120e1f7461d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -342,7 +342,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", - IPV4_DEVCONF_ALL(&init_net, FORWARDING) ? 1 : 2, + IPV4_DEVCONF_ALL(net, FORWARDING) ? 
1 : 2, sysctl_ip_default_ttl); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) -- cgit v1.2.3 From 60bdde95807e982a824be9cfdd35055cc721a88a Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:06:50 -0700 Subject: proc: clean the ip_misc_proc_init and ip_proc_init_net error paths After all this stuff is moved outside, this function can look better. Besides, I tuned the error path in ip_proc_init_net to make it have only 2 exit points, not 3. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/proc.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 120e1f7461d..554390431a4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -508,7 +508,7 @@ static const struct file_operations netstat_seq_fops = { static __net_init int ip_proc_init_net(struct net *net) { if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops)) - return -ENOMEM; + goto out_sockstat; if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops)) goto out_netstat; if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops)) @@ -520,6 +520,7 @@ out_snmp: proc_net_remove(net, "netstat"); out_netstat: proc_net_remove(net, "sockstat"); +out_sockstat: return -ENOMEM; } @@ -537,16 +538,6 @@ static __net_initdata struct pernet_operations ip_proc_ops = { int __init ip_misc_proc_init(void) { - int rc = 0; - - if (register_pernet_subsys(&ip_proc_ops)) - goto out_pernet; - -out: - return rc; - -out_pernet: - rc = -ENOMEM; - goto out; + return register_pernet_subsys(&ip_proc_ops); } -- cgit v1.2.3 From de05c557b24c7dffc6d392e3db120cf11c9f6ae7 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:07:21 -0700 Subject: proc: consolidate per-net single_open callers There are already 7 of them - time to kill some duplicate code. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/fib_trie.c | 13 +----------- net/ipv4/proc.c | 57 +++-------------------------------------------------- 2 files changed, 4 insertions(+), 66 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index f155a66d6eb..6009df238ed 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2251,18 +2251,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) static int fib_triestat_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - net = get_proc_net(inode); - if (net == NULL) - return -ENXIO; - err = single_open(file, fib_triestat_seq_show, net); - if (err < 0) { - put_net(net); - return err; - } - return 0; + return single_open_net(inode, file, fib_triestat_seq_show); } static int fib_triestat_seq_release(struct inode *ino, struct file *f) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 554390431a4..daf5d3c80ce 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -71,24 +71,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) static int sockstat_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, sockstat_seq_show, net); - if (err < 0) - goto err_open; - - return 0; - -err_open: - put_net(net); -err_net: - return err; + return single_open_net(inode, file, sockstat_seq_show); } static int sockstat_seq_release(struct inode *inode, struct file *file) @@ -397,24 +380,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) static int snmp_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, snmp_seq_show, net); - if (err < 0) - goto err_open; - - return 0; - -err_open: - put_net(net); -err_net: - return err; + return single_open_net(inode, file, snmp_seq_show); } static int snmp_seq_release(struct inode *inode, struct file *file) @@ -469,24 +435,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) static int netstat_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, netstat_seq_show, net); - if (err < 0) - goto err_open; - - return 0; - -err_open: - put_net(net); -err_net: - return err; + return single_open_net(inode, file, netstat_seq_show); } static int netstat_seq_release(struct inode *inode, struct file *file) -- cgit v1.2.3 From b6fcbdb4f283f7ba67cec3cda6be23da8e959031 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:07:44 -0700 Subject: proc: consolidate per-net single-release callers They are symmetrical to single_open ones :) Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/ipv4/fib_trie.c | 9 +-------- net/ipv4/proc.c | 30 +++--------------------------- 2 files changed, 4 insertions(+), 35 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 6009df238ed..5cb72786a8a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2254,19 +2254,12 @@ static int fib_triestat_seq_open(struct inode *inode, struct file *file) return single_open_net(inode, file, fib_triestat_seq_show); } -static int fib_triestat_seq_release(struct inode *ino, struct file *f) -{ - struct seq_file *seq = f->private_data; - put_net(seq->private); - return single_release(ino, f); -} - static const struct file_operations fib_triestat_fops = { .owner = THIS_MODULE, .open = fib_triestat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = fib_triestat_seq_release, + .release = single_release_net, }; static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index daf5d3c80ce..834356ea99d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -74,20 +74,12 @@ static int sockstat_seq_open(struct inode *inode, struct file *file) return single_open_net(inode, file, sockstat_seq_show); } -static int sockstat_seq_release(struct inode *inode, struct file *file) -{ - struct net *net = ((struct seq_file *)file->private_data)->private; - - put_net(net); - return single_release(inode, file); -} - static const struct file_operations sockstat_seq_fops = { .owner = THIS_MODULE, .open = sockstat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = sockstat_seq_release, + .release = single_release_net, }; /* snmp items */ @@ -383,20 +375,12 @@ static int snmp_seq_open(struct inode *inode, struct file *file) return single_open_net(inode, file, snmp_seq_show); } -static int snmp_seq_release(struct inode *inode, struct file *file) -{ - struct net *net = ((struct seq_file *)file->private_data)->private; - - put_net(net); - return single_release(inode, file); -} - static const struct file_operations snmp_seq_fops = { .owner = THIS_MODULE, .open = snmp_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = snmp_seq_release, + .release = single_release_net, }; @@ -438,20 +422,12 @@ static int netstat_seq_open(struct inode *inode, struct file *file) return single_open_net(inode, file, netstat_seq_show); } -static int netstat_seq_release(struct inode *inode, struct file *file) -{ - struct net *net = ((struct seq_file *)file->private_data)->private; - - put_net(net); - return single_release(inode, file); -} - static const struct file_operations netstat_seq_fops = { .owner = THIS_MODULE, .open = netstat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = netstat_seq_release, + .release = single_release_net, }; static __net_init int ip_proc_init_net(struct net *net) -- cgit v1.2.3 From c1e20f7c8b9ccbafc9ea78f2b406738728ce6b81 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 18 Jul 2008 23:02:15 -0700 Subject: tcp: RTT metrics scaling Some of the metrics (RTT, RTTVAR and RTAX_RTO_MIN) are stored in kernel units (jiffies) and this leaks out through the netlink API to user space where the units for jiffies are unknown. This patches changes the kernel to convert to/from milliseconds. This changes the ABI, but milliseconds seemed like the most natural unit for these parameters. Values available via syscall in /proc/net/rt_cache and netlink will be in milliseconds. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fac49a6e161..88810bc0137 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -602,7 +602,7 @@ static u32 tcp_rto_min(struct sock *sk) u32 rto_min = TCP_RTO_MIN; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) - rto_min = dst_metric(dst, RTAX_RTO_MIN); + rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); return rto_min; } @@ -729,6 +729,7 @@ void tcp_update_metrics(struct sock *sk) if (dst && (dst->flags & DST_HOST)) { const struct inet_connection_sock *icsk = inet_csk(sk); int m; + unsigned long rtt; if (icsk->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? @@ -740,7 +741,8 @@ void tcp_update_metrics(struct sock *sk) return; } - m = dst_metric(dst, RTAX_RTT) - tp->srtt; + rtt = dst_metric_rtt(dst, RTAX_RTT); + m = rtt - tp->srtt; /* If newly calculated rtt larger than stored one, * store new one. Otherwise, use EWMA. Remember, @@ -748,12 +750,13 @@ void tcp_update_metrics(struct sock *sk) */ if (!(dst_metric_locked(dst, RTAX_RTT))) { if (m <= 0) - dst->metrics[RTAX_RTT - 1] = tp->srtt; + set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); else - dst->metrics[RTAX_RTT - 1] -= (m >> 3); + set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); } if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { + unsigned long var; if (m < 0) m = -m; @@ -762,11 +765,13 @@ void tcp_update_metrics(struct sock *sk) if (m < tp->mdev) m = tp->mdev; - if (m >= dst_metric(dst, RTAX_RTTVAR)) - dst->metrics[RTAX_RTTVAR - 1] = m; + var = dst_metric_rtt(dst, RTAX_RTTVAR); + if (m >= var) + var = m; else - dst->metrics[RTAX_RTTVAR-1] -= - (dst_metric(dst, RTAX_RTTVAR) - m)>>2; + var -= (var - m) >> 2; + + set_dst_metric_rtt(dst, RTAX_RTTVAR, var); } if (tp->snd_ssthresh >= 0xFFFF) { @@ -897,7 +902,7 @@ static void tcp_init_metrics(struct sock *sk) if (dst_metric(dst, RTAX_RTT) == 0) goto reset; - if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) + if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -914,12 +919,12 @@ static void tcp_init_metrics(struct sock *sk) * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ - if (dst_metric(dst, RTAX_RTT) > tp->srtt) { - tp->srtt = dst_metric(dst, RTAX_RTT); + if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { + tp->srtt = dst_metric_rtt(dst, RTAX_RTT); tp->rtt_seq = tp->snd_nxt; } - if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) { - tp->mdev = dst_metric(dst, RTAX_RTTVAR); + if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { + tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); } tcp_set_rto(sk); -- cgit v1.2.3 From 336d3262df71fcd2661180bb35d5ea41b4cbca58 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 18 Jul 2008 23:07:09 -0700 Subject: sctp: remove unnecessary byteshifting, calculate directly in big-endian Signed-off-by: Harvey Harrison Signed-off-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/nf_nat_proto_sctp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c index 82e4c0e286b..65e470bc612 100644 --- a/net/ipv4/netfilter/nf_nat_proto_sctp.c +++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c @@ -36,7 +36,7 @@ sctp_manip_pkt(struct sk_buff *skb, sctp_sctphdr_t *hdr; unsigned int hdroff = iphdroff + iph->ihl*4; __be32 oldip, newip; - u32 crc32; + __be32 crc32; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) return false; @@ -61,7 +61,7 @@ sctp_manip_pkt(struct sk_buff *skb, crc32 = sctp_update_cksum((u8 *)skb->data, skb_headlen(skb), crc32); crc32 = sctp_end_cksum(crc32); - hdr->checksum = htonl(crc32); + hdr->checksum = crc32; return true; } -- cgit v1.2.3 From 49a72dfb8814c2d65bd9f8c9c6daf6395a1ec58d Mon Sep 17 00:00:00 2001 From: Adam Langley Date: Sat, 19 Jul 2008 00:01:42 -0700 Subject: tcp: Fix MD5 signatures for non-linear skbs Currently, the MD5 code assumes that the SKBs are linear and, in the case that they aren't, happily goes off and hashes off the end of the SKB and into random memory. Reported by Stephen Hemminger in [1]. Advice thanks to Stephen and Evgeniy Polyakov. Also includes a couple of missed route_caps from Stephen's patch in [2]. [1] http://marc.info/?l=linux-netdev&m=121445989106145&w=2 [2] http://marc.info/?l=linux-netdev&m=121459157816964&w=2 Signed-off-by: Adam Langley Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 127 ++++++++++++++++++++------------------------- net/ipv4/tcp_ipv4.c | 140 ++++++++++++++++++++++++++++++-------------------- net/ipv4/tcp_output.c | 14 ++--- 3 files changed, 147 insertions(+), 134 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 827e6132af5..0b491bf03db 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2465,76 +2465,6 @@ static unsigned long tcp_md5sig_users; static struct tcp_md5sig_pool **tcp_md5sig_pool; static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); -int tcp_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, - int bplen, - struct tcphdr *th, unsigned int tcplen, - struct tcp_md5sig_pool *hp) -{ - struct scatterlist sg[4]; - __u16 data_len; - int block = 0; - __sum16 cksum; - struct hash_desc *desc = &hp->md5_desc; - int err; - unsigned int nbytes = 0; - - sg_init_table(sg, 4); - - /* 1. The TCP pseudo-header */ - sg_set_buf(&sg[block++], &hp->md5_blk, bplen); - nbytes += bplen; - - /* 2. The TCP header, excluding options, and assuming a - * checksum of zero - */ - cksum = th->check; - th->check = 0; - sg_set_buf(&sg[block++], th, sizeof(*th)); - nbytes += sizeof(*th); - - /* 3. The TCP segment data (if any) */ - data_len = tcplen - (th->doff << 2); - if (data_len > 0) { - u8 *data = (u8 *)th + (th->doff << 2); - sg_set_buf(&sg[block++], data, data_len); - nbytes += data_len; - } - - /* 4. 
an independently-specified key or password, known to both - * TCPs and presumably connection-specific - */ - sg_set_buf(&sg[block++], key->key, key->keylen); - nbytes += key->keylen; - - sg_mark_end(&sg[block - 1]); - - /* Now store the hash into the packet */ - err = crypto_hash_init(desc); - if (err) { - if (net_ratelimit()) - printk(KERN_WARNING "%s(): hash_init failed\n", __func__); - return -1; - } - err = crypto_hash_update(desc, sg, nbytes); - if (err) { - if (net_ratelimit()) - printk(KERN_WARNING "%s(): hash_update failed\n", __func__); - return -1; - } - err = crypto_hash_final(desc, md5_hash); - if (err) { - if (net_ratelimit()) - printk(KERN_WARNING "%s(): hash_final failed\n", __func__); - return -1; - } - - /* Reset header */ - th->check = cksum; - - return 0; -} -EXPORT_SYMBOL(tcp_calc_md5_hash); - static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) { int cpu; @@ -2658,6 +2588,63 @@ void __tcp_put_md5sig_pool(void) } EXPORT_SYMBOL(__tcp_put_md5sig_pool); + +int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, + struct tcphdr *th) +{ + struct scatterlist sg; + int err; + + __sum16 old_checksum = th->check; + th->check = 0; + /* options aren't included in the hash */ + sg_init_one(&sg, th, sizeof(struct tcphdr)); + err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); + th->check = old_checksum; + return err; +} + +EXPORT_SYMBOL(tcp_md5_hash_header); + +int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, + struct sk_buff *skb, unsigned header_len) +{ + struct scatterlist sg; + const struct tcphdr *tp = tcp_hdr(skb); + struct hash_desc *desc = &hp->md5_desc; + unsigned i; + const unsigned head_data_len = skb_headlen(skb) > header_len ? + skb_headlen(skb) - header_len : 0; + const struct skb_shared_info *shi = skb_shinfo(skb); + + sg_init_table(&sg, 1); + + sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); + if (crypto_hash_update(desc, &sg, head_data_len)) + return 1; + + for (i = 0; i < shi->nr_frags; ++i) { + const struct skb_frag_struct *f = &shi->frags[i]; + sg_set_page(&sg, f->page, f->size, f->page_offset); + if (crypto_hash_update(desc, &sg, f->size)) + return 1; + } + + return 0; +} + +EXPORT_SYMBOL(tcp_md5_hash_skb_data); + +int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) +{ + struct scatterlist sg; + + sg_init_one(&sg, key->key, key->keylen); + return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); +} + +EXPORT_SYMBOL(tcp_md5_hash_key); + #endif void tcp_done(struct sock *sk) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 29adc668ad5..5400d75ff17 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -87,9 +87,8 @@ int sysctl_tcp_low_latency __read_mostly; #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr); -static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, - __be32 saddr, __be32 daddr, - struct tcphdr *th, unsigned int tcplen); +static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, + __be32 daddr, __be32 saddr, struct tcphdr *th); #else static inline struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) @@ -583,11 +582,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; rep.th.doff = arg.iov[0].iov_len / 4; - tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1], - key, - ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - &rep.th, arg.iov[0].iov_len); + tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], + key, 
ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, &rep.th); } #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, @@ -657,11 +654,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; rep.th.doff = arg.iov[0].iov_len/4; - tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset], - key, - ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - &rep.th, arg.iov[0].iov_len); + tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], + key, ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, &rep.th); } #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, @@ -989,28 +984,16 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, newkey, cmd.tcpm_keylen); } -static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, - __be32 saddr, __be32 daddr, - struct tcphdr *th, - unsigned int tcplen) +static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, + __be32 daddr, __be32 saddr, int nbytes) { - struct tcp_md5sig_pool *hp; struct tcp4_pseudohdr *bp; - int err; - - /* - * Okay, so RFC2385 is turned on for this connection, - * so we need to generate the MD5 hash for the packet now. - */ - - hp = tcp_get_md5sig_pool(); - if (!hp) - goto clear_hash_noput; + struct scatterlist sg; bp = &hp->md5_blk.ip4; /* - * The TCP pseudo-header (in the order: source IP address, + * 1. the TCP pseudo-header (in the order: source IP address, * destination IP address, zero-padded protocol number, and * segment length) */ @@ -1018,48 +1001,95 @@ static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, bp->daddr = daddr; bp->pad = 0; bp->protocol = IPPROTO_TCP; - bp->len = htons(tcplen); + bp->len = cpu_to_be16(nbytes); - err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp), - th, tcplen, hp); - if (err) + sg_init_one(&sg, bp, sizeof(*bp)); + return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); +} + +static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, + __be32 daddr, __be32 saddr, struct tcphdr *th) +{ + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; + + hp = tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; + + if (crypto_hash_init(desc)) + goto clear_hash; + if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) + goto clear_hash; + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) goto clear_hash; - /* Free up the crypto pool */ tcp_put_md5sig_pool(); -out: return 0; + clear_hash: tcp_put_md5sig_pool(); clear_hash_noput: memset(md5_hash, 0, 16); - goto out; + return 1; } -int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, - struct sock *sk, - struct dst_entry *dst, - struct request_sock *req, - struct tcphdr *th, - unsigned int tcplen) +int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, + struct sock *sk, struct request_sock *req, + struct sk_buff *skb) { + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; + struct tcphdr *th = tcp_hdr(skb); __be32 saddr, daddr; if (sk) { saddr = inet_sk(sk)->saddr; daddr = inet_sk(sk)->daddr; + } else if (req) { + saddr = inet_rsk(req)->loc_addr; + daddr = inet_rsk(req)->rmt_addr; } else { - struct rtable *rt = (struct rtable *)dst; - BUG_ON(!rt); - saddr = rt->rt_src; - daddr = rt->rt_dst; + const struct iphdr *iph = ip_hdr(skb); + saddr = iph->saddr; + daddr = iph->daddr; } - return tcp_v4_do_calc_md5_hash(md5_hash, key, - saddr, daddr, - th, tcplen); + + hp = 
tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; + + if (crypto_hash_init(desc)) + goto clear_hash; + + if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) + goto clear_hash; + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) + goto clear_hash; + + tcp_put_md5sig_pool(); + return 0; + +clear_hash: + tcp_put_md5sig_pool(); +clear_hash_noput: + memset(md5_hash, 0, 16); + return 1; } -EXPORT_SYMBOL(tcp_v4_calc_md5_hash); +EXPORT_SYMBOL(tcp_v4_md5_hash_skb); static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) { @@ -1104,10 +1134,9 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) /* Okay, so this is hash_expected and hash_location - * so we need to calculate the checksum. */ - genhash = tcp_v4_do_calc_md5_hash(newhash, - hash_expected, - iph->saddr, iph->daddr, - th, skb->len); + genhash = tcp_v4_md5_hash_skb(newhash, + hash_expected, + NULL, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { if (net_ratelimit()) { @@ -1356,6 +1385,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (newkey != NULL) tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr, newkey, key->keylen); + newsk->sk_route_caps &= ~NETIF_F_GSO_MASK; } #endif @@ -1719,7 +1749,7 @@ struct inet_connection_sock_af_ops ipv4_specific = { #ifdef CONFIG_TCP_MD5SIG static struct tcp_sock_af_ops tcp_sock_ipv4_specific = { .md5_lookup = tcp_v4_md5_lookup, - .calc_md5_hash = tcp_v4_calc_md5_hash, + .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_add = tcp_v4_md5_add_func, .md5_parse = tcp_v4_parse_md5_keys, }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 36a19707f67..958ff486165 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -540,8 +540,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, * room for it. */ md5 = tp->af_specific->md5_lookup(sk, sk); - if (md5) + if (md5) { tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + } #endif skb_push(skb, tcp_header_size); @@ -602,10 +604,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* Calculate the MD5 hash, as we have all we need now */ if (md5) { tp->af_specific->calc_md5_hash(md5_hash_location, - md5, - sk, NULL, NULL, - tcp_hdr(skb), - skb->len); + md5, sk, NULL, skb); } #endif @@ -2264,10 +2263,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* Okay, we have all we need - do the md5 hash if needed */ if (md5) { tp->af_specific->calc_md5_hash(md5_hash_location, - md5, - NULL, dst, req, - tcp_hdr(skb), - skb->len); + md5, NULL, req, skb); } #endif -- cgit v1.2.3 From 33ad798c924b4a1afad3593f2796d465040aadd5 Mon Sep 17 00:00:00 2001 From: Adam Langley Date: Sat, 19 Jul 2008 00:04:31 -0700 Subject: tcp: options clean up This should fix the following bugs: * Connections with MD5 signatures produce invalid packets whenever SACK options are included * MD5 signatures are counted twice in the MSS calculations Behaviour changes: * A SYN with MD5 + SACK + TS elicits a SYNACK with MD5 + SACK This is because we can't fit any SACK blocks in a packet with MD5 + TS options. There was discussion about disabling SACK rather than TS in order to fit in better with old, buggy kernels, but that was deemed to be unnecessary. 
* SYNs with MD5 don't include a TS option See above. Additionally, it removes a bunch of duplicated logic for calculating options, which should help avoid these sort of issues in the future. Signed-off-by: Adam Langley Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 432 +++++++++++++++++++++++++++----------------------- 1 file changed, 236 insertions(+), 196 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 958ff486165..1fa683c0ba9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -345,28 +345,82 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->end_seq = seq; } -static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, - __u32 tstamp, __u8 **md5_hash) -{ - if (tp->rx_opt.tstamp_ok) { +#define OPTION_SACK_ADVERTISE (1 << 0) +#define OPTION_TS (1 << 1) +#define OPTION_MD5 (1 << 2) + +struct tcp_out_options { + u8 options; /* bit field of OPTION_* */ + u8 ws; /* window scale, 0 to disable */ + u8 num_sack_blocks; /* number of SACK blocks to include */ + u16 mss; /* 0 to disable */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ +}; + +static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, + const struct tcp_out_options *opts, + __u8 **md5_hash) { + if (unlikely(OPTION_MD5 & opts->options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); - *ptr++ = htonl(tp->rx_opt.ts_recent); + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + *md5_hash = (__u8 *)ptr; + ptr += 4; + } else { + *md5_hash = NULL; } - if (tp->rx_opt.eff_sacks) { - struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; + + if (likely(OPTION_TS & opts->options)) { + if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { + *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + } else { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + } + *ptr++ = htonl(opts->tsval); + *ptr++ = htonl(opts->tsecr); + } + + if (unlikely(opts->mss)) { + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + } + + if (unlikely(OPTION_SACK_ADVERTISE & opts->options && + !(OPTION_TS & opts->options))) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | + TCPOLEN_SACK_PERM); + } + + if (unlikely(opts->ws)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + opts->ws); + } + + if (unlikely(opts->num_sack_blocks)) { + struct tcp_sack_block *sp = tp->rx_opt.dsack ? 
+ tp->duplicate_sack : tp->selective_acks; int this_sack; *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK << 8) | - (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * + (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); - for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { + for (this_sack = 0; this_sack < opts->num_sack_blocks; + ++this_sack) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); } @@ -376,81 +430,137 @@ static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, tp->rx_opt.eff_sacks--; } } +} + +static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + struct tcp_sock *tp = tcp_sk(sk); + unsigned size = 0; + #ifdef CONFIG_TCP_MD5SIG - if (md5_hash) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - *md5_hash = (__u8 *)ptr; + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (*md5) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; } +#else + *md5 = NULL; #endif + + /* We always get an MSS option. The option bytes which will be seen in + * normal data packets should timestamps be used, must be in the MSS + * advertised. But we subtract them from tp->mss_cache so that + * calculations in tcp_sendmsg are simpler etc. So account for this + * fact here if necessary. If we don't do this correctly, as a + * receiver we won't recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK rules correctly. + * SACKs don't matter, we never delay an ACK when we have any of those + * going out. */ + opts->mss = tcp_advertise_mss(sk); + size += TCPOLEN_MSS_ALIGNED; + + if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { + opts->options |= OPTION_TS; + opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + if (likely(sysctl_tcp_window_scaling)) { + opts->ws = tp->rx_opt.rcv_wscale; + size += TCPOLEN_WSCALE_ALIGNED; + } + if (likely(sysctl_tcp_sack)) { + opts->options |= OPTION_SACK_ADVERTISE; + if (unlikely(!OPTION_TS & opts->options)) + size += TCPOLEN_SACKPERM_ALIGNED; + } + + return size; } -/* Construct a tcp options header for a SYN or SYN_ACK packet. - * If this is every changed make sure to change the definition of - * MAX_SYN_SIZE to match the new maximum number of options that you - * can generate. - * - * Note - that with the RFC2385 TCP option, we make room for the - * 16 byte MD5 hash. This will be filled in later, so the pointer for the - * location to be filled is passed back up. - */ -static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, - int offer_wscale, int wscale, __u32 tstamp, - __u32 ts_recent, __u8 **md5_hash) -{ - /* We always get an MSS option. - * The option bytes which will be seen in normal data - * packets should timestamps be used, must be in the MSS - * advertised. But we subtract them from tp->mss_cache so - * that calculations in tcp_sendmsg are simpler etc. - * So account for this fact here if necessary. If we - * don't do this correctly, as a receiver we won't - * recognize data packets as being full sized when we - * should, and thus we won't abide by the delayed ACK - * rules correctly. - * SACKs don't matter, we never delay an ACK when we - * have any of those going out. 
- */ - *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); - if (ts) { - if (sack) - *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | - (TCPOLEN_SACK_PERM << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); - else - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); /* TSVAL */ - *ptr++ = htonl(ts_recent); /* TSECR */ - } else if (sack) - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_SACK_PERM << 8) | - TCPOLEN_SACK_PERM); - if (offer_wscale) - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_WINDOW << 16) | - (TCPOLEN_WINDOW << 8) | - (wscale)); +static unsigned tcp_synack_options(struct sock *sk, + struct request_sock *req, + unsigned mss, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + unsigned size = 0; + struct inet_request_sock *ireq = inet_rsk(req); + char doing_ts; + #ifdef CONFIG_TCP_MD5SIG - /* - * If MD5 is enabled, then we set the option, and include the size - * (always 18). The actual MD5 hash is added just before the - * packet is sent. - */ - if (md5_hash) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - *md5_hash = (__u8 *)ptr; + *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); + if (*md5) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; } +#else + *md5 = NULL; #endif + + /* we can't fit any SACK blocks in a packet with MD5 + TS + options. There was discussion about disabling SACK rather than TS in + order to fit in better with old, buggy kernels, but that was deemed + to be unnecessary. */ + doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok); + + opts->mss = mss; + size += TCPOLEN_MSS_ALIGNED; + + if (likely(ireq->wscale_ok)) { + opts->ws = ireq->rcv_wscale; + size += TCPOLEN_WSCALE_ALIGNED; + } + if (likely(doing_ts)) { + opts->options |= OPTION_TS; + opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsecr = req->ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + if (likely(ireq->sack_ok)) { + opts->options |= OPTION_SACK_ADVERTISE; + if (unlikely(!doing_ts)) + size += TCPOLEN_SACKPERM_ALIGNED; + } + + return size; +} + +static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; + struct tcp_sock *tp = tcp_sk(sk); + unsigned size = 0; + +#ifdef CONFIG_TCP_MD5SIG + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (unlikely(*md5)) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; + } +#else + *md5 = NULL; +#endif + + if (likely(tp->rx_opt.tstamp_ok)) { + opts->options |= OPTION_TS; + opts->tsval = tcb ? 
tcb->when : 0; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + + if (unlikely(tp->rx_opt.eff_sacks)) { + const unsigned remaining = MAX_TCP_OPTION_SPACE - size; + opts->num_sack_blocks = + min_t(unsigned, tp->rx_opt.eff_sacks, + (remaining - TCPOLEN_SACK_BASE_ALIGNED) / + TCPOLEN_SACK_PERBLOCK); + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + } + + return size; } /* This routine actually transmits TCP packets queued in by @@ -471,13 +581,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; - int tcp_header_size; -#ifdef CONFIG_TCP_MD5SIG + struct tcp_out_options opts; + unsigned tcp_options_size, tcp_header_size; struct tcp_md5sig_key *md5; __u8 *md5_hash_location; -#endif struct tcphdr *th; - int sysctl_flags; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); @@ -500,52 +608,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, inet = inet_sk(sk); tp = tcp_sk(sk); tcb = TCP_SKB_CB(skb); - tcp_header_size = tp->tcp_header_len; - -#define SYSCTL_FLAG_TSTAMPS 0x1 -#define SYSCTL_FLAG_WSCALE 0x2 -#define SYSCTL_FLAG_SACK 0x4 + memset(&opts, 0, sizeof(opts)); - sysctl_flags = 0; - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { - tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if (sysctl_tcp_timestamps) { - tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_TSTAMPS; - } - if (sysctl_tcp_window_scaling) { - tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_WSCALE; - } - if (sysctl_tcp_sack) { - sysctl_flags |= SYSCTL_FLAG_SACK; - if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; - } - } else if (unlikely(tp->rx_opt.eff_sacks)) { - /* A SACK is 2 pad bytes, a 2 byte header, plus - * 2 32-bit sequence numbers for each SACK block. - */ - tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * - TCPOLEN_SACK_PERBLOCK)); - } + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) + tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); + else + tcp_options_size = tcp_established_options(sk, skb, &opts, + &md5); + tcp_header_size = tcp_options_size + sizeof(struct tcphdr); if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); -#ifdef CONFIG_TCP_MD5SIG - /* - * Are we doing MD5 on this segment? If so - make - * room for it. - */ - md5 = tp->af_specific->md5_lookup(sk, sk); - if (md5) { - tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - } -#endif - skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_set_owner_w(skb, sk); @@ -576,33 +650,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->urg = 1; } - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { - tcp_syn_build_options((__be32 *)(th + 1), - tcp_advertise_mss(sk), - (sysctl_flags & SYSCTL_FLAG_TSTAMPS), - (sysctl_flags & SYSCTL_FLAG_SACK), - (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rx_opt.rcv_wscale, - tcb->when, - tp->rx_opt.ts_recent, - -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : -#endif - NULL); - } else { - tcp_build_and_update_options((__be32 *)(th + 1), - tp, tcb->when, -#ifdef CONFIG_TCP_MD5SIG - md5 ? 
&md5_hash_location : -#endif - NULL); + tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); - } #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; tp->af_specific->calc_md5_hash(md5_hash_location, md5, sk, NULL, skb); } @@ -626,10 +681,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_enter_cwr(sk, 1); return net_xmit_eval(err); - -#undef SYSCTL_FLAG_TSTAMPS -#undef SYSCTL_FLAG_WSCALE -#undef SYSCTL_FLAG_SACK } /* This routine just queue's the buffer @@ -970,6 +1021,9 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) u32 mss_now; u16 xmit_size_goal; int doing_tso = 0; + unsigned header_len; + struct tcp_out_options opts; + struct tcp_md5sig_key *md5; mss_now = tp->mss_cache; @@ -982,14 +1036,16 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now = tcp_sync_mss(sk, mtu); } - if (tp->rx_opt.eff_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - -#ifdef CONFIG_TCP_MD5SIG - if (tp->af_specific->md5_lookup(sk, sk)) - mss_now -= TCPOLEN_MD5SIG_ALIGNED; -#endif + header_len = tcp_established_options(sk, NULL, &opts, &md5) + + sizeof(struct tcphdr); + /* The mss_cache is sized based on tp->tcp_header_len, which assumes + * some common options. If this is an odd packet (because we have SACK + * blocks etc) then our calculated header_len will be different, and + * we have to adjust mss_now correspondingly */ + if (header_len != tp->tcp_header_len) { + int delta = (int) header_len - tp->tcp_header_len; + mss_now -= delta; + } xmit_size_goal = mss_now; @@ -2179,11 +2235,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th; int tcp_header_size; + struct tcp_out_options opts; struct sk_buff *skb; -#ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *md5; __u8 *md5_hash_location; -#endif skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) @@ -2194,18 +2249,27 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb->dst = dst_clone(dst); - tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + - (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + - /* SACK_PERM is in the place of NOP NOP of TS */ - ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ + __u8 rcv_wscale; + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + ireq->wscale_ok, + &rcv_wscale); + ireq->rcv_wscale = rcv_wscale; + } + + memset(&opts, 0, sizeof(opts)); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_header_size = tcp_synack_options(sk, req, + dst_metric(dst, RTAX_ADVMSS), + skb, &opts, &md5) + + sizeof(struct tcphdr); -#ifdef CONFIG_TCP_MD5SIG - /* Are we doing MD5 on this segment? 
If so - make room for it */ - md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); - if (md5) - tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; -#endif skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2223,19 +2287,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); - /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), - dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, - ireq->wscale_ok, - &rcv_wscale); - ireq->rcv_wscale = rcv_wscale; - } /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); @@ -2244,18 +2295,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); else #endif - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok, - ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale, - TCP_SKB_CB(skb)->when, - req->ts_recent, - ( -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : -#endif - NULL) - ); - + tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); th->doff = (tcp_header_size >> 2); TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); -- cgit v1.2.3 From 4389dded7767d24290463f2a8302ba3253ebdd56 Mon Sep 17 00:00:00 2001 From: Adam Langley Date: Sat, 19 Jul 2008 00:07:02 -0700 Subject: tcp: Remove redundant checks when setting eff_sacks Remove redundant checks when setting eff_sacks and make the number of SACKs a compile time constant. Now that the options code knows how many SACK blocks can fit in the header, we don't need to have the SACK code guessing at it. Signed-off-by: Adam Langley Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 88810bc0137..1f5e6049883 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1423,10 +1423,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); - struct tcp_sack_block sp[4]; + struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; struct sk_buff *skb; - int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE) >> 3; + int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; int reord = tp->packets_out; int flag = 0; @@ -3735,8 +3735,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, - 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1; } } @@ -3791,9 +3790,8 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) * Decrease num_sacks. 
*/ tp->rx_opt.num_sacks--; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + - tp->rx_opt.dsack, - 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + + tp->rx_opt.dsack; for (i = this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i + 1]; continue; @@ -3843,7 +3841,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * * If the sack array is full, forget about the last one. */ - if (this_sack >= 4) { + if (this_sack >= TCP_NUM_SACKS) { this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -3856,8 +3854,7 @@ new_sack: sp->start_seq = seq; sp->end_seq = end_seq; tp->rx_opt.num_sacks++; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, - 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; } /* RCV.NXT advances, some SACKs should be eaten. */ @@ -3894,9 +3891,8 @@ static void tcp_sack_remove(struct tcp_sock *tp) } if (num_sacks != tp->rx_opt.num_sacks) { tp->rx_opt.num_sacks = num_sacks; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + - tp->rx_opt.dsack, - 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + + tp->rx_opt.dsack; } } @@ -3975,8 +3971,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.dsack) { tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks, - 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; } /* Queue data for delivery to the user. -- cgit v1.2.3 From bdccc4ca13a639d759206c5b21ed73f8a813eaba Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 19 Jul 2008 00:15:13 -0700 Subject: tcp: fix kernel panic with listening_get_next # BUG: unable to handle kernel NULL pointer dereference at 0000000000000038 IP: [] listening_get_next+0x50/0x1b3 PGD 11e4b9067 PUD 11d16c067 PMD 0 Oops: 0000 [1] SMP last sysfs file: /sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_map CPU 3 Modules linked in: bridge ipv6 button battery ac loop dm_mod tg3 ext3 jbd edd fan thermal processor thermal_sys hwmon sg sata_svw libata dock serverworks sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table] Pid: 3368, comm: slpd Not tainted 2.6.26-rc2-mm1-lxc4 #1 RIP: 0010:[] [] listening_get_next+0x50/0x1b3 RSP: 0018:ffff81011e1fbe18 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8100be0ad3c0 RCX: ffff8100619f50c0 RDX: ffffffff82475be0 RSI: ffff81011d9ae6c0 RDI: ffff8100be0ad508 RBP: ffff81011f4f1240 R08: 00000000ffffffff R09: ffff8101185b6780 R10: 000000000000002d R11: ffffffff820fdbfa R12: ffff8100be0ad3c8 R13: ffff8100be0ad6a0 R14: ffff8100be0ad3c0 R15: ffffffff825b8ce0 FS: 00007f6a0ebd16d0(0000) GS:ffff81011f424540(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000038 CR3: 000000011dc20000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process slpd (pid: 3368, threadinfo ffff81011e1fa000, task ffff81011f4b8660) Stack: 00000000000002ee ffff81011f5a57c0 ffff81011f4f1240 ffff81011e1fbe90 0000000000001000 0000000000000000 00007fff16bf2590 ffffffff821ed9c8 ffff81011f5a57c0 ffff81011d9ae6c0 000000000000041a ffffffff820b0abd Call Trace: [] ? tcp_seq_next+0x34/0x7e [] ? seq_read+0x1aa/0x29d [] ? proc_reg_read+0x73/0x8e [] ? vfs_read+0xaa/0x152 [] ? sys_read+0x45/0x6e [] ? 
system_call_after_swapgs+0x7b/0x80 Code: 31 a9 25 00 e9 b5 00 00 00 ff 45 20 83 7d 0c 01 75 79 4c 8b 75 10 48 8b 0e eb 1d 48 8b 51 20 0f b7 45 08 39 02 75 0e 48 8b 41 28 <4c> 39 78 38 0f 84 93 00 00 00 48 8b 09 48 85 c9 75 de 8b 55 1c RIP [] listening_get_next+0x50/0x1b3 RSP CR2: 0000000000000038 This kernel panic appears with CONFIG_NET_NS=y. How to reproduce? On the buggy host (host A) * ip addr add 1.2.3.4/24 dev eth0 On a remote host (host B) * ip addr add 1.2.3.5/24 dev eth0 * iptables -A INPUT -p tcp -s 1.2.3.4 -j DROP * ssh 1.2.3.4 On host A: * netstat -ta or cat /proc/net/tcp This bug happens when reading /proc/net/tcp[6] when there is a req_sock at the SYN_RECV state. When a SYN is received the minisock is created and the sk field is set to NULL. In the listening_get_next function, we try to look at the field req->sk->sk_net. When looking at how to fix this bug, I noticed that it is useless to do the check for the minisock belonging to the namespace. A minisock belongs to a listen point and this one is per namespace, so when browsing the minisocks they are always per namespace. Signed-off-by: Daniel Lezcano Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5400d75ff17..a82df630756 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1892,8 +1892,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) req = req->dl_next; while (1) { while (req) { - if (req->rsk_ops->family == st->family && - net_eq(sock_net(req->sk), net)) { + if (req->rsk_ops->family == st->family) { cur = req; goto out; } -- cgit v1.2.3 From 721499e8931c5732202481ae24f2dfbf9910f129 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Sat, 19 Jul 2008 22:34:43 -0700 Subject: netns: Use net_eq() to compare net-namespaces for optimization. Without CONFIG_NET_NS, the namespace is always &init_net. The compiler will be able to omit namespace comparisons with this patch. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. 
Miller --- net/ipv4/igmp.c | 26 +++++++++++++------------- net/ipv4/ipconfig.c | 4 ++-- net/ipv4/ipmr.c | 2 +- net/ipv4/netfilter/ip_queue.c | 2 +- net/ipv4/netfilter/ipt_MASQUERADE.c | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 68e84a933e9..6203ece5360 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1196,7 +1196,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) + if (!net_eq(dev_net(in_dev->dev), &init_net)) return; for (im=in_dev->mc_list; im; im=im->next) { @@ -1278,7 +1278,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) + if (!net_eq(dev_net(in_dev->dev), &init_net)) return; for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { @@ -1308,7 +1308,7 @@ void ip_mc_down(struct in_device *in_dev) ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) + if (!net_eq(dev_net(in_dev->dev), &init_net)) return; for (i=in_dev->mc_list; i; i=i->next) @@ -1331,7 +1331,7 @@ void ip_mc_init_dev(struct in_device *in_dev) { ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) + if (!net_eq(dev_net(in_dev->dev), &init_net)) return; in_dev->mc_tomb = NULL; @@ -1357,7 +1357,7 @@ void ip_mc_up(struct in_device *in_dev) ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) + if (!net_eq(dev_net(in_dev->dev), &init_net)) return; ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); @@ -1376,7 +1376,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev) ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) + if (!net_eq(dev_net(in_dev->dev), &init_net)) return; /* Deactivate timers */ @@ -1760,7 +1760,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) if (!ipv4_is_multicast(addr)) return -EINVAL; - if (sock_net(sk) != &init_net) + if (!net_eq(sock_net(sk), &init_net)) return -EPROTONOSUPPORT; rtnl_lock(); @@ -1831,7 +1831,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) u32 ifindex; int ret = -EADDRNOTAVAIL; - if (sock_net(sk) != &init_net) + if (!net_eq(sock_net(sk), &init_net)) return -EPROTONOSUPPORT; rtnl_lock(); @@ -1879,7 +1879,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (!ipv4_is_multicast(addr)) return -EINVAL; - if (sock_net(sk) != &init_net) + if (!net_eq(sock_net(sk), &init_net)) return -EPROTONOSUPPORT; rtnl_lock(); @@ -2015,7 +2015,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; - if (sock_net(sk) != &init_net) + if (!net_eq(sock_net(sk), &init_net)) return -EPROTONOSUPPORT; rtnl_lock(); @@ -2098,7 +2098,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, if (!ipv4_is_multicast(addr)) return -EINVAL; - if (sock_net(sk) != &init_net) + if (!net_eq(sock_net(sk), &init_net)) return -EPROTONOSUPPORT; rtnl_lock(); @@ -2163,7 +2163,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (!ipv4_is_multicast(addr)) return -EINVAL; - if (sock_net(sk) != &init_net) + if (!net_eq(sock_net(sk), &init_net)) return -EPROTONOSUPPORT; rtnl_lock(); @@ -2250,7 +2250,7 @@ void ip_mc_drop_socket(struct sock *sk) if (inet->mc_list == NULL) return; - if (sock_net(sk) != &init_net) + if (!net_eq(sock_net(sk), &init_net)) return; rtnl_lock(); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b88aa9afa42..42065fff46c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -432,7 +432,7 @@ ic_rarp_recv(struct 
sk_buff *skb, struct net_device *dev, struct packet_type *pt unsigned char *sha, *tha; /* s for "source", t for "target" */ struct ic_device *d; - if (dev_net(dev) != &init_net) + if (!net_eq(dev_net(dev), &init_net)) goto drop; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) @@ -852,7 +852,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str struct ic_device *d; int len, ext_len; - if (dev_net(dev) != &init_net) + if (!net_eq(dev_net(dev), &init_net)) goto drop; /* Perform verifications before taking the lock. */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 033c712c3a5..c519b8d30ee 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1124,7 +1124,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v struct vif_device *v; int ct; - if (dev_net(dev) != &init_net) + if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event != NETDEV_UNREGISTER) diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index aa33a4a7a71..432ce9d1c11 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -477,7 +477,7 @@ ipq_rcv_dev_event(struct notifier_block *this, { struct net_device *dev = ptr; - if (dev_net(dev) != &init_net) + if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; /* Drop any packets associated with the downed device */ diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 84c26dd27d8..0841aefaa50 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -120,7 +120,7 @@ static int masq_device_event(struct notifier_block *this, { const struct net_device *dev = ptr; - if (dev_net(dev) != &init_net) + if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event == NETDEV_DOWN) { -- cgit v1.2.3
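
The net_eq() conversions in the last patch above rely on that helper collapsing to a constant when network namespaces are compiled out. A minimal sketch of the helper, assuming the include/net/net_namespace.h definitions of this period (simplified, not quoted verbatim from the tree):

struct net;

#ifdef CONFIG_NET_NS
/* Namespaces exist: a real pointer comparison is required. */
static inline int net_eq(const struct net *net1, const struct net *net2)
{
	return net1 == net2;
}
#else
/* Only &init_net exists, so any two namespaces are trivially equal. */
static inline int net_eq(const struct net *net1, const struct net *net2)
{
	return 1;
}
#endif

With this, a guard such as "if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE;" reduces to "if (0)" on !CONFIG_NET_NS builds, so the compiler can drop the whole branch, which is exactly the optimization the commit message describes.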