From f26ba1751145edbf52b2c89a40e389f2fbdfc1af Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 2 Nov 2008 16:11:01 +0000 Subject: udp: Fix the SNMP counter of UDP_MIB_INDATAGRAMS If UDP echo is sent to xinetd/echo-dgram, the UDP reply will be received at the sender. But the SNMP counter of UDP_MIB_INDATAGRAMS will be not increased, UDP6_MIB_INDATAGRAMS will be increased instead. Endpoint A Endpoint B UDP Echo request -----------> (IPv4, Dst port=7) <---------- UDP Echo Reply (IPv4, Src port=7) This bug is come from this patch cb75994ec311b2cd50e5205efdcc0696abd6675d. It do counter UDP[6]_MIB_INDATAGRAMS until udp[v6]_recvmsg. Because xinetd used IPv6 socket to receive UDP messages, thus, when received UDP packet, the UDP6_MIB_INDATAGRAMS will be increased in function udpv6_recvmsg() even if the packet is a IPv4 UDP packet. This patch fixed the problem. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/ipv6/udp.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 71e259e866a..18696af106d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -138,6 +138,7 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, int peeked; int err; int is_udplite = IS_UDPLITE(sk); + int is_udp4; if (addr_len) *addr_len=sizeof(struct sockaddr_in6); @@ -158,6 +159,8 @@ try_again: else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; + is_udp4 = (skb->protocol == htons(ETH_P_IP)); + /* * If checksum is needed at all, try to do it while copying the * data. If the data is truncated, or if we only want a partial @@ -180,9 +183,14 @@ try_again: if (err) goto out_free; - if (!peeked) - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_INDATAGRAMS, is_udplite); + if (!peeked) { + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); + } sock_recv_timestamp(msg, sk, skb); @@ -196,7 +204,7 @@ try_again: sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; - if (skb->protocol == htons(ETH_P_IP)) + if (is_udp4) ipv6_addr_set(&sin6->sin6_addr, 0, 0, htonl(0xffff), ip_hdr(skb)->saddr); else { @@ -207,7 +215,7 @@ try_again: } } - if (skb->protocol == htons(ETH_P_IP)) { + if (is_udp4) { if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); } else { -- cgit v1.2.3 From 0856f93958c488f0cc656be53c26dfd20663bdb3 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 2 Nov 2008 16:14:27 +0000 Subject: udp: Fix the SNMP counter of UDP_MIB_INERRORS UDP packets received in udpv6_recvmsg() are not only IPv6 UDP packets, but also have IPv4 UDP packets, so when do the counter of UDP_MIB_INERRORS in udpv6_recvmsg(), we should check whether the packet is a IPv6 UDP packet or a IPv4 UDP packet. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/ipv6/udp.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 18696af106d..8b48512ebf6 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -236,8 +236,14 @@ out: csum_copy_err: lock_sock(sk); - if (!skb_kill_datagram(sk, skb, flags)) - UDP6_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + if (!skb_kill_datagram(sk, skb, flags)) { + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + } release_sock(sk); if (flags & MSG_DONTWAIT) -- cgit v1.2.3 From a1caa32295d67284ecba18cd8db692c7166f0706 Mon Sep 17 00:00:00 2001 From: Arnaud Ebalard Date: Mon, 3 Nov 2008 01:30:23 -0800 Subject: XFRM: copy_to_user_kmaddress() reports local address twice While adding support for MIGRATE/KMADDRESS in strongSwan (as specified in draft-ebalard-mext-pfkey-enhanced-migrate-00), Andreas Steffen noticed that XFRMA_KMADDRESS attribute passed to userland contains the local address twice (remote provides local address instead of remote one). This bug in copy_to_user_kmaddress() affects only key managers that use native XFRM interface (key managers that use PF_KEY are not affected). For the record, the bug was in the initial changeset I posted which added support for KMADDRESS (13c1d18931ebb5cf407cb348ef2cd6284d68902d 'xfrm: MIGRATE enhancements (draft-ebalard-mext-pfkey-enhanced-migrate)'). Signed-off-by: Arnaud Ebalard Reported-by: Andreas Steffen Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4a8a1abb59e..a278a6f3b99 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1816,7 +1816,7 @@ static int copy_to_user_kmaddress(struct xfrm_kmaddress *k, struct sk_buff *skb) uk.family = k->family; uk.reserved = k->reserved; memcpy(&uk.local, &k->local, sizeof(uk.local)); - memcpy(&uk.remote, &k->local, sizeof(uk.remote)); + memcpy(&uk.remote, &k->remote, sizeof(uk.remote)); return nla_put(skb, XFRMA_KMADDRESS, sizeof(uk), &uk); } -- cgit v1.2.3 From bbb770e7ab9a436752babfc8765e422d7481be1f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 3 Nov 2008 19:11:29 -0800 Subject: xfrm: Fix xfrm_policy_gc_lock handling. From: Alexey Dobriyan Based upon a lockdep trace by Simon Arlott. xfrm_policy_kill() can be called from both BH and non-BH contexts, so we have to grab xfrm_policy_gc_lock with BH disabling. Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 25872747762..058f04f54b9 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -315,9 +315,9 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) return; } - spin_lock(&xfrm_policy_gc_lock); + spin_lock_bh(&xfrm_policy_gc_lock); hlist_add_head(&policy->bydst, &xfrm_policy_gc_list); - spin_unlock(&xfrm_policy_gc_lock); + spin_unlock_bh(&xfrm_policy_gc_lock); schedule_work(&xfrm_policy_gc_work); } -- cgit v1.2.3 From 79654a7698195fa043063092f5c1ca5245276fba Mon Sep 17 00:00:00 2001 From: Andreas Steffen Date: Tue, 4 Nov 2008 14:49:19 -0800 Subject: xfrm: Have af-specific init_tempsel() initialize family field of temporary selector While adding MIGRATE support to strongSwan, Andreas Steffen noticed that the selectors provided in XFRM_MSG_ACQUIRE have their family field uninitialized (those in MIGRATE do have their family set). Looking at the code, this is because the af-specific init_tempsel() (called via afinfo->init_tempsel() in xfrm_init_tempsel()) do not set the value. Reported-by: Andreas Steffen Acked-by: Herbert Xu Signed-off-by: Arnaud Ebalard --- net/ipv4/xfrm4_state.c | 1 + net/ipv6/xfrm6_state.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 07735ed280d..55dc6beab9a 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -33,6 +33,7 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, x->sel.dport_mask = htons(0xffff); x->sel.sport = xfrm_flowi_sport(fl); x->sel.sport_mask = htons(0xffff); + x->sel.family = AF_INET; x->sel.prefixlen_d = 32; x->sel.prefixlen_s = 32; x->sel.proto = fl->proto; diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 89884a4f23a..60c78cfc273 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -34,6 +34,7 @@ __xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl, x->sel.dport_mask = htons(0xffff); x->sel.sport = xfrm_flowi_sport(fl); x->sel.sport_mask = htons(0xffff); + x->sel.family = AF_INET6; x->sel.prefixlen_d = 128; x->sel.prefixlen_s = 128; x->sel.proto = fl->proto; -- cgit v1.2.3 From 9b22ea560957de1484e6b3e8538f7eef202e3596 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 4 Nov 2008 14:49:57 -0800 Subject: net: fix packet socket delivery in rx irq handler The changes to deliver hardware accelerated VLAN packets to packet sockets (commit bc1d0411) caused a warning for non-NAPI drivers. The __vlan_hwaccel_rx() function is called directly from the drivers RX function, for non-NAPI drivers that means its still in RX IRQ context: [ 27.779463] ------------[ cut here ]------------ [ 27.779509] WARNING: at kernel/softirq.c:136 local_bh_enable+0x37/0x81() ... [ 27.782520] [] netif_nit_deliver+0x5b/0x75 [ 27.782590] [] __vlan_hwaccel_rx+0x79/0x162 [ 27.782664] [] atl1_intr+0x9a9/0xa7c [atl1] [ 27.782738] [] handle_IRQ_event+0x23/0x51 [ 27.782808] [] handle_edge_irq+0xc2/0x102 [ 27.782878] [] do_IRQ+0x4d/0x64 Split hardware accelerated VLAN reception into two parts to fix this: - __vlan_hwaccel_rx just stores the VLAN TCI and performs the VLAN device lookup, then calls netif_receive_skb()/netif_rx() - vlan_hwaccel_do_receive(), which is invoked by netif_receive_skb() in softirq context, performs the real reception and delivery to packet sockets. Reported-and-tested-by: Ramon Casellas Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 46 +++++++++++++++++++++++++++++++++------------- net/core/dev.c | 3 +++ 2 files changed, 36 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 916061f681b..68ced4bf158 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -3,11 +3,20 @@ #include #include "vlan.h" +struct vlan_hwaccel_cb { + struct net_device *dev; +}; + +static inline struct vlan_hwaccel_cb *vlan_hwaccel_cb(struct sk_buff *skb) +{ + return (struct vlan_hwaccel_cb *)skb->cb; +} + /* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, u16 vlan_tci, int polling) { - struct net_device_stats *stats; + struct vlan_hwaccel_cb *cb = vlan_hwaccel_cb(skb); if (skb_bond_should_drop(skb)) { dev_kfree_skb_any(skb); @@ -15,23 +24,35 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, } skb->vlan_tci = vlan_tci; + cb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK); + + return (polling ? netif_receive_skb(skb) : netif_rx(skb)); +} +EXPORT_SYMBOL(__vlan_hwaccel_rx); + +int vlan_hwaccel_do_receive(struct sk_buff *skb) +{ + struct vlan_hwaccel_cb *cb = vlan_hwaccel_cb(skb); + struct net_device *dev = cb->dev; + struct net_device_stats *stats; + netif_nit_deliver(skb); - skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK); - if (skb->dev == NULL) { - dev_kfree_skb_any(skb); - /* Not NET_RX_DROP, this is not being dropped - * due to congestion. */ - return NET_RX_SUCCESS; + if (dev == NULL) { + kfree_skb(skb); + return -1; } - skb->dev->last_rx = jiffies; + + skb->dev = dev; + skb->priority = vlan_get_ingress_priority(dev, skb->vlan_tci); skb->vlan_tci = 0; - stats = &skb->dev->stats; + dev->last_rx = jiffies; + + stats = &dev->stats; stats->rx_packets++; stats->rx_bytes += skb->len; - skb->priority = vlan_get_ingress_priority(skb->dev, vlan_tci); switch (skb->pkt_type) { case PACKET_BROADCAST: break; @@ -43,13 +64,12 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, * This allows the VLAN to have a different MAC than the * underlying device, and still route correctly. */ if (!compare_ether_addr(eth_hdr(skb)->h_dest, - skb->dev->dev_addr)) + dev->dev_addr)) skb->pkt_type = PACKET_HOST; break; }; - return (polling ? netif_receive_skb(skb) : netif_rx(skb)); + return 0; } -EXPORT_SYMBOL(__vlan_hwaccel_rx); struct net_device *vlan_dev_real_dev(const struct net_device *dev) { diff --git a/net/core/dev.c b/net/core/dev.c index d9038e328cc..9174c77d311 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2218,6 +2218,9 @@ int netif_receive_skb(struct sk_buff *skb) int ret = NET_RX_DROP; __be16 type; + if (skb->vlan_tci && vlan_hwaccel_do_receive(skb)) + return NET_RX_SUCCESS; + /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) return NET_RX_DROP; -- cgit v1.2.3 From b22cecdd8fa4667ebab02def0866387e709927ee Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 5 Nov 2008 01:35:55 -0800 Subject: net/9p: fix printk format warnings Fix printk format warnings in net/9p. Built cleanly on 7 arches. net/9p/client.c:820: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'u64' net/9p/client.c:820: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'u64' net/9p/client.c:867: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'u64' net/9p/client.c:867: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'u64' net/9p/client.c:932: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'u64' net/9p/client.c:932: warning: format '%llx' expects type 'long long unsigned int', but argument 6 has type 'u64' net/9p/client.c:982: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'u64' net/9p/client.c:982: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'u64' net/9p/client.c:1025: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'u64' net/9p/client.c:1025: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'u64' net/9p/client.c:1227: warning: format '%llx' expects type 'long long unsigned int', but argument 7 has type 'u64' net/9p/client.c:1227: warning: format '%llx' expects type 'long long unsigned int', but argument 12 has type 'u64' net/9p/client.c:1227: warning: format '%llx' expects type 'long long unsigned int', but argument 8 has type 'u64' net/9p/client.c:1227: warning: format '%llx' expects type 'long long unsigned int', but argument 13 has type 'u64' net/9p/client.c:1252: warning: format '%llx' expects type 'long long unsigned int', but argument 7 has type 'u64' net/9p/client.c:1252: warning: format '%llx' expects type 'long long unsigned int', but argument 12 has type 'u64' net/9p/client.c:1252: warning: format '%llx' expects type 'long long unsigned int', but argument 8 has type 'u64' net/9p/client.c:1252: warning: format '%llx' expects type 'long long unsigned int', but argument 13 has type 'u64' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- net/9p/client.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 67717f69412..0a04faa2211 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -818,7 +818,9 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, } P9_DPRINTK(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n", - qid.type, qid.path, qid.version); + qid.type, + (unsigned long long)qid.path, + qid.version); memmove(&fid->qid, &qid, sizeof(struct p9_qid)); @@ -865,7 +867,9 @@ p9_client_auth(struct p9_client *clnt, char *uname, u32 n_uname, char *aname) } P9_DPRINTK(P9_DEBUG_9P, "<<< RAUTH qid %x.%llx.%x\n", - qid.type, qid.path, qid.version); + qid.type, + (unsigned long long)qid.path, + qid.version); memmove(&afid->qid, &qid, sizeof(struct p9_qid)); p9_free_req(clnt, req); @@ -930,7 +934,8 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames, for (count = 0; count < nwqids; count++) P9_DPRINTK(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n", - count, wqids[count].type, wqids[count].path, + count, wqids[count].type, + (unsigned long long)wqids[count].path, wqids[count].version); if (nwname) @@ -980,7 +985,9 @@ int p9_client_open(struct p9_fid *fid, int mode) } P9_DPRINTK(P9_DEBUG_9P, "<<< ROPEN qid %x.%llx.%x iounit %x\n", - qid.type, qid.path, qid.version, iounit); + qid.type, + (unsigned long long)qid.path, + qid.version, iounit); fid->mode = mode; fid->iounit = iounit; @@ -1023,7 +1030,9 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode, } P9_DPRINTK(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n", - qid.type, qid.path, qid.version, iounit); + qid.type, + (unsigned long long)qid.path, + qid.version, iounit); fid->mode = mode; fid->iounit = iounit; @@ -1230,9 +1239,9 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n" "<<< uid=%d gid=%d n_muid=%d\n", ret->size, ret->type, ret->dev, ret->qid.type, - ret->qid.path, ret->qid.version, ret->mode, - ret->atime, ret->mtime, ret->length, ret->name, - ret->uid, ret->gid, ret->muid, ret->extension, + (unsigned long long)ret->qid.path, ret->qid.version, ret->mode, + ret->atime, ret->mtime, (unsigned long long)ret->length, + ret->name, ret->uid, ret->gid, ret->muid, ret->extension, ret->n_uid, ret->n_gid, ret->n_muid); free_and_error: @@ -1255,9 +1264,9 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) " name=%s uid=%s gid=%s muid=%s extension=(%s)\n" " uid=%d gid=%d n_muid=%d\n", wst->size, wst->type, wst->dev, wst->qid.type, - wst->qid.path, wst->qid.version, wst->mode, - wst->atime, wst->mtime, wst->length, wst->name, - wst->uid, wst->gid, wst->muid, wst->extension, + (unsigned long long)wst->qid.path, wst->qid.version, wst->mode, + wst->atime, wst->mtime, (unsigned long long)wst->length, + wst->name, wst->uid, wst->gid, wst->muid, wst->extension, wst->n_uid, wst->n_gid, wst->n_muid); err = 0; clnt = fid->clnt; -- cgit v1.2.3 From e3ec6cfc260e2322834e200c2fa349cdf104fd13 Mon Sep 17 00:00:00 2001 From: Benjamin Thery Date: Wed, 5 Nov 2008 01:43:57 -0800 Subject: ipv6: fix run pending DAD when interface becomes ready With some net devices types, an IPv6 address configured while the interface was down can stay 'tentative' forever, even after the interface is set up. In some case, pending IPv6 DADs are not executed when the device becomes ready. I observed this while doing some tests with kvm. If I assign an IPv6 address to my interface eth0 (kvm driver rtl8139) when it is still down then the address is flagged tentative (IFA_F_TENTATIVE). Then, I set eth0 up, and to my surprise, the address stays 'tentative', no DAD is executed and the address can't be pinged. I also observed the same behaviour, without kvm, with virtual interfaces types macvlan and veth. Some easy steps to reproduce the issue with macvlan: 1. ip link add link eth0 type macvlan 2. ip -6 addr add 2003::ab32/64 dev macvlan0 3. ip addr show dev macvlan0 ... inet6 2003::ab32/64 scope global tentative ... 4. ip link set macvlan0 up 5. ip addr show dev macvlan0 ... inet6 2003::ab32/64 scope global tentative ... Address is still tentative I think there's a bug in net/ipv6/addrconf.c, addrconf_notify(): addrconf_dad_run() is not always run when the interface is flagged IF_READY. Currently it is only run when receiving NETDEV_CHANGE event. Looks like some (virtual) devices doesn't send this event when becoming up. For both NETDEV_UP and NETDEV_CHANGE events, when the interface becomes ready, run_pending should be set to 1. Patch below. 'run_pending = 1' could be moved below the if/else block but it makes the code less readable. Signed-off-by: Benjamin Thery Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index eea9542728c..d9da5eb9dcb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2483,8 +2483,10 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (!idev && dev->mtu >= IPV6_MIN_MTU) idev = ipv6_add_dev(dev); - if (idev) + if (idev) { idev->if_flags |= IF_READY; + run_pending = 1; + } } else { if (!addrconf_qdisc_ok(dev)) { /* device is still not ready. */ -- cgit v1.2.3 From efb9a8c28ca0edd9e2572117105ebad9bbc0c368 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 5 Nov 2008 03:03:18 -0800 Subject: netfilter: netns ct: walk netns list under RTNL netns list (just list) is under RTNL. But helper and proto unregistration happen during rmmod when RTNL is not held, and that's how it was tested: modprobe/rmmod vs clone(CLONE_NEWNET)/exit. BUG: unable to handle kernel paging request at 0000000000100100 <=== IP: [] nf_conntrack_l4proto_unregister+0x96/0xae [nf_conntrack] PGD 15e300067 PUD 15e1d8067 PMD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC last sysfs file: /sys/kernel/uevent_seqnum CPU 0 Modules linked in: nf_conntrack_proto_sctp(-) nf_conntrack_proto_dccp(-) af_packet iptable_nat nf_nat nf_conntrack_ipv4 nf_conntrack nf_defrag_ipv4 iptable_filter ip_tables xt_tcpudp ip6table_filter ip6_tables x_tables ipv6 sr_mod cdrom [last unloaded: nf_conntrack_proto_sctp] Pid: 16758, comm: rmmod Not tainted 2.6.28-rc2-netns-xfrm #3 RIP: 0010:[] [] nf_conntrack_l4proto_unregister+0x96/0xae [nf_conntrack] RSP: 0018:ffff88015dc1fec8 EFLAGS: 00010212 RAX: 0000000000000000 RBX: 00000000001000f8 RCX: 0000000000000000 RDX: ffffffffa009575c RSI: 0000000000000003 RDI: ffffffffa00956b5 RBP: ffff88015dc1fed8 R08: 0000000000000002 R09: 0000000000000000 R10: 0000000000000000 R11: ffff88015dc1fe48 R12: ffffffffa0458f60 R13: 0000000000000880 R14: 00007fff4c361d30 R15: 0000000000000880 FS: 00007f624435a6f0(0000) GS:ffffffff80521580(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000100100 CR3: 0000000168969000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process rmmod (pid: 16758, threadinfo ffff88015dc1e000, task ffff880179864218) Stack: ffffffffa0459100 0000000000000000 ffff88015dc1fee8 ffffffffa0457934 ffff88015dc1ff78 ffffffff80253fef 746e6e6f635f666e 6f72705f6b636172 00707463735f6f74 ffffffff8024cb30 00000000023b8010 0000000000000000 Call Trace: [] nf_conntrack_proto_sctp_fini+0x10/0x1e [nf_conntrack_proto_sctp] [] sys_delete_module+0x19f/0x1fe [] ? trace_hardirqs_on_caller+0xf0/0x114 [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] system_call_fastpath+0x16/0x1b Code: 13 35 e0 e8 c4 6c 1a e0 48 8b 1d 6d c6 46 e0 eb 16 48 89 df 4c 89 e2 48 c7 c6 fc 85 09 a0 e8 61 cd ff ff 48 8b 5b 08 48 83 eb 08 <48> 8b 43 08 0f 18 08 48 8d 43 08 48 3d 60 4f 50 80 75 d3 5b 41 RIP [] nf_conntrack_l4proto_unregister+0x96/0xae [nf_conntrack] RSP CR2: 0000000000100100 ---[ end trace bde8ac82debf7192 ]--- Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_helper.c | 3 +++ net/netfilter/nf_conntrack_proto.c | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 9c06b9f86ad..c39b6a99413 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -167,10 +168,12 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) */ synchronize_rcu(); + rtnl_lock(); spin_lock_bh(&nf_conntrack_lock); for_each_net(net) __nf_conntrack_helper_unregister(me, net); spin_unlock_bh(&nf_conntrack_lock); + rtnl_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index a59a307e685..592d73344d4 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -221,8 +222,10 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) synchronize_rcu(); /* Remove all contrack entries for this protocol */ + rtnl_lock(); for_each_net(net) nf_ct_iterate_cleanup(net, kill_l3proto, proto); + rtnl_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister); @@ -333,8 +336,10 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) synchronize_rcu(); /* Remove all contrack entries for this protocol */ + rtnl_lock(); for_each_net(net) nf_ct_iterate_cleanup(net, kill_l4proto, l4proto); + rtnl_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); -- cgit v1.2.3 From 518a09ef11f8454f4676125d47c3e775b300c6a5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 5 Nov 2008 03:36:01 -0800 Subject: tcp: Fix recvmsg MSG_PEEK influence of blocking behavior. Vito Caputo noticed that tcp_recvmsg() returns immediately from partial reads when MSG_PEEK is used. In particular, this means that SO_RCVLOWAT is not respected. Simply remove the test. And this matches the behavior of several other systems, including BSD. Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eccb7165a80..c5aca0bb116 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1374,8 +1374,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || - signal_pending(current) || - (flags & MSG_PEEK)) + signal_pending(current)) break; } else { if (sock_flag(sk, SOCK_DONE)) -- cgit v1.2.3 From 517ac45af4b55913587279d89001171c222f22e7 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 23 Oct 2008 16:30:13 -0500 Subject: 9p: rdma: Set trans prior to requesting async connection ops The RDMA connection manager is fundamentally asynchronous. Since the async callback context is the client pointer, the transport in the client struct needs to be set prior to calling the first async op. Signed-off-by: Tom Tucker Signed-off-by: Eric Van Hensbergen --- net/9p/trans_rdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 8d6cc4777aa..4e9d2e673cf 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -589,6 +589,9 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (IS_ERR(rdma->cm_id)) goto error; + /* Associate the client with the transport */ + client->trans = rdma; + /* Resolve the server's address */ rdma->addr.sin_family = AF_INET; rdma->addr.sin_addr.s_addr = in_aton(addr); @@ -669,7 +672,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (err || (rdma->state != P9_RDMA_CONNECTED)) goto error; - client->trans = rdma; client->status = Connected; return 0; -- cgit v1.2.3 From cac23d6505546f4cfa42d949ec46d431a44bd39c Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 23 Oct 2008 16:31:02 -0500 Subject: 9p: Make all client spin locks IRQ safe The client lock must be IRQ safe. Some of the lock acquisition paths took regular spin locks. Signed-off-by: Tom Tucker Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 67717f69412..f4e6c05b3c6 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -613,6 +613,7 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) { int err; struct p9_fid *fid; + unsigned long flags; P9_DPRINTK(P9_DEBUG_FID, "clnt %p\n", clnt); fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL); @@ -632,9 +633,9 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) fid->clnt = clnt; fid->aux = NULL; - spin_lock(&clnt->lock); + spin_lock_irqsave(&clnt->lock, flags); list_add(&fid->flist, &clnt->fidlist); - spin_unlock(&clnt->lock); + spin_unlock_irqrestore(&clnt->lock, flags); return fid; @@ -646,13 +647,14 @@ error: static void p9_fid_destroy(struct p9_fid *fid) { struct p9_client *clnt; + unsigned long flags; P9_DPRINTK(P9_DEBUG_FID, "fid %d\n", fid->fid); clnt = fid->clnt; p9_idpool_put(fid->fid, clnt->fidpool); - spin_lock(&clnt->lock); + spin_lock_irqsave(&clnt->lock, flags); list_del(&fid->flist); - spin_unlock(&clnt->lock); + spin_unlock_irqrestore(&clnt->lock, flags); kfree(fid); } -- cgit v1.2.3 From 82b189eaaf6186b7694317632255fa87460820a0 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 23 Oct 2008 16:32:28 -0500 Subject: 9p: Remove unneeded free of fcall for Flush T and R fcall are reused until the client is destroyed. There does not need to be a special case for Flush Signed-off-by: Tom Tucker Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index f4e6c05b3c6..26ca8ab4519 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -311,12 +311,6 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r) r->status = REQ_STATUS_IDLE; if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool)) p9_idpool_put(tag, c->tagpool); - - /* if this was a flush request we have to free response fcall */ - if (r->rc->id == P9_RFLUSH) { - kfree(r->tc); - kfree(r->rc); - } } /** -- cgit v1.2.3 From 45abdf1c7be80d6ec3b0b14e59ee75a0d5d9fb37 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 23 Oct 2008 16:33:25 -0500 Subject: p9: Fix leak of waitqueue in request allocation path If a T or R fcall cannot be allocated, the function returns an error but neglects to free the wait queue that was successfully allocated. If it comes through again a second time this wq will be overwritten with a new allocation and the old allocation will be leaked. Also, if the client is subsequently closed, the close path will attempt to clean up these allocations, so set the req fields to NULL to avoid duplicate free. Signed-off-by: Tom Tucker Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 26ca8ab4519..b56d808e63a 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -189,6 +189,9 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) printk(KERN_ERR "Couldn't grow tag array\n"); kfree(req->tc); kfree(req->rc); + kfree(req->wq); + req->tc = req->rc = NULL; + req->wq = NULL; return ERR_PTR(-ENOMEM); } req->tc->sdata = (char *) req->tc + sizeof(struct p9_fcall); -- cgit v1.2.3 From 1558c6214904c636d5a37f05f84202d6cdd9cff8 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Mon, 27 Oct 2008 13:15:16 -0500 Subject: 9p: rdma: remove duplicated #include Removed duplicated #include in net/9p/trans_rdma.c. Signed-off-by: Huang Weiyi Signed-off-by: Eric Van Hensbergen --- net/9p/trans_rdma.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 4e9d2e673cf..2f1fe5fc122 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -45,7 +45,6 @@ #include #include #include -#include #define P9_PORT 5640 #define P9_RDMA_SQ_DEPTH 32 -- cgit v1.2.3 From 9f3e9bbe62b0bdbbaa7c689a68a22a7d3c1670f0 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 28 Oct 2008 14:22:43 -0500 Subject: unsigned fid->fid cannot be negative Signed-off-by: Roel Kluin Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index b56d808e63a..6e800dd51f0 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -608,7 +608,7 @@ reterr: static struct p9_fid *p9_fid_create(struct p9_client *clnt) { - int err; + int ret; struct p9_fid *fid; unsigned long flags; @@ -617,11 +617,12 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) if (!fid) return ERR_PTR(-ENOMEM); - fid->fid = p9_idpool_get(clnt->fidpool); + ret = p9_idpool_get(clnt->fidpool); if (fid->fid < 0) { - err = -ENOSPC; + ret = -ENOSPC; goto error; } + fid->fid = ret; memset(&fid->qid, 0, sizeof(struct p9_qid)); fid->mode = -1; @@ -638,7 +639,7 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) error: kfree(fid); - return ERR_PTR(err); + return ERR_PTR(ret); } static void p9_fid_destroy(struct p9_fid *fid) -- cgit v1.2.3 From b0d5fdef521b1eadb3fc2c1283000af7ef0297bc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Nov 2008 20:46:46 -0800 Subject: net/9p: fix printk format warnings Fix printk format warnings in net/9p. Built cleanly on 7 arches. net/9p/client.c:820: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'u64' net/9p/client.c:820: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'u64' net/9p/client.c:867: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'u64' net/9p/client.c:867: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'u64' net/9p/client.c:932: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'u64' net/9p/client.c:932: warning: format '%llx' expects type 'long long unsigned int', but argument 6 has type 'u64' net/9p/client.c:982: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'u64' net/9p/client.c:982: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'u64' net/9p/client.c:1025: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'u64' net/9p/client.c:1025: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'u64' net/9p/client.c:1227: warning: format '%llx' expects type 'long long unsigned int', but argument 7 has type 'u64' net/9p/client.c:1227: warning: format '%llx' expects type 'long long unsigned int', but argument 12 has type 'u64' net/9p/client.c:1227: warning: format '%llx' expects type 'long long unsigned int', but argument 8 has type 'u64' net/9p/client.c:1227: warning: format '%llx' expects type 'long long unsigned int', but argument 13 has type 'u64' net/9p/client.c:1252: warning: format '%llx' expects type 'long long unsigned int', but argument 7 has type 'u64' net/9p/client.c:1252: warning: format '%llx' expects type 'long long unsigned int', but argument 12 has type 'u64' net/9p/client.c:1252: warning: format '%llx' expects type 'long long unsigned int', but argument 8 has type 'u64' net/9p/client.c:1252: warning: format '%llx' expects type 'long long unsigned int', but argument 13 has type 'u64' Signed-off-by: Randy Dunlap Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 6e800dd51f0..4b529454616 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -818,7 +818,9 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, } P9_DPRINTK(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n", - qid.type, qid.path, qid.version); + qid.type, + (unsigned long long)qid.path, + qid.version); memmove(&fid->qid, &qid, sizeof(struct p9_qid)); @@ -865,7 +867,9 @@ p9_client_auth(struct p9_client *clnt, char *uname, u32 n_uname, char *aname) } P9_DPRINTK(P9_DEBUG_9P, "<<< RAUTH qid %x.%llx.%x\n", - qid.type, qid.path, qid.version); + qid.type, + (unsigned long long)qid.path, + qid.version); memmove(&afid->qid, &qid, sizeof(struct p9_qid)); p9_free_req(clnt, req); @@ -930,7 +934,8 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames, for (count = 0; count < nwqids; count++) P9_DPRINTK(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n", - count, wqids[count].type, wqids[count].path, + count, wqids[count].type, + (unsigned long long)wqids[count].path, wqids[count].version); if (nwname) @@ -980,7 +985,9 @@ int p9_client_open(struct p9_fid *fid, int mode) } P9_DPRINTK(P9_DEBUG_9P, "<<< ROPEN qid %x.%llx.%x iounit %x\n", - qid.type, qid.path, qid.version, iounit); + qid.type, + (unsigned long long)qid.path, + qid.version, iounit); fid->mode = mode; fid->iounit = iounit; @@ -1023,7 +1030,9 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode, } P9_DPRINTK(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n", - qid.type, qid.path, qid.version, iounit); + qid.type, + (unsigned long long)qid.path, + qid.version, iounit); fid->mode = mode; fid->iounit = iounit; @@ -1230,9 +1239,9 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n" "<<< uid=%d gid=%d n_muid=%d\n", ret->size, ret->type, ret->dev, ret->qid.type, - ret->qid.path, ret->qid.version, ret->mode, - ret->atime, ret->mtime, ret->length, ret->name, - ret->uid, ret->gid, ret->muid, ret->extension, + (unsigned long long)ret->qid.path, ret->qid.version, ret->mode, + ret->atime, ret->mtime, (unsigned long long)ret->length, + ret->name, ret->uid, ret->gid, ret->muid, ret->extension, ret->n_uid, ret->n_gid, ret->n_muid); free_and_error: @@ -1255,9 +1264,9 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) " name=%s uid=%s gid=%s muid=%s extension=(%s)\n" " uid=%d gid=%d n_muid=%d\n", wst->size, wst->type, wst->dev, wst->qid.type, - wst->qid.path, wst->qid.version, wst->mode, - wst->atime, wst->mtime, wst->length, wst->name, - wst->uid, wst->gid, wst->muid, wst->extension, + (unsigned long long)wst->qid.path, wst->qid.version, wst->mode, + wst->atime, wst->mtime, (unsigned long long)wst->length, + wst->name, wst->uid, wst->gid, wst->muid, wst->extension, wst->n_uid, wst->n_gid, wst->n_muid); err = 0; clnt = fid->clnt; -- cgit v1.2.3 From 4a9d916717de0aab4313d43817164577255242fb Mon Sep 17 00:00:00 2001 From: Jonathan McDowell Date: Thu, 30 Oct 2008 22:46:48 +0000 Subject: Fix logic error in rfkill_check_duplicity > I'll have a prod at why the [hso] rfkill stuff isn't working next Ok, I believe this is due to the addition of rfkill_check_duplicity in rfkill and the fact that test_bit actually returns a negative value rather than the postive one expected (which is of course equally true). So when the second WLAN device (the hso device, with the EEE PC WLAN being the first) comes along rfkill_check_duplicity returns a negative value and so rfkill_register returns an error. Patch below fixes this for me. Signed-Off-By: Jonathan McDowell Acked-by: Henrique de Moraes Holschuh Signed-off-by: John W. Linville --- net/rfkill/rfkill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c index f949a482b00..25ba3bd57e6 100644 --- a/net/rfkill/rfkill.c +++ b/net/rfkill/rfkill.c @@ -603,7 +603,7 @@ static int rfkill_check_duplicity(const struct rfkill *rfkill) } /* 0: first switch of its kind */ - return test_bit(rfkill->type, seen); + return (test_bit(rfkill->type, seen)) ? 1 : 0; } static int rfkill_add_switch(struct rfkill *rfkill) -- cgit v1.2.3 From f8d570a4745835f2238a33b537218a1bb03fc671 Mon Sep 17 00:00:00 2001 From: David Miller Date: Thu, 6 Nov 2008 00:37:40 -0800 Subject: net: Fix recursive descent in __scm_destroy(). __scm_destroy() walks the list of file descriptors in the scm_fp_list pointed to by the scm_cookie argument. Those, in turn, can close sockets and invoke __scm_destroy() again. There is nothing which limits how deeply this can occur. The idea for how to fix this is from Linus. Basically, we do all of the fput()s at the top level by collecting all of the scm_fp_list objects hit by an fput(). Inside of the initial __scm_destroy() we keep running the list until it is empty. Signed-off-by: David S. Miller Signed-off-by: Linus Torvalds --- net/core/scm.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index 10f5c65f6a4..ab242cc1acc 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -75,6 +75,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (!fpl) return -ENOMEM; *fplp = fpl; + INIT_LIST_HEAD(&fpl->list); fpl->count = 0; } fpp = &fpl->fp[fpl->count]; @@ -106,9 +107,25 @@ void __scm_destroy(struct scm_cookie *scm) if (fpl) { scm->fp = NULL; - for (i=fpl->count-1; i>=0; i--) - fput(fpl->fp[i]); - kfree(fpl); + if (current->scm_work_list) { + list_add_tail(&fpl->list, current->scm_work_list); + } else { + LIST_HEAD(work_list); + + current->scm_work_list = &work_list; + + list_add(&fpl->list, &work_list); + while (!list_empty(&work_list)) { + fpl = list_first_entry(&work_list, struct scm_fp_list, list); + + list_del(&fpl->list); + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); + } + + current->scm_work_list = NULL; + } } } @@ -284,6 +301,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); if (new_fpl) { + INIT_LIST_HEAD(&new_fpl->list); for (i=fpl->count-1; i>=0; i--) get_file(fpl->fp[i]); memcpy(new_fpl, fpl, sizeof(*fpl)); -- cgit v1.2.3 From 3b53fbf4314594fa04544b02b2fc6e607912da18 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 6 Nov 2008 15:45:32 -0800 Subject: net: Fix recursive descent in __scm_destroy(). __scm_destroy() walks the list of file descriptors in the scm_fp_list pointed to by the scm_cookie argument. Those, in turn, can close sockets and invoke __scm_destroy() again. There is nothing which limits how deeply this can occur. The idea for how to fix this is from Linus. Basically, we do all of the fput()s at the top level by collecting all of the scm_fp_list objects hit by an fput(). Inside of the initial __scm_destroy() we keep running the list until it is empty. Signed-off-by: David S. Miller --- net/core/scm.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index 10f5c65f6a4..ab242cc1acc 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -75,6 +75,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (!fpl) return -ENOMEM; *fplp = fpl; + INIT_LIST_HEAD(&fpl->list); fpl->count = 0; } fpp = &fpl->fp[fpl->count]; @@ -106,9 +107,25 @@ void __scm_destroy(struct scm_cookie *scm) if (fpl) { scm->fp = NULL; - for (i=fpl->count-1; i>=0; i--) - fput(fpl->fp[i]); - kfree(fpl); + if (current->scm_work_list) { + list_add_tail(&fpl->list, current->scm_work_list); + } else { + LIST_HEAD(work_list); + + current->scm_work_list = &work_list; + + list_add(&fpl->list, &work_list); + while (!list_empty(&work_list)) { + fpl = list_first_entry(&work_list, struct scm_fp_list, list); + + list_del(&fpl->list); + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); + } + + current->scm_work_list = NULL; + } } } @@ -284,6 +301,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); if (new_fpl) { + INIT_LIST_HEAD(&new_fpl->list); for (i=fpl->count-1; i>=0; i--) get_file(fpl->fp[i]); memcpy(new_fpl, fpl, sizeof(*fpl)); -- cgit v1.2.3 From 6209344f5a3795d34b7f2c0061f49802283b6bdd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 9 Nov 2008 15:23:57 +0100 Subject: net: unix: fix inflight counting bug in garbage collector Previously I assumed that the receive queues of candidates don't change during the GC. This is only half true, nothing can be received from the queues (see comment in unix_gc()), but buffers could be added through the other half of the socket pair, which may still have file descriptors referring to it. This can result in inc_inflight_move_tail() erronously increasing the "inflight" counter for a unix socket for which dec_inflight() wasn't previously called. This in turn can trigger the "BUG_ON(total_refs < inflight_refs)" in a later garbage collection run. Fix this by only manipulating the "inflight" counter for sockets which are candidates themselves. Duplicating the file references in unix_attach_fds() is also needed to prevent a socket becoming a candidate for GC while the skb that contains it is not yet queued. Reported-by: Andrea Bittau Signed-off-by: Miklos Szeredi CC: stable@kernel.org Signed-off-by: Linus Torvalds --- net/unix/af_unix.c | 31 ++++++++++++++++++++++++------- net/unix/garbage.c | 49 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 61 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4d3c6071b9a..eb90f77bb0e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1302,14 +1302,23 @@ static void unix_destruct_fds(struct sk_buff *skb) sock_wfree(skb); } -static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; + + /* + * Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + for (i=scm->fp->count-1; i>=0; i--) unix_inflight(scm->fp->fp[i]); - UNIXCB(skb).fp = scm->fp; skb->destructor = unix_destruct_fds; - scm->fp = NULL; + return 0; } /* @@ -1368,8 +1377,11 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - if (siocb->scm->fp) - unix_attach_fds(siocb->scm, skb); + if (siocb->scm->fp) { + err = unix_attach_fds(siocb->scm, skb); + if (err) + goto out_free; + } unix_get_secdata(siocb->scm, skb); skb_reset_transport_header(skb); @@ -1538,8 +1550,13 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, size = min_t(int, size, skb_tailroom(skb)); memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - if (siocb->scm->fp) - unix_attach_fds(siocb->scm, skb); + if (siocb->scm->fp) { + err = unix_attach_fds(siocb->scm, skb); + if (err) { + kfree_skb(skb); + goto out_err; + } + } if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) { kfree_skb(skb); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2a27b84f740..6d4a9a8de5e 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -186,8 +186,17 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), */ struct sock *sk = unix_get_socket(*fp++); if (sk) { - hit = true; - func(unix_sk(sk)); + struct unix_sock *u = unix_sk(sk); + + /* + * Ignore non-candidates, they could + * have been added to the queues after + * starting the garbage collection + */ + if (u->gc_candidate) { + hit = true; + func(u); + } } } if (hit && hitlist != NULL) { @@ -249,11 +258,11 @@ static void inc_inflight_move_tail(struct unix_sock *u) { atomic_long_inc(&u->inflight); /* - * If this is still a candidate, move it to the end of the - * list, so that it's checked even if it was already passed - * over + * If this still might be part of a cycle, move it to the end + * of the list, so that it's checked even if it was already + * passed over */ - if (u->gc_candidate) + if (u->gc_maybe_cycle) list_move_tail(&u->link, &gc_candidates); } @@ -267,6 +276,7 @@ void unix_gc(void) struct unix_sock *next; struct sk_buff_head hitlist; struct list_head cursor; + LIST_HEAD(not_cycle_list); spin_lock(&unix_gc_lock); @@ -282,10 +292,14 @@ void unix_gc(void) * * Holding unix_gc_lock will protect these candidates from * being detached, and hence from gaining an external - * reference. This also means, that since there are no - * possible receivers, the receive queues of these sockets are - * static during the GC, even though the dequeue is done - * before the detach without atomicity guarantees. + * reference. Since there are no possible receivers, all + * buffers currently on the candidates' queues stay there + * during the garbage collection. + * + * We also know that no new candidate can be added onto the + * receive queues. Other, non candidate sockets _can_ be + * added to queue, so we must make sure only to touch + * candidates. */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { long total_refs; @@ -299,6 +313,7 @@ void unix_gc(void) if (total_refs == inflight_refs) { list_move_tail(&u->link, &gc_candidates); u->gc_candidate = 1; + u->gc_maybe_cycle = 1; } } @@ -325,13 +340,23 @@ void unix_gc(void) list_move(&cursor, &u->link); if (atomic_long_read(&u->inflight) > 0) { - list_move_tail(&u->link, &gc_inflight_list); - u->gc_candidate = 0; + list_move_tail(&u->link, ¬_cycle_list); + u->gc_maybe_cycle = 0; scan_children(&u->sk, inc_inflight_move_tail, NULL); } } list_del(&cursor); + /* + * not_cycle_list contains those sockets which do not make up a + * cycle. Restore these to the inflight list. + */ + while (!list_empty(¬_cycle_list)) { + u = list_entry(not_cycle_list.next, struct unix_sock, link); + u->gc_candidate = 0; + list_move_tail(&u->link, &gc_inflight_list); + } + /* * Now gc_candidates contains only garbage. Restore original * inflight counters for these as well, and remove the skbuffs -- cgit v1.2.3