From fb286bb2990a107009dbf25f6ffebeb7df77f9be Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 10 Nov 2005 13:01:24 -0800 Subject: [NET]: Detect hardware rx checksum faults correctly Here is the patch that introduces the generic skb_checksum_complete which also checks for hardware RX checksum faults. If that happens, it'll call netdev_rx_csum_fault which currently prints out a stack trace with the device name. In future it can turn off RX checksum. I've converted every spot under net/ that does RX checksum checks to use skb_checksum_complete or __skb_checksum_complete with the exceptions of: * Those places where checksums are done bit by bit. These will call netdev_rx_csum_fault directly. * The following have not been completely checked/converted: ipmr ip_vs netfilter dccp This patch is based on patches and suggestions from Stephen Hemminger and David S. Miller. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/datagram.c | 21 ++++++++++++-- net/core/dev.c | 12 ++++++++ net/core/netpoll.c | 18 +++++++----- net/ipv4/icmp.c | 6 ++-- net/ipv4/igmp.c | 19 +++++++++---- net/ipv4/ip_gre.c | 15 +++++----- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 11 +++----- net/ipv4/tcp_ipv4.c | 24 ++++++---------- net/ipv4/udp.c | 7 ++--- net/ipv6/icmp.c | 21 +++++++------- net/ipv6/raw.c | 42 +++++++++++----------------- net/ipv6/tcp_ipv6.c | 20 ++++++------- net/ipv6/udp.c | 25 ++++++----------- net/rxrpc/transport.c | 15 ++++------ net/sunrpc/socklib.c | 5 ++++ net/sunrpc/svcsock.c | 9 ++---- 16 files changed, 139 insertions(+), 131 deletions(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index d219435d086..1bcfef51ac5 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -350,6 +350,20 @@ fault: return -EFAULT; } +unsigned int __skb_checksum_complete(struct sk_buff *skb) +{ + unsigned int sum; + + sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + /** * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. * @skb: skbuff @@ -363,7 +377,7 @@ fault: * -EFAULT - fault during copy. Beware, in this case iovec * can be modified! */ -int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, +int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov) { unsigned int csum; @@ -376,8 +390,7 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, iov++; if (iov->iov_len < chunk) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen, - skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_error; if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) goto fault; @@ -388,6 +401,8 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, goto fault; if ((unsigned short)csum_fold(csum)) goto csum_error; + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); iov->iov_len -= chunk; iov->iov_base += chunk; } diff --git a/net/core/dev.c b/net/core/dev.c index 8d154159527..0b48e294aaf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1108,6 +1108,18 @@ out: return ret; } +/* Take action when hardware reception checksum errors are detected. 
*/ +#ifdef CONFIG_BUG +void netdev_rx_csum_fault(struct net_device *dev) +{ + if (net_ratelimit()) { + printk(KERN_ERR "%s: hw csum failure.\n", dev->name); + dump_stack(); + } +} +EXPORT_SYMBOL(netdev_rx_csum_fault); +#endif + #ifdef CONFIG_HIGHMEM /* Actually, we should eliminate this check as soon as we know, that: * 1. IOMMU is present and allows to map all the memory. diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 802fe11efad..49424a42a2c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -101,16 +101,20 @@ void netpoll_queue(struct sk_buff *skb) static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, unsigned short ulen, u32 saddr, u32 daddr) { - if (uh->check == 0) + unsigned int psum; + + if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY) return 0; - if (skb->ip_summed == CHECKSUM_HW) - return csum_tcpudp_magic( - saddr, daddr, ulen, IPPROTO_UDP, skb->csum); + psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + + if (skb->ip_summed == CHECKSUM_HW && + !(u16)csum_fold(csum_add(psum, skb->csum))) + return 0; - skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + skb->csum = psum; - return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + return __skb_checksum_complete(skb); } /* @@ -489,7 +493,7 @@ int __netpoll_rx(struct sk_buff *skb) if (ulen != len) goto out; - if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0) + if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) goto out; if (np->local_ip && np->local_ip != ntohl(iph->daddr)) goto out; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 175e093ec56..e3eceecd049 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) + skb->csum = 0; + if (__skb_checksum_complete(skb)) goto error; - default:; } if (!pskb_pull(skb, sizeof(struct icmphdr))) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c6247fc8406..c04607b4921 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb) return 0; } - if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || - (u16)csum_fold(skb_checksum(skb, 0, len, 0))) { - in_dev_put(in_dev); - kfree_skb(skb); - return 0; + if (!pskb_may_pull(skb, sizeof(struct igmphdr))) + goto drop; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + if (__skb_checksum_complete(skb)) + goto drop; } ih = skb->h.igmph; @@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb) default: NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); } + +drop: in_dev_put(in_dev); kfree_skb(skb); return 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 896ce3f8f53..4e9c74b54b1 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb) goto drop_nolock; if (flags&GRE_CSUM) { - if (skb->ip_summed == CHECKSUM_HW) { + switch (skb->ip_summed) { + case CHECKSUM_HW: csum = (u16)csum_fold(skb->csum); - if (csum) - skb->ip_summed = CHECKSUM_NONE; - } - if (skb->ip_summed == CHECKSUM_NONE) { - skb->csum = skb_checksum(skb, 0, skb->len, 0); + if (!csum) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + csum = __skb_checksum_complete(skb); 
skb->ip_summed = CHECKSUM_HW; - csum = (u16)csum_fold(skb->csum); } offset += 4; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 5198f3a1e2c..e4d6b268e8c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: bad HW ICMP checksum "); - return -NF_ACCEPT; + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + skb->csum = 0; + if (__skb_checksum_complete(skb)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad ICMP checksum "); return -NF_ACCEPT; } - default: - break; } checksum_skipped: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 634dabb558f..ac1fcf5b4eb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1110,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) static int tcp_v4_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->csum)) + skb->nh.iph->daddr, skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; - - LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); - skb->ip_summed = CHECKSUM_NONE; + } } + + skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len, IPPROTO_TCP, 0); + if (skb->len <= 76) { - if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, - skb_checksum(skb, 0, skb->len, 0))) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - skb->csum = ~tcp_v4_check(skb->h.th, skb->len, - skb->nh.iph->saddr, - skb->nh.iph->daddr, 0); + return __skb_checksum_complete(skb); } return 0; } @@ -1219,7 +1213,7 @@ int tcp_v4_rcv(struct sk_buff *skb) * provided case of th->doff==0 is elimineted. * So, we defer the checks. 
*/ if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v4_checksum_init(skb) < 0)) + tcp_v4_checksum_init(skb))) goto bad_packet; th = skb->h.th; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e0bd1013cb0..2422a5f7195 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) static __inline__ int __udp_checksum_complete(struct sk_buff *skb) { - return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + return __skb_checksum_complete(skb); } static __inline__ int udp_checksum_complete(struct sk_buff *skb) @@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) - return 0; - LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; } if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 23e540365a1..1bdf0fb8bf8 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -585,17 +585,16 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) daddr = &skb->nh.ipv6h->daddr; /* Perform checksum. */ - if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } - if (skb->ip_summed == CHECKSUM_NONE) { - if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb_checksum(skb, 0, skb->len, 0))) { + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, + skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_ICMPV6, 0); + if (__skb_checksum_complete(skb)) { LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", NIP6(*saddr), NIP6(*daddr)); goto discard_it; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 651c79b41ee..8e9628f1c4c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -298,13 +298,10 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb, static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) { if ((raw6_sk(sk)->checksum || sk->sk_filter) && - skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - /* FIXME: increment a raw6 drops counter here */ - kfree_skb(skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_checksum_complete(skb)) { + /* FIXME: increment a raw6 drops counter here */ + kfree_skb(skb); + return 0; } /* Charge it to the socket. 
*/ @@ -337,32 +334,25 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (!rp->checksum) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if (skb->ip_summed == CHECKSUM_HW) { - skb_postpull_rcsum(skb, skb->nh.raw, - skb->h.raw - skb->nh.raw); + if (skb->ip_summed == CHECKSUM_HW) { + skb_postpull_rcsum(skb, skb->nh.raw, + skb->h.raw - skb->nh.raw); + if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, - skb->len, inet->num, skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } - if (skb->ip_summed == CHECKSUM_NONE) - skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, - skb->len, inet->num, 0); } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, 0); if (inet->hdrincl) { - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + if (skb_checksum_complete(skb)) { /* FIXME: increment a raw6 drops counter here */ kfree_skb(skb); return 0; } - skb->ip_summed = CHECKSUM_UNNECESSARY; } rawv6_rcv_skb(sk, skb); @@ -407,7 +397,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, if (skb->ip_summed==CHECKSUM_UNNECESSARY) { err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); } else if (msg->msg_flags&MSG_TRUNC) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); } else { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d746d3b27ef..62c0e5bd931 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1401,20 +1401,18 @@ out: static int tcp_v6_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,skb->csum)) + &skb->nh.ipv6h->daddr,skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; - LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); + } } + + skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, 0); + if (skb->len <= 76) { - if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0))) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,0); + return __skb_checksum_complete(skb); } return 0; } @@ -1575,7 +1573,7 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) goto discard_it; if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v6_checksum_init(skb) < 0)) + tcp_v6_checksum_init(skb))) goto bad_packet; th = skb->h.th; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bf9519341fd..e671153b47b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -248,7 +248,7 @@ try_again: err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); } else if (msg->msg_flags&MSG_TRUNC) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); 
@@ -363,13 +363,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) return -1; } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - UDP6_INC_STATS_BH(UDP_MIB_INERRORS); - kfree_skb(skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb_checksum_complete(skb)) { + UDP6_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return 0; } if (sock_queue_rcv_skb(sk,skb)<0) { @@ -491,13 +488,10 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) uh = skb->h.uh; } - if (skb->ip_summed==CHECKSUM_HW) { + if (skb->ip_summed == CHECKSUM_HW && + !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); @@ -521,8 +515,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (skb_checksum_complete(skb)) goto discard; UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 122c086ee2d..dbe6105e83a 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -475,15 +476,11 @@ void rxrpc_trans_receive_packet(struct rxrpc_transport *trans) /* we'll probably need to checksum it (didn't call * sock_recvmsg) */ - if (pkt->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short) - csum_fold(skb_checksum(pkt, 0, pkt->len, - pkt->csum))) { - kfree_skb(pkt); - rxrpc_krxiod_queue_transport(trans); - _leave(" CSUM failed"); - return; - } + if (skb_checksum_complete(pkt)) { + kfree_skb(pkt); + rxrpc_krxiod_queue_transport(trans); + _leave(" CSUM failed"); + return; } addr = pkt->nh.iph->saddr; diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 8f97e90f36c..eb330d4f66d 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -6,6 +6,9 @@ * Copyright (C) 1995, 1996 Olaf Kirch */ +#include +#include +#include #include #include #include @@ -165,6 +168,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) return -1; if ((unsigned short)csum_fold(desc.csum)) return -1; + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); return 0; no_checksum: if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f16e7cdd615..e50e7cf4373 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -623,12 +623,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) /* we can use it in-place */ rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); rqstp->rq_arg.head[0].iov_len = len; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - skb_free_datagram(svsk->sk_sk, skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb_checksum_complete(skb)) { + skb_free_datagram(svsk->sk_sk, skb); + return 0; } rqstp->rq_skbuff = skb; } -- cgit v1.2.3 From 
f4805eded7d38c4e42bf473dc5eb2f34853beb06 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 16:53:30 -0800 Subject: [TCP]: fix congestion window update when using TSO deferal TCP peformance with TSO over networks with delay is awful. On a 100Mbit link with 150ms delay, we get 4Mbits/sec with TSO and 50Mbits/sec without TSO. The problem is with TSO, we intentionally do not keep the maximum number of packets in flight to fill the window, we hold out to until we can send a MSS chunk. But, we also don't update the congestion window unless we have filled, as per RFC2861. This patch replaces the check for the congestion window being full with something smarter that accounts for TSO. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_bic.c | 2 +- net/ipv4/tcp_cong.c | 2 +- net/ipv4/tcp_highspeed.c | 4 ++-- net/ipv4/tcp_htcp.c | 2 +- net/ipv4/tcp_hybla.c | 6 +++--- net/ipv4/tcp_output.c | 1 + net/ipv4/tcp_scalable.c | 3 ++- 7 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ae35e060904..5af99b3ef5d 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -217,7 +217,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, bictcp_low_utilization(sk, data_acked); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bbf2d6624e8..0705b496c6b 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -186,7 +186,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 6acc04bde08..5e56ad368dd 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -111,12 +111,12 @@ static void hstcp_init(struct sock *sk) } static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, - u32 in_flight, int good) + u32 in_flight, u32 pkts_acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index e47b37984e9..404a326ba34 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -207,7 +207,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 77add63623d..40dbb387751 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ca->minrtt = tp->srtt; } + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + if (!ca->hybla_en) return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); - if (in_flight < tp->snd_cwnd) - return; - if (ca->rho == 0) hybla_recalc_param(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b907456a79f..998f6416ef8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2058,3 +2058,4 @@ EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_simple_retransmit); 
EXPORT_SYMBOL(tcp_sync_mss); +EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 327770bf552..a2fd25617d2 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -20,7 +20,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { -- cgit v1.2.3 From 2d2abbab63f6726a147ae61ada39bf2c9ee0db9a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 16:56:12 -0800 Subject: [TCP]: simplify microsecond rtt sampling Simplify the code that comuputes microsecond rtt estimate used by TCP Vegas. Move the callback out of the RTT sampler and into the end of the ack cleanup. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 62 +++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e98b57578d..e4306565493 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -548,10 +548,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's @@ -610,9 +609,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); tp->rtt_seq = tp->snd_nxt; } - - if (icsk->icsk_ca_ops->rtt_sample) - icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -1921,7 +1917,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, int flag) { /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment @@ -1940,13 +1936,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) */ struct tcp_sock *tp = tcp_sk(sk); const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) { /* We don't have a timestamp. 
Can only use * packets that are not retransmitted to determine @@ -1960,21 +1956,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, - const s32 seq_rtt, u32 *usrtt) + const s32 seq_rtt) { const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(sk, usrtt, flag); + tcp_ack_saw_tstamp(sk, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); + tcp_ack_no_tstamp(sk, seq_rtt, flag); } static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, @@ -2054,20 +2050,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } +static inline u32 tcp_usrtt(const struct sk_buff *skb) +{ + struct timeval tv, now; + + do_gettimeofday(&now); + skb_get_timestamp(skb, &tv); + return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec); +} /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; - struct timeval usnow; u32 pkts_acked = 0; - - if (seq_usrtt) - do_gettimeofday(&usnow); + void (*rtt_sample)(struct sock *sk, u32 usrtt) + = icsk->icsk_ca_ops->rtt_sample; while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2107,16 +2110,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt tp->retrans_out -= tcp_skb_pcount(skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - if (seq_usrtt) { - struct timeval tv; - - skb_get_timestamp(skb, &tv); - *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 - + (usnow.tv_usec - tv.tv_usec); + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); } - if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2126,8 +2124,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt !before(scb->end_seq, tp->snd_up)) tp->urg_mode = 0; } - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); + } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); __skb_unlink(skb, &sk->sk_write_queue); @@ -2135,8 +2136,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt } if (acked&FLAG_ACKED) { - const struct inet_connection_sock *icsk = inet_csk(sk); - tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk, tp); if (icsk->icsk_ca_ops->pkts_acked) @@ -2299,7 +2299,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; - s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2352,8 +2351,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff 
*skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt, - icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); @@ -4242,7 +4240,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, NULL, 0); + tcp_ack_saw_tstamp(sk, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; -- cgit v1.2.3 From 7faffa1c7fb9b8e8917e3225d4e2638270c0a48b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:07:24 -0800 Subject: [TCP]: add tcp_slow_start helper Move all the code that does linear TCP slowstart to one inline function to ease later patch to add ABC support. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_bic.c | 10 ++++------ net/ipv4/tcp_cong.c | 11 +++++------ net/ipv4/tcp_highspeed.c | 7 +++---- net/ipv4/tcp_htcp.c | 11 +++++------ net/ipv4/tcp_scalable.c | 11 +++++------ net/ipv4/tcp_vegas.c | 42 +++++++++++------------------------------- 6 files changed, 33 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 5af99b3ef5d..1d0cd86621b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -220,14 +220,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { bictcp_update(ca, tp->snd_cwnd); - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= ca->cnt) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 0705b496c6b..6d3e883b48f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,12 +189,11 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. + /* In "safe" area, increase. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + /* In dangerous area, increase slowly. 
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 5e56ad368dd..82b3c189bd7 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -119,10 +119,9 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 404a326ba34..3284cfb993e 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -210,11 +210,10 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + measure_rtt(sk); /* keep track of number of round-trip times since last backoff event */ @@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, htcp_alpha_update(ca); } - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd */ if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index a2fd25617d2..26d7486ee50 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -24,17 +24,16 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { tp->snd_cwnd_cnt++; if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } } - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - tp->snd_cwnd_stamp = tcp_time_stamp; } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 93c5f92070f..4376814d29f 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, cnt); } else { u32 rtt, target_cwnd, diff; @@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, */ diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - if (tp->snd_cwnd < tp->snd_ssthresh) { + if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ if (diff > gamma) { /* Going too fast. Time to slow down @@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, V_PARAM_SHIFT)+1); } + tcp_slow_start(tp); } else { /* Congestion avoidance. */ u32 next_snd_cwnd; @@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, else if (next_snd_cwnd < tp->snd_cwnd) tp->snd_cwnd--; } - } - /* Wipe the slate clean for the next RTT. 
*/ - vegas->cntRTT = 0; - vegas->minRTT = 0x7fffffff; + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } } - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver. - */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; } /* Extract info for Tcp socket info provided via netlink. */ -- cgit v1.2.3 From 9772efb970780aeed488c19d8b4afd46c3b484af Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:09:53 -0800 Subject: [TCP]: Appropriate Byte Count support This is an updated version of the RFC3465 ABC patch originally for Linux 2.6.11-rc4 by Yee-Ting Li. ABC is a way of counting bytes ack'd rather than packets when updating congestion control. The orignal ABC described in the RFC applied to a Reno style algorithm. For advanced congestion control there is little change after leaving slow start. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 8 ++++++++ net/ipv4/tcp.c | 1 + net/ipv4/tcp_cong.c | 31 ++++++++++++++++++++----------- net/ipv4/tcp_input.c | 7 +++++++ net/ipv4/tcp_minisocks.c | 1 + 5 files changed, 37 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 65268562351..01444a02b48 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -645,6 +645,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_tcp_congestion_control, .strategy = &sysctl_tcp_congestion_control, }, + { + .ctl_name = NET_TCP_ABC, + .procname = "tcp_abc", + .data = &sysctl_tcp_abc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 72b7c22e1ea..cfaf7613375 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6d3e883b48f..c7cc62c8dc1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -192,17 +192,26 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, /* In "safe" area, increase. */ if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp); - else { - /* In dangerous area, increase slowly. 
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; - } + + /* In dangerous area, increase slowly. */ + else if (sysctl_tcp_abc) { + /* RFC3465: Apppriate Byte Count + * increase once for each full cwnd acked + */ + if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { + tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { + /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e4306565493..4cb5e6f408d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,6 +89,7 @@ int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf = 1; +int sysctl_tcp_abc = 1; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -1247,6 +1248,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; + tp->bytes_acked = 0; tcp_clear_retrans(tp); /* Push undo marker, if it was plain RTO and nothing @@ -1904,6 +1906,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, TCP_ECN_queue_cwr(tp); } + tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -2310,6 +2313,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) goto old_ack; + if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR) + tp->bytes_acked += ack - prior_snd_una; + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. @@ -4370,6 +4376,7 @@ discard: EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_reordering); +EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b1a63b2c6b4..9203a21e299 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, */ newtp->snd_cwnd = 2; newtp->snd_cwnd_cnt = 0; + newtp->bytes_acked = 0; newtp->frto_counter = 0; newtp->frto_highmark = 0; -- cgit v1.2.3 From 326f36e9e7de362e09745ce6f84b65e7ccac33ba Mon Sep 17 00:00:00 2001 From: John Heffner Date: Thu, 10 Nov 2005 17:11:48 -0800 Subject: [TCP]: receive buffer growth limiting with mixed MTU This is a patch for discussion addressing some receive buffer growing issues. This is partially related to the thread "Possible BUG in IPv4 TCP window handling..." last week. Specifically it addresses the problem of an interaction between rcvbuf moderation (receiver autotuning) and rcv_ssthresh. The problem occurs when sending small packets to a receiver with a larger MTU. (A very common case I have is a host with a 1500 byte MTU sending to a host with a 9k MTU.) In such a case, the rcv_ssthresh code is targeting a window size corresponding to filling up the current rcvbuf, not taking into account that the new rcvbuf moderation may increase the rcvbuf size. 
One hunk makes rcv_ssthresh use tcp_rmem[2] as the size target rather than rcvbuf. The other changes the behavior when it overflows its memory bounds with in-order data so that it tries to grow rcvbuf (the same as with out-of-order data). These changes should help my problem of mixed MTUs, and should also help the case from last week's thread I think. (In both cases though you still need tcp_rmem[2] to be set much larger than the TCP window.) One question is if this is too aggressive at trying to increase rcvbuf if it's under memory stress. Orignally-from: John Heffner Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cb5e6f408d..827cd4b9e86 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -234,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, { /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_full_space(sk)/2; + int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -327,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk) static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - unsigned int app_win = tp->rcv_nxt - tp->copied_seq; - int ofo_win = 0; icsk->icsk_ack.quick = 0; - skb_queue_walk(&tp->out_of_order_queue, skb) { - ofo_win += skb->len; - } - - /* If overcommit is due to out of order segments, - * do not clamp window. Try to expand rcvbuf instead. - */ - if (ofo_win) { - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); } - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - app_win += ofo_win; - if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) - app_win >>= 1; - if (app_win > icsk->icsk_ack.rcv_mss) - app_win -= icsk->icsk_ack.rcv_mss; - app_win = max(app_win, 2U*tp->advmss); - + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); - } } /* Receiver "autotuning" code. -- cgit v1.2.3 From caa20d9abe810be2ede9612b6c9db6ce7d6edf80 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:13:47 -0800 Subject: [TCP]: spelling fixes Minor spelling fixes for TCP code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 40 ++++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_minisocks.c | 6 +++--- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 4 ++-- 6 files changed, 31 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cfaf7613375..9ac7a4f46bd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags) } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { - /* The last check adjusts for discrepance of Linux wrt. RFC + /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ tcp_send_active_reset(sk, gfp_any()); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 827cd4b9e86..34cfa58eab7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -42,7 +42,7 @@ * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. - * Andrey Savochkin: Fix RTT measurements in the presnce of + * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming @@ -224,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk) * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening - * window and then starts to feed us spagetti. But it should work + * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ @@ -278,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); /* Try to select rcvbuf so that 4 mss-sized segments - * will fit to window and correspoding skbs will fit to our rcvbuf. + * will fit to window and corresponding skbs will fit to our rcvbuf. * (was 3; 4 is minimum to allow fast retransmit to work.) */ while (tcp_win_from_space(rcvmem) < tp->advmss) @@ -287,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); } -/* 4. Try to fixup all. It is made iimediately after connection enters +/* 4. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) @@ -367,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the - * non-timestamp case, we do not smoothe things out - * else with timestamps disabled convergance takes too + * non-timestamp case, we do not smoother things out + * else with timestamps disabled convergence takes too * long. */ if (!win_dep) { @@ -377,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) } else if (m < new_sample) new_sample = m << 3; } else { - /* No previous mesaure. */ + /* No previous measure. */ new_sample = m << 3; } @@ -506,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { - /* Too long gap. Apparently sender falled to + /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. 
*/ tcp_incr_quickack(sk); @@ -546,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be incresed fastly, decrease too fastly + * too slowly, when it should be increased fastly, decrease too fastly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) @@ -607,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk) * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic - * ACKs in some curcumstances. + * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced - * with correct one. It is exaclty, which we pretend to do. + * with correct one. It is exactly, which we pretend to do. */ } @@ -772,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk) * to make it more realistic. * * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal curcumstances sending small + * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever @@ -1899,7 +1899,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } /* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) + * with this code. (Supersedes RFC1323) */ static void tcp_ack_saw_tstamp(struct sock *sk, int flag) { @@ -1912,7 +1912,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag) * 1998/04/10 Andrey V. Savochkin * * Changed: reset backoff as soon as we see the first valid sample. - * If we do not, we get strongly overstimated rto. With timestamps + * If we do not, we get strongly overestimated rto. With timestamps * samples are accepted even from very old segments: f.e., when rtt=1 * increases to 8, we retransmit 5 times and after 8 seconds delayed * answer arrives rto becomes 120 seconds! If at least one of segments @@ -2268,7 +2268,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) } /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavor is back to normal. + * At latest on third ACK the TCP behavior is back to normal. */ tp->frto_counter = (tp->frto_counter + 1) % 3; } @@ -2344,7 +2344,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(sk, flag)) { - /* Advanve CWND, if state allows this. */ + /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); @@ -3133,7 +3133,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, { struct sk_buff *skb; - /* First, check that queue is collapsable and find + /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ for (skb = head; skb != tail; ) { /* No new bits? 
It is possible on ofo queue. */ @@ -3441,7 +3441,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) /* * This routine is only called when we have urgent data - * signalled. Its the 'slow' part of tcp_urg. It could be + * signaled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. @@ -3486,7 +3486,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) * urgent. To do this requires some care. We cannot just ignore * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives - * or we break the sematics of SIOCATMARK (and thus sockatmark()) + * or we break the semantics of SIOCATMARK (and thus sockatmark()) * * NOTE. Double Dutch. Rendering to plain English: author of comment * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); @@ -3631,7 +3631,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rx_opt.saw_tstamp = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_predition is to be made + * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to * turn it off (when there are holes in the receive diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ac1fcf5b4eb..4d5021e1929 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -39,7 +39,7 @@ * request_sock handling and moved * most of it into the af independent code. * Added tail drop and some other bugfixes. - * Added new listen sematics. + * Added new listen semantics. * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits * Andi Kleen: various fixes. @@ -1210,7 +1210,7 @@ int tcp_v4_rcv(struct sk_buff *skb) /* An explanation is required here, I think. * Packet length and doff are validated by header prediction, - * provided case of th->doff==0 is elimineted. + * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ if ((skb->ip_summed != CHECKSUM_UNNECESSARY && tcp_v4_checksum_init(skb))) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 9203a21e299..1b66a2ac432 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -158,7 +158,7 @@ kill_with_rst: /* I am shamed, but failed to make it more elegant. * Yes, it is direct reference to IP, which is impossible * to generalize to IPv6. Taking into account that IPv6 - * do not undertsnad recycling in any case, it not + * do not understand recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && @@ -194,7 +194,7 @@ kill_with_rst: /* In window segment, it may be only reset or bare ack. */ if (th->rst) { - /* This is TIME_WAIT assasination, in two flavors. + /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ @@ -551,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet - * sent (the segment carries an unaccaptable ACK) ... + * sent (the segment carries an unacceptable ACK) ... * a reset is sent." 
* * Invalid ACK: reset will be sent by listening socket diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 998f6416ef8..602e7057e43 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -599,7 +599,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. - It is minumum of user_mss and mss received with SYN. + It is minimum of user_mss and mss received with SYN. It also does not include TCP options. tp->pmtu_cookie is last pmtu, seen by this function. @@ -1171,7 +1171,7 @@ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - /* MSS for the peer's data. Previous verions used mss_clamp + /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss @@ -1361,7 +1361,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err; /* Do not sent more than we queued. 1/4 is reserved for possible - * copying overhead: frgagmentation, tunneling, mangling etc. + * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 415ee47ac1c..e1880959614 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * - * Criterium is still not confirmed experimentally and may change. + * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. @@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) hole detection. :-( It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any + to make it. It is disgusting. It does not work in any case. Let me to cite the same draft, which requires for us to implement this: -- cgit v1.2.3 From 6a438bbe68c7013a42d9c5aee5a40d7dafdbe6ec Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:14:59 -0800 Subject: [TCP]: speed up SACK processing Use "hints" to speed up the SACK processing. Various forms of this have been used by TCP developers (Web100, STCP, BIC) to avoid the 2x linear search of outstanding segments. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 144 ++++++++++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_output.c | 54 +++++++++++++++---- 2 files changed, 172 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 34cfa58eab7..40a26b7157b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -897,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int prior_fackets; u32 lost_retrans = 0; int flag = 0; + int dup_sack = 0; int i; if (!tp->sacked_out) tp->fackets_out = 0; prior_fackets = tp->fackets_out; - for (i=0; istart_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; - int dup_sack = 0; + /* SACK fastpath: + * if the only SACK change is the increase of the end_seq of + * the first block then only apply that SACK block + * and use retrans queue hinting otherwise slowpath */ + flag = 1; + for (i = 0; i< num_sacks; i++) { + __u32 start_seq = ntohl(sp[i].start_seq); + __u32 end_seq = ntohl(sp[i].end_seq); + + if (i == 0){ + if (tp->recv_sack_cache[i].start_seq != start_seq) + flag = 0; + } else { + if ((tp->recv_sack_cache[i].start_seq != start_seq) || + (tp->recv_sack_cache[i].end_seq != end_seq)) + flag = 0; + } + tp->recv_sack_cache[i].start_seq = start_seq; + tp->recv_sack_cache[i].end_seq = end_seq; /* Check for D-SACK. */ if (i == 0) { @@ -940,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (before(ack, prior_snd_una - tp->max_window)) return 0; } + } + + if (flag) + num_sacks = 1; + else { + int j; + tp->fastpath_skb_hint = NULL; + + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = num_sacks-1; i > 0; i--) { + for (j = 0; j < i; j++){ + if (after(ntohl(sp[j].start_seq), + ntohl(sp[j+1].start_seq))){ + sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq); + sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq); + sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq); + sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq); + } + + } + } + } + + /* clear flag as used for different purpose in following code */ + flag = 0; + + for (i=0; istart_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count; + + /* Use SACK fastpath hint if valid */ + if (tp->fastpath_skb_hint) { + skb = tp->fastpath_skb_hint; + fack_count = tp->fastpath_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + fack_count = 0; + } /* Event "B" in the comment above. */ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue(skb, sk) { + sk_stream_for_retrans_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + tp->fastpath_skb_hint = skb; + tp->fastpath_cnt_hint = fack_count; + /* The retransmission queue is always in order, so * we can short-circuit the walk early. 
*/ @@ -1023,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } else { /* New sack for not retransmitted frame, @@ -1035,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } @@ -1058,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; } } } @@ -1085,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1192,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1258,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -1482,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, int packets, u32 high_seq) { struct sk_buff *skb; - int cnt = packets; + int cnt; - BUG_TRAP(cnt <= tp->packets_out); + BUG_TRAP(packets <= tp->packets_out); + if (tp->lost_skb_hint) { + skb = tp->lost_skb_hint; + cnt = tp->lost_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + cnt = 0; + } - sk_stream_for_retrans_queue(skb, sk) { - cnt -= tcp_skb_pcount(skb); - if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + sk_stream_for_retrans_queue_from(skb, sk) { + /* TODO: do this better */ + /* this is not the most efficient way to do this... */ + tp->lost_skb_hint = skb; + tp->lost_cnt_hint = cnt; + cnt += tcp_skb_pcount(skb); + if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retransmit_queue hints + * if this is beyond hint */ + if(tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { + + tp->retransmit_skb_hint = NULL; + } } } tcp_sync_left_out(tp); @@ -1519,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) if (tcp_head_timedout(sk, tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(sk, skb) && - !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + skb = tp->scoreboard_skb_hint ? 
tp->scoreboard_skb_hint + : sk->sk_write_queue.next; + + sk_stream_for_retrans_queue_from(skb, sk) { + if (!tcp_skb_timedout(sk, skb)) + break; + + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retrans hint */ + if (tp->retransmit_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + + tp->retransmit_skb_hint = NULL; } } + + tp->scoreboard_skb_hint = skb; + tcp_sync_left_out(tp); } } @@ -1605,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; + + /* There is something screwy going on with the retrans hints after + an undo */ + clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -1688,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) sk_stream_for_retrans_queue(skb, sk) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } + + clear_all_retrans_hints(tp); + DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; @@ -2117,6 +2230,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_packets_out_dec(tp, skb); __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); + clear_all_retrans_hints(tp); } if (acked&FLAG_ACKED) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 602e7057e43..029c70dfb58 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss u16 flags; BUG_ON(len > skb->len); + + clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* Ok. We will be able to collapse the packet. */ + /* changing transmit queue under us so clear hints */ + clear_all_retrans_hints(tp); + + /* Ok. We will be able to collapse the packet. */ __skb_unlink(next_skb, &sk->sk_write_queue); memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); @@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk) } } + clear_all_retrans_hints(tp); + if (!lost) return; @@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int packet_cnt = tp->lost_out; + int packet_cnt; + + if (tp->retransmit_skb_hint) { + skb = tp->retransmit_skb_hint; + packet_cnt = tp->retransmit_cnt_hint; + }else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } /* First pass: retransmit lost packets. */ - if (packet_cnt) { - sk_stream_for_retrans_queue(skb, sk) { + if (tp->lost_out) { + sk_stream_for_retrans_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + /* we could do better than to assign each time */ + tp->retransmit_skb_hint = skb; + tp->retransmit_cnt_hint = packet_cnt; + /* Assume this retransmit will generate * only one packet for congestion window * calculation purposes. 
This works because @@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; - if (sacked&TCPCB_LOST) { + if (sacked & TCPCB_LOST) { if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->retransmit_skb_hint = NULL; return; + } if (icsk->icsk_ca_state != TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else @@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) TCP_RTO_MAX); } - packet_cnt -= tcp_skb_pcount(skb); - if (packet_cnt <= 0) + packet_cnt += tcp_skb_pcount(skb); + if (packet_cnt >= tp->lost_out) break; } } @@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_may_send_now(sk, tp)) return; - packet_cnt = 0; + if (tp->forward_skb_hint) { + skb = tp->forward_skb_hint; + packet_cnt = tp->forward_cnt_hint; + } else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } + + sk_stream_for_retrans_queue_from(skb, sk) { + tp->forward_cnt_hint = packet_cnt; + tp->forward_skb_hint = skb; - sk_stream_for_retrans_queue(skb, sk) { /* Similar to the retransmit loop above we * can pretend that the retransmitted SKB * we send out here will be composed of one @@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) continue; /* Ok, retransmit it. */ - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->forward_skb_hint = NULL; break; + } if (skb == skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, -- cgit v1.2.3 From c050970a257a4060e927e497a12323e961fcbadc Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Fri, 11 Nov 2005 04:43:47 -0500 Subject: [PATCH] TCP: fix vegas build Recent TCP changes broke the build. Signed-off-by: Jeff Garzik Signed-off-by: Linus Torvalds --- net/ipv4/tcp_vegas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 4376814d29f..b7d296a8ac6 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -236,7 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, cnt); + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); } else { u32 rtt, target_cwnd, diff; -- cgit v1.2.3 From 9eb5c94ef217d61d974f384b29b7298490b0a4d9 Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Fri, 11 Nov 2005 12:04:28 -0800 Subject: [DECNET]: fix SIGPIPE Currently recvmsg generates SIGPIPE whereas sendmsg does not; for the other stacks it seems to be the other way round! It also fixes the bug where reading from a socket whose peer has shutdown returned -EINVAL rather than 0. Signed-off-by: Patrick Caulfield Signed-off-by: David S. 
Miller --- net/decnet/af_decnet.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 3f25cadccdd..f89e55f814d 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1664,17 +1664,15 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, goto out; } - rv = dn_check_state(sk, NULL, 0, &timeo, flags); - if (rv) - goto out; - if (sk->sk_shutdown & RCV_SHUTDOWN) { - if (!(flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - rv = -EPIPE; + rv = 0; goto out; } + rv = dn_check_state(sk, NULL, 0, &timeo, flags); + if (rv) + goto out; + if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) { rv = -EOPNOTSUPP; goto out; @@ -1928,6 +1926,8 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_shutdown & SEND_SHUTDOWN) { err = -EPIPE; + if (!(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); goto out_err; } -- cgit v1.2.3 From 8eb55910521f3449a8125a38dd32fb676703395d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 11 Nov 2005 15:05:47 -0800 Subject: [IPV6]: Fix inet6_init missing unregister. Based mostly upon a patch from Olaf Kirch When initialization fails in inet6_init(), we should unregister the PF_INET6 socket ops. Also, check sock_register()'s return value for errors. Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 4f8795af2ed..c63b8ce0e1b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -699,12 +699,14 @@ static int __init inet6_init(void) /* Register the family here so that the init calls below will * be able to create sockets. (?? is this dangerous ??) */ - (void) sock_register(&inet6_family_ops); + err = sock_register(&inet6_family_ops); + if (err) + goto out_unregister_raw_proto; /* Initialise ipv6 mibs */ err = init_ipv6_mibs(); if (err) - goto out_unregister_raw_proto; + goto out_unregister_sock; /* * ipngwg API draft makes clear that the correct semantics @@ -796,6 +798,8 @@ icmp_fail: ipv6_sysctl_unregister(); #endif cleanup_ipv6_mibs(); +out_unregister_sock: + sock_unregister(PF_INET6); out_unregister_raw_proto: proto_unregister(&rawv6_prot); out_unregister_udp_proto: -- cgit v1.2.3 From 23ec47a0889dabf4b9e7f8d52e848194734159ee Mon Sep 17 00:00:00 2001 From: Vladislav Yasevich Date: Fri, 11 Nov 2005 16:05:55 -0800 Subject: [SCTP]: Fix potential NULL pointer dereference in sctp_v4_get_saddr It is possible to get to sctp_v4_get_saddr() without a valid association. This happens when processing OOTB packets and the cached route entry is no longer valid. However, when responding to OOTB packets we already properly set the source address based on the information in the OOTB packet. So, if we we get to sctp_v4_get_saddr() without an association we can simply return. Signed-off-by: Vladislav Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. 
Miller --- net/sctp/protocol.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 26de4d3e1bd..dc9dff396fa 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -530,6 +530,9 @@ static void sctp_v4_get_saddr(struct sctp_association *asoc, { struct rtable *rt = (struct rtable *)dst; + if (!asoc) + return; + if (rt) { saddr->v4.sin_family = AF_INET; saddr->v4.sin_port = asoc->base.bind_addr.port; -- cgit v1.2.3 From 1e7d3d90c95b32374057e454417b2f50440be20e Mon Sep 17 00:00:00 2001 From: Vladislav Yasevich Date: Fri, 11 Nov 2005 16:06:16 -0800 Subject: [SCTP]: Remove timeouts[] array from sctp_endpoint. The socket level timeout values are maintained in sctp_sock and association level timeouts are in sctp_association. So there is no need for ep->timeouts. Signed-off-by: Vladislav Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/associola.c | 24 ++++++++++++++++++++++-- net/sctp/endpointola.c | 23 ----------------------- net/sctp/sm_sideeffect.c | 6 +++--- net/sctp/socket.c | 1 - 4 files changed, 25 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 8c8ddf7f9b6..5f07ddb1955 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -128,9 +128,29 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a */ asoc->max_burst = sctp_max_burst; - /* Copy things from the endpoint. */ + /* initialize association timers */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; + + /* sctpimpguide Section 2.12.2 + * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the + * recommended value of 5 times 'RTO.Max'. + */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] + = 5 * asoc->rto_max; + + asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + SCTP_DEFAULT_TIMEOUT_SACK; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + sp->autoclose * HZ; + + /* Initilizes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { - asoc->timeouts[i] = ep->timeouts[i]; init_timer(&asoc->timers[i]); asoc->timers[i].function = sctp_timer_events[i]; asoc->timers[i].data = (unsigned long) asoc; diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 96984f7a2d6..0df76897f56 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -70,7 +70,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct sock *sk, gfp_t gfp) { - struct sctp_sock *sp = sctp_sk(sk); memset(ep, 0, sizeof(struct sctp_endpoint)); /* Initialize the base structure. */ @@ -100,28 +99,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Create the lists of associations. */ INIT_LIST_HEAD(&ep->asocs); - /* Set up the base timeout information. 
*/ - ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; - - /* sctpimpguide-05 Section 2.12.2 - * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the - * recommended value of 5 times 'RTO.Max'. - */ - ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] - = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); - - ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout; - ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; - /* Use SCTP specific send buffer space queues. */ ep->sndbuf_policy = sctp_sndbuf_policy; sk->sk_write_space = sctp_write_space; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index f84173ea8ec..823947170a3 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -385,7 +385,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { NULL, sctp_generate_t4_rto_event, sctp_generate_t5_shutdown_guard_event, - sctp_generate_heartbeat_event, + NULL, sctp_generate_sack_event, sctp_generate_autoclose_event, }; @@ -689,9 +689,9 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, * increased due to timer expirations. */ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = - asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]; + asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = - asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]; + asoc->rto_initial; } if (sctp_state(asoc, ESTABLISHED) || diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b529af5e6f2..4d1b8d8904c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1932,7 +1932,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, if (copy_from_user(&sp->autoclose, optval, optlen)) return -EFAULT; - sp->ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; return 0; } -- cgit v1.2.3 From 049b3ff5a86d0187184a189d2e31b8654d58fe22 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 11 Nov 2005 16:08:24 -0800 Subject: [SCTP]: Include ulpevents in socket receive buffer accounting. Also introduces a sysctl option to configure the receive buffer accounting policy to be either at socket or association level. Default is all the associations on the same socket share the receive buffer. Signed-off-by: Neil Horman Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/associola.c | 9 +++++++-- net/sctp/endpointola.c | 3 +++ net/sctp/input.c | 20 -------------------- net/sctp/protocol.c | 3 +++ net/sctp/sm_statefuns.c | 22 ++++++++++++++++++++++ net/sctp/socket.c | 4 ++++ net/sctp/sysctl.c | 8 ++++++++ net/sctp/ulpevent.c | 24 +++++++----------------- 8 files changed, 54 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5f07ddb1955..dec68a60477 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -177,10 +177,10 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a * RFC 6 - A SCTP receiver MUST be able to receive a minimum of * 1500 bytes in one SCTP packet. 
*/ - if (sk->sk_rcvbuf < SCTP_DEFAULT_MINWINDOW) + if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW) asoc->rwnd = SCTP_DEFAULT_MINWINDOW; else - asoc->rwnd = sk->sk_rcvbuf; + asoc->rwnd = sk->sk_rcvbuf/2; asoc->a_rwnd = asoc->rwnd; @@ -192,6 +192,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a /* Set the sndbuf size for transmit. */ asoc->sndbuf_used = 0; + /* Initialize the receive memory counter */ + atomic_set(&asoc->rmem_alloc, 0); + init_waitqueue_head(&asoc->wait); asoc->c.my_vtag = sctp_generate_tag(ep); @@ -400,6 +403,8 @@ static void sctp_association_destroy(struct sctp_association *asoc) spin_unlock_bh(&sctp_assocs_id_lock); } + BUG_TRAP(!atomic_read(&asoc->rmem_alloc)); + if (asoc->base.malloced) { kfree(asoc); SCTP_DBG_OBJCNT_DEC(assoc); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 0df76897f56..67bd53070ee 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -104,6 +104,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, sk->sk_write_space = sctp_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + /* Get the receive buffer policy for this endpoint */ + ep->rcvbuf_policy = sctp_rcvbuf_policy; + /* Initialize the secret key used with cookie. */ get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE); ep->last_key = ep->current_key = 0; diff --git a/net/sctp/input.c b/net/sctp/input.c index 28f32243397..b24ff2c1aef 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -100,21 +100,6 @@ static inline int sctp_rcv_checksum(struct sk_buff *skb) return 0; } -/* The free routine for skbuffs that sctp receives */ -static void sctp_rfree(struct sk_buff *skb) -{ - atomic_sub(sizeof(struct sctp_chunk),&skb->sk->sk_rmem_alloc); - sock_rfree(skb); -} - -/* The ownership wrapper routine to do receive buffer accounting */ -static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk) -{ - skb_set_owner_r(skb,sk); - skb->destructor = sctp_rfree; - atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); -} - struct sctp_input_cb { union { struct inet_skb_parm h4; @@ -217,9 +202,6 @@ int sctp_rcv(struct sk_buff *skb) rcvr = &ep->base; } - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - goto discard_release; - /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) @@ -256,8 +238,6 @@ int sctp_rcv(struct sk_buff *skb) } SCTP_INPUT_CB(skb)->chunk = chunk; - sctp_rcv_set_owner_r(skb,sk); - /* Remember what endpoint is to handle this packet. 
*/ chunk->rcvr = rcvr; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index dc9dff396fa..f775d78aa59 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1050,6 +1050,9 @@ SCTP_STATIC __init int sctp_init(void) /* Sendbuffer growth - do per-socket accounting */ sctp_sndbuf_policy = 0; + /* Rcvbuffer growth - do per-socket accounting */ + sctp_rcvbuf_policy = 0; + /* HB.interval - 30 seconds */ sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 505c7de10c5..475bfb4972d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -5160,6 +5160,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, sctp_verb_t deliver; int tmp; __u32 tsn; + int account_value; + struct sock *sk = asoc->base.sk; data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); @@ -5169,6 +5171,26 @@ static int sctp_eat_data(const struct sctp_association *asoc, /* ASSERT: Now skb->data is really the user data. */ + /* + * if we are established, and we have used up our receive + * buffer memory, drop the frame + */ + if (asoc->state == SCTP_STATE_ESTABLISHED) { + /* + * If the receive buffer policy is 1, then each + * association can allocate up to sk_rcvbuf bytes + * otherwise, all the associations in aggregate + * may allocate up to sk_rcvbuf bytes + */ + if (asoc->ep->rcvbuf_policy) + account_value = atomic_read(&asoc->rmem_alloc); + else + account_value = atomic_read(&sk->sk_rmem_alloc); + + if (account_value > sk->sk_rcvbuf) + return SCTP_IERROR_IGNORE_TSN; + } + /* Process ECN based congestion. * * Since the chunk structure is reused for all chunks within diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 4d1b8d8904c..abab81f3818 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5114,8 +5114,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { + sock_rfree(skb); __skb_unlink(skb, &oldsk->sk_receive_queue); __skb_queue_tail(&newsk->sk_receive_queue, skb); + skb_set_owner_r(skb, newsk); } } @@ -5143,8 +5145,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { + sock_rfree(skb); __skb_unlink(skb, &oldsp->pd_lobby); __skb_queue_tail(queue, skb); + skb_set_owner_r(skb, newsk); } } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 75b28dd634f..fcd7096c953 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -120,6 +120,14 @@ static ctl_table sctp_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .ctl_name = NET_SCTP_RCVBUF_POLICY, + .procname = "rcvbuf_policy", + .data = &sctp_rcvbuf_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = NET_SCTP_PATH_MAX_RETRANS, .procname = "path_max_retrans", diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index e049f41faa4..ba97f974f57 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -52,19 +52,6 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, struct sctp_association *asoc); static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); -/* Stub skb destructor. */ -static void sctp_stub_rfree(struct sk_buff *skb) -{ -/* WARNING: This function is just a warning not to use the - * skb destructor. 
If the skb is shared, we may get the destructor - * callback on some processor that does not own the sock_lock. This - * was occuring with PACKET socket applications that were monitoring - * our skbs. We can't take the sock_lock, because we can't risk - * recursing if we do really own the sock lock. Instead, do all - * of our rwnd manipulation while we own the sock_lock outright. - */ -} - /* Initialize an ULP event from an given skb. */ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) { @@ -111,15 +98,19 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, */ sctp_association_hold((struct sctp_association *)asoc); skb = sctp_event2skb(event); - skb->sk = asoc->base.sk; event->asoc = (struct sctp_association *)asoc; - skb->destructor = sctp_stub_rfree; + atomic_add(skb->truesize, &event->asoc->rmem_alloc); + skb_set_owner_r(skb, asoc->base.sk); } /* A simple destructor to give up the reference to the association. */ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) { - sctp_association_put(event->asoc); + struct sctp_association *asoc = event->asoc; + struct sk_buff *skb = sctp_event2skb(event); + + atomic_sub(skb->truesize, &asoc->rmem_alloc); + sctp_association_put(asoc); } /* Create and initialize an SCTP_ASSOC_CHANGE event. @@ -922,7 +913,6 @@ done: /* Free a ulpevent that has an owner. It includes releasing the reference * to the owner, updating the rwnd in case of a DATA event and freeing the * skb. - * See comments in sctp_stub_rfree(). */ void sctp_ulpevent_free(struct sctp_ulpevent *event) { -- cgit v1.2.3 From efacfbcb6c88677809f44a574fbcd9824835dccb Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 12 Nov 2005 12:12:05 -0800 Subject: [IPV6]: Fix rtnetlink dump infinite loop The recent change to netlink dump "done" callback handling broke IPv6 which played dirty tricks with the "done" callback. This causes an infinite loop during a dump. The following patch fixes it. This bug was reported by Jeff Garzik. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f7f42c3e96c..a7a537b5059 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1701,10 +1701,8 @@ static void fib6_dump_end(struct netlink_callback *cb) fib6_walker_unlink(w); kfree(w); } - if (cb->args[1]) { - cb->done = (void*)cb->args[1]; - cb->args[1] = 0; - } + cb->done = (void*)cb->args[1]; + cb->args[1] = 0; } static int fib6_dump_done(struct netlink_callback *cb) -- cgit v1.2.3 From a2d7222f0f5861ce13b9308c30bd18f28ebeb583 Mon Sep 17 00:00:00 2001 From: Vlad Drukker Date: Sat, 12 Nov 2005 12:13:14 -0800 Subject: [NETFILTER] {ip,nf}_conntrack TCP: Accept SYN+PUSH like SYN Some devices (e.g. Qlogic iSCSI HBA hardware like QLA4010 up to firmware 3.0.0.4) initiates TCP with SYN and PUSH flags set. The Linux TCP/IP stack deals fine with that, but the connection tracking code doesn't. This patch alters TCP connection tracking to accept SYN+PUSH as a valid flag combination. Signed-off-by: Vlad Drukker Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 1 + net/netfilter/nf_conntrack_proto_tcp.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 468c6003b4c..5b3f5220f28 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -814,6 +814,7 @@ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_PUSH] = 1, [TH_SYN|TH_ACK|TH_PUSH] = 1, [TH_RST] = 1, [TH_RST|TH_ACK] = 1, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 83d90dd624f..156680ddb04 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -779,6 +779,7 @@ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_PUSH] = 1, [TH_SYN|TH_ACK|TH_PUSH] = 1, [TH_RST] = 1, [TH_RST|TH_ACK] = 1, -- cgit v1.2.3 From 8225ccbaf01b459cf1e462047a51b2851e756bc1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 12 Nov 2005 12:15:16 -0800 Subject: [IPV6]: Fix unnecessary GFP_ATOMIC allocation in fib6 dump Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a7a537b5059..9a71a8d1078 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1732,7 +1732,7 @@ int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) /* * 2. allocate and initialize walker. */ - w = kmalloc(sizeof(*w), GFP_ATOMIC); + w = kmalloc(sizeof(*w), GFP_KERNEL); if (w == NULL) return -ENOMEM; RT6_TRACE("dump<%p", w); -- cgit v1.2.3 From dbd36ea496726460299842fdbeaaa7fff2f0c5c7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Nov 2005 15:21:01 -0800 Subject: [NETFILTER] ctnetlink: use size_t to make gcc-4.x happy Make gcc-4.x happy. Use size_t instead of int. Thanks to Patrick McHardy for the hint. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index d2a4fec2286..853d0ac5534 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -467,7 +467,7 @@ out: } #endif -static const int cta_min_ip[CTA_IP_MAX] = { +static const size_t cta_min_ip[CTA_IP_MAX] = { [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), [CTA_IP_V4_DST-1] = sizeof(u_int32_t), }; @@ -497,7 +497,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) return 0; } -static const int cta_min_proto[CTA_PROTO_MAX] = { +static const size_t cta_min_proto[CTA_PROTO_MAX] = { [CTA_PROTO_NUM-1] = sizeof(u_int16_t), [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), @@ -576,7 +576,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, } #ifdef CONFIG_IP_NF_NAT_NEEDED -static const int cta_min_protonat[CTA_PROTONAT_MAX] = { +static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), }; -- cgit v1.2.3 From 56558208521729fa6b2a0f12df22e1569dee297a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Nov 2005 15:22:11 -0800 Subject: [NETFILTER] ctnetlink: More thorough size checking of attributes Add missing size checks. Thanks Patrick McHardy for the hint. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 39 +++++++++++++++++++++++++++++ net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 7 ++++++ 2 files changed, 46 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 853d0ac5534..f5e5e315867 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -614,6 +614,11 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, return 0; } +static const size_t cta_min_nat[CTA_NAT_MAX] = { + [CTA_NAT_MINIP-1] = sizeof(u_int32_t), + [CTA_NAT_MAXIP-1] = sizeof(u_int32_t), +}; + static inline int ctnetlink_parse_nat(struct nfattr *cda[], const struct ip_conntrack *ct, struct ip_nat_range *range) @@ -627,6 +632,9 @@ ctnetlink_parse_nat(struct nfattr *cda[], nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) + return -EINVAL; + if (tb[CTA_NAT_MINIP-1]) range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); @@ -667,6 +675,14 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) return 0; } +static const size_t cta_min[CTA_MAX] = { + [CTA_STATUS-1] = sizeof(u_int32_t), + [CTA_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_MARK-1] = sizeof(u_int32_t), + [CTA_USE-1] = sizeof(u_int32_t), + [CTA_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -678,6 +694,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -760,6 +779,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, return 0; } + if 
(nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -1047,6 +1069,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) { err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); if (err < 0) @@ -1252,6 +1277,11 @@ out: return skb->len; } +static const size_t cta_min_exp[CTA_EXPECT_MAX] = { + [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_EXPECT_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -1263,6 +1293,9 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct nfgenmsg *msg = NLMSG_DATA(nlh); u32 rlen; @@ -1333,6 +1366,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct ip_conntrack_helper *h; int err; + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (cda[CTA_EXPECT_TUPLE-1]) { /* delete a single expect by tuple */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); @@ -1462,6 +1498,9 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (!cda[CTA_EXPECT_TUPLE-1] || !cda[CTA_EXPECT_MASK-1] || !cda[CTA_EXPECT_MASTER-1]) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 5b3f5220f28..ee3b7d6c4d2 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -357,6 +357,10 @@ nfattr_failure: return -1; } +static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { + [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), +}; + static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) { struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; @@ -369,6 +373,9 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); + if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) + return -EINVAL; + if (!tb[CTA_PROTOINFO_TCP_STATE-1]) return -EINVAL; -- cgit v1.2.3 From 3746a2b1402e7933c7f1eabdce384b8454dc2ef7 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Mon, 14 Nov 2005 15:23:01 -0800 Subject: [NETFILTER] nf_conntrack: Add missing code to TCP conntrack module Looks like the nf_conntrack TCP code was slightly mismerged: it does not contain an else branch present in the IPv4 version. Let's add that code and make the testsuite happy. Signed-off-by: KOVACS Krisztian Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/netfilter/nf_conntrack_proto_tcp.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 156680ddb04..5a6fcf349bd 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -970,6 +970,12 @@ static int tcp_packet(struct nf_conn *conntrack, conntrack->timeout.function((unsigned long) conntrack); return -NF_REPEAT; + } else { + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, + NULL, "nf_ct_tcp: invalid SYN"); + return -NF_ACCEPT; } case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET -- cgit v1.2.3 From 37d2e7a20d745035b600f1a6be56cbb9c7259419 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 14 Nov 2005 15:24:59 -0800 Subject: [NETFILTER] nfnetlink: unconditionally require CAP_NET_ADMIN This patch unconditionally requires CAP_NET_ADMIN for all nfnetlink messages. It also removes the per-message cap_required field, since all existing subsystems use CAP_NET_ADMIN for all their messages anyway. Patrick McHardy owes me a beer if we ever need to re-introduce this. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 21 +++++++-------------- net/netfilter/nfnetlink.c | 28 ++++++++++++---------------- net/netfilter/nfnetlink_log.c | 6 ++---- net/netfilter/nfnetlink_queue.c | 9 +++------ 4 files changed, 24 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index f5e5e315867..de9f4464438 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1543,29 +1543,22 @@ static struct notifier_block ctnl_notifier_exp = { static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, }; static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, }; static struct nfnetlink_subsystem ctnl_subsys = { diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 83f4c53030f..a60c59b9763 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -223,6 +223,12 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, NFNL_SUBSYS_ID(nlh->nlmsg_type), NFNL_MSG_TYPE(nlh->nlmsg_type)); + if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { + DEBUGP("missing CAP_NET_ADMIN\n"); + *errp = -EPERM; + return -1; + } + /* Only requests 
are handled by kernel now. */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { DEBUGP("received non-request message\n"); @@ -240,15 +246,12 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, ss = nfnetlink_get_subsys(type); if (!ss) { #ifdef CONFIG_KMOD - if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { - /* don't call nfnl_shunlock, since it would reenter - * with further packet processing */ - up(&nfnl_sem); - request_module("nfnetlink-subsys-%d", - NFNL_SUBSYS_ID(type)); - nfnl_shlock(); - ss = nfnetlink_get_subsys(type); - } + /* don't call nfnl_shunlock, since it would reenter + * with further packet processing */ + up(&nfnl_sem); + request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); + nfnl_shlock(); + ss = nfnetlink_get_subsys(type); if (!ss) #endif goto err_inval; @@ -260,13 +263,6 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, goto err_inval; } - if (nc->cap_required && - !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) { - DEBUGP("permission denied for type %d\n", type); - *errp = -EPERM; - return -1; - } - { u_int16_t attr_count = ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index d194676f365..cba63729313 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -862,11 +862,9 @@ out_put: static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, - .attr_count = NFULA_MAX, - .cap_required = CAP_NET_ADMIN, }, + .attr_count = NFULA_MAX, }, [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, - .attr_count = NFULA_CFG_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFULA_CFG_MAX, }, }; static struct nfnetlink_subsystem nfulnl_subsys = { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f065a6c9495..f28460b61e4 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -931,14 +931,11 @@ out_put: static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, - .attr_count = NFQA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_MAX, }, [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, - .attr_count = NFQA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_MAX, }, [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, - .attr_count = NFQA_CFG_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_CFG_MAX, }, }; static struct nfnetlink_subsystem nfqnl_subsys = { -- cgit v1.2.3 From 47d4305bf2275f82a51fa025257c2c1996356d6b Mon Sep 17 00:00:00 2001 From: Krzysztof Oledzki Date: Mon, 14 Nov 2005 15:25:59 -0800 Subject: [NETFILTER]: link 'netfilter' before ipv4 Staticaly linked nf_conntrack_ipv4 requires nf_conntrack. but currently nf_conntrack is linked after it. This changes the order of ipv4 and netfilter to fix this. Signed-off-by: Krzysztof Oledzki Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/Makefile b/net/Makefile index 4aa2f46d2a5..f5141b9d4f3 100644 --- a/net/Makefile +++ b/net/Makefile @@ -15,8 +15,8 @@ obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ -obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ ifneq ($(CONFIG_IPV6),) -- cgit v1.2.3 From 9bdf87d90bbd1a3e3183ac116a6a9d861f32baca Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Mon, 14 Nov 2005 15:26:58 -0800 Subject: [NETFILTER]: cleanup IPv6 Netfilter Kconfig This removes linux 2.4 configs in comments as TODO lists. And this also move the entry of nf_conntrack to top like IPv4 Netfilter Kconfig. Based on original patch by Krzysztof Piotr Oledzki . Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv6/netfilter/Kconfig | 50 +++++++++++++--------------------------------- 1 file changed, 14 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 971ba60bf6e..060d6120241 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,10 +5,20 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)" depends on INET && IPV6 && NETFILTER && EXPERIMENTAL -#tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP6_NF_CONNTRACK -#if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then -# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK -#fi +config NF_CONNTRACK_IPV6 + tristate "IPv6 support for new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv6 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + config IP6_NF_QUEUE tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" ---help--- @@ -114,7 +124,6 @@ config IP6_NF_MATCH_OWNER To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' MAC address match support' CONFIG_IP6_NF_MATCH_MAC $CONFIG_IP6_NF_IPTABLES config IP6_NF_MATCH_MARK tristate "netfilter MARK match support" depends on IP6_NF_IPTABLES @@ -170,15 +179,6 @@ config IP6_NF_MATCH_PHYSDEV To compile it as a module, choose M here. If unsure, say N. 
-# dep_tristate ' Multiple port match support' CONFIG_IP6_NF_MATCH_MULTIPORT $CONFIG_IP6_NF_IPTABLES -# dep_tristate ' TOS match support' CONFIG_IP6_NF_MATCH_TOS $CONFIG_IP6_NF_IPTABLES -# if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then -# dep_tristate ' Connection state match support' CONFIG_IP6_NF_MATCH_STATE $CONFIG_IP6_NF_CONNTRACK $CONFIG_IP6_NF_IPTABLES -# fi -# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then -# dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_UNCLEAN $CONFIG_IP6_NF_IPTABLES -# dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_OWNER $CONFIG_IP6_NF_IPTABLES -# fi # The targets config IP6_NF_FILTER tristate "Packet filtering" @@ -220,12 +220,6 @@ config IP6_NF_TARGET_NFQUEUE To compile it as a module, choose M here. If unsure, say N. -# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then -# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER -# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then -# dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP6_NF_TARGET_MIRROR $CONFIG_IP6_NF_FILTER -# fi -# fi config IP6_NF_MANGLE tristate "Packet mangling" depends on IP6_NF_IPTABLES @@ -236,7 +230,6 @@ config IP6_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' TOS target support' CONFIG_IP6_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE config IP6_NF_TARGET_MARK tristate "MARK target support" depends on IP6_NF_MANGLE @@ -266,7 +259,6 @@ config IP6_NF_TARGET_HL To compile it as a module, choose M here. If unsure, say N. -#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES config IP6_NF_RAW tristate 'raw table support (required for TRACE)' depends on IP6_NF_IPTABLES @@ -278,19 +270,5 @@ config IP6_NF_RAW If you want to compile it as a module, say M here and read . If unsure, say `N'. -config NF_CONNTRACK_IPV6 - tristate "IPv6 support for new connection tracking (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_CONNTRACK - ---help--- - Connection tracking keeps a record of what packets have passed - through your machine, in order to figure out how they are related - into connections. - - This is IPv6 support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - To compile it as a module, choose M here. If unsure, say N. - endmenu -- cgit v1.2.3 From 7686a02c0ebc11e4f881fe14db3df18569b7dbc1 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Mon, 14 Nov 2005 15:27:43 -0800 Subject: [NETFILTER]: fix type of sysctl variables in nf_conntrack_ipv6 These variables should be unsigned. This fixes sysctl handler for nf_ct_frag6_{low,high}_thresh. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 8 ++++---- net/ipv6/netfilter/nf_conntrack_reasm.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index e2c90b3a807..753a3ae8502 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -339,8 +339,8 @@ extern unsigned long nf_ct_icmpv6_timeout; /* From nf_conntrack_frag6.c */ extern unsigned long nf_ct_frag6_timeout; -extern unsigned long nf_ct_frag6_low_thresh; -extern unsigned long nf_ct_frag6_high_thresh; +extern unsigned int nf_ct_frag6_low_thresh; +extern unsigned int nf_ct_frag6_high_thresh; static struct ctl_table_header *nf_ct_ipv6_sysctl_header; @@ -367,7 +367,7 @@ static ctl_table nf_ct_sysctl_table[] = { .data = &nf_ct_frag6_low_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = &proc_dointvec, }, { .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, @@ -375,7 +375,7 @@ static ctl_table nf_ct_sysctl_table[] = { .data = &nf_ct_frag6_high_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = &proc_dointvec, }, { .ctl_name = 0 } }; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 7640b9bb769..ed7603fe5fe 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -55,9 +55,9 @@ #define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */ #define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT -int nf_ct_frag6_high_thresh = 256*1024; -int nf_ct_frag6_low_thresh = 192*1024; -int nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT; +unsigned int nf_ct_frag6_high_thresh = 256*1024; +unsigned int nf_ct_frag6_low_thresh = 192*1024; +unsigned long nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT; struct nf_ct_frag6_skb_cb { -- cgit v1.2.3 From 1ba430bc3e243d38c0bb2b185bea664b04fc59df Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Mon, 14 Nov 2005 15:28:18 -0800 Subject: [NETFILTER] nf_conntrack: fix possibility of infinite loop while evicting nf_ct_frag6_queue This synchronizes nf_ct_reasm with ipv6 reassembly, and fixes a possibility of an infinite loop if CPUs evict and create nf_ct_frag6_queue in parallel. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 42 ++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index ed7603fe5fe..1b68d714c0a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -190,8 +190,10 @@ static void nf_ct_frag6_secret_rebuild(unsigned long dummy) atomic_t nf_ct_frag6_mem = ATOMIC_INIT(0); /* Memory Tracking Functions. 
*/ -static inline void frag_kfree_skb(struct sk_buff *skb) +static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work) { + if (work) + *work -= skb->truesize; atomic_sub(skb->truesize, &nf_ct_frag6_mem); if (NFCT_FRAG6_CB(skb)->orig) kfree_skb(NFCT_FRAG6_CB(skb)->orig); @@ -199,8 +201,11 @@ static inline void frag_kfree_skb(struct sk_buff *skb) kfree_skb(skb); } -static inline void frag_free_queue(struct nf_ct_frag6_queue *fq) +static inline void frag_free_queue(struct nf_ct_frag6_queue *fq, + unsigned int *work) { + if (work) + *work -= sizeof(struct nf_ct_frag6_queue); atomic_sub(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem); kfree(fq); } @@ -218,7 +223,8 @@ static inline struct nf_ct_frag6_queue *frag_alloc_queue(void) /* Destruction primitives. */ /* Complete destruction of fq. */ -static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq) +static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq, + unsigned int *work) { struct sk_buff *fp; @@ -230,17 +236,17 @@ static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq) while (fp) { struct sk_buff *xp = fp->next; - frag_kfree_skb(fp); + frag_kfree_skb(fp, work); fp = xp; } - frag_free_queue(fq); + frag_free_queue(fq, work); } -static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) +static __inline__ void fq_put(struct nf_ct_frag6_queue *fq, unsigned int *work) { if (atomic_dec_and_test(&fq->refcnt)) - nf_ct_frag6_destroy(fq); + nf_ct_frag6_destroy(fq, work); } /* Kill fq entry. It is not destroyed immediately, @@ -262,10 +268,14 @@ static void nf_ct_frag6_evictor(void) { struct nf_ct_frag6_queue *fq; struct list_head *tmp; + unsigned int work; - for (;;) { - if (atomic_read(&nf_ct_frag6_mem) <= nf_ct_frag6_low_thresh) - return; + work = atomic_read(&nf_ct_frag6_mem); + if (work <= nf_ct_frag6_low_thresh) + return; + + work -= nf_ct_frag6_low_thresh; + while (work > 0) { read_lock(&nf_ct_frag6_lock); if (list_empty(&nf_ct_frag6_lru_list)) { read_unlock(&nf_ct_frag6_lock); @@ -281,7 +291,7 @@ static void nf_ct_frag6_evictor(void) fq_kill(fq); spin_unlock(&fq->lock); - fq_put(fq); + fq_put(fq, &work); } } @@ -298,7 +308,7 @@ static void nf_ct_frag6_expire(unsigned long data) out: spin_unlock(&fq->lock); - fq_put(fq); + fq_put(fq, NULL); } /* Creation primitives. */ @@ -318,7 +328,7 @@ static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash, atomic_inc(&fq->refcnt); write_unlock(&nf_ct_frag6_lock); fq_in->last_in |= COMPLETE; - fq_put(fq_in); + fq_put(fq_in, NULL); return fq; } } @@ -535,7 +545,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, fq->fragments = next; fq->meat -= free_it->len; - frag_kfree_skb(free_it); + frag_kfree_skb(free_it, NULL); } } @@ -811,7 +821,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { spin_unlock(&fq->lock); DEBUGP("Can't insert skb to queue\n"); - fq_put(fq); + fq_put(fq, NULL); goto ret_orig; } @@ -822,7 +832,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) } spin_unlock(&fq->lock); - fq_put(fq); + fq_put(fq, NULL); return ret_skb; ret_orig: -- cgit v1.2.3 From 302fe1758d85ad9c868e77625f61b7edad106381 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Mon, 14 Nov 2005 15:28:45 -0800 Subject: [NETFILTER] fix leak of fragment queue at unloading nf_conntrack_ipv6 This patch makes nf_conntrack_ipv6 free all IPv6 fragment queues at module unloading time. Also introduce a BUG_ON if we ever again have leaks in the memory accounting. 
Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 1b68d714c0a..c2c52af9e56 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -282,6 +282,7 @@ static void nf_ct_frag6_evictor(void) return; } tmp = nf_ct_frag6_lru_list.next; + BUG_ON(tmp == NULL); fq = list_entry(tmp, struct nf_ct_frag6_queue, lru_list); atomic_inc(&fq->refcnt); read_unlock(&nf_ct_frag6_lock); @@ -891,5 +892,6 @@ int nf_ct_frag6_init(void) void nf_ct_frag6_cleanup(void) { del_timer(&nf_ct_frag6_secret_timer); + nf_ct_frag6_low_thresh = 0; nf_ct_frag6_evictor(); } -- cgit v1.2.3 From 12da2a435c1e8b9abb780d928ccbf04fc3d860a7 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Mon, 14 Nov 2005 21:42:46 -0800 Subject: [IPV6]: small fix for ipv6_dev_get_saddr(...) The "score.rule++" doesn't make any sense for me. According to codes above, I think it should be "hiscore.rule++;" . Signed-off-by: Yan Zheng Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ddcf7754eec..56a09a4ac41 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1045,9 +1045,10 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, } #endif /* Rule 8: Use longest matching prefix */ - if (hiscore.rule < 8) + if (hiscore.rule < 8) { hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr); - score.rule++; + hiscore.rule++; + } score.matchlen = ipv6_addr_diff(&ifa->addr, daddr); if (score.matchlen > hiscore.matchlen) { score.rule = 8; -- cgit v1.2.3 From cb422c464bdaeeb3b9ad4539010e357bf1bd1745 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 14 Nov 2005 21:43:36 -0800 Subject: [IPV6]: Fixes sparse warning in ipv6/ipv6_sockglue.c The patch below fixes the following sparse warning: net/ipv6/ipv6_sockglue.c:291:13: warning: Using plain integer as NULL pointer Signed-off-by: Luiz Capitulino Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 003fd99ff59..25757ade989 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -287,7 +287,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, { struct ipv6_txoptions *opt; if (optlen == 0) - optval = 0; + optval = NULL; /* hop-by-hop / destination options are privileged option */ retv = -EPERM; -- cgit v1.2.3 From 59c6196e59a4b85d9c994e70ff20a460cdbaa003 Mon Sep 17 00:00:00 2001 From: Jochen Friedrich Date: Mon, 14 Nov 2005 21:57:15 -0800 Subject: [LLC]: Fix TX window scaling Signed-off-by: Jochen Friedrich Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/llc/llc_c_ac.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index b0bcfb1f12d..91fb6bc1b11 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -866,7 +866,8 @@ int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb) llc->ack_must_be_send = 1; llc->ack_pf = pf_bit & 1; } - if (((llc->vR - llc->first_pdu_Ns + 129) % 128) >= llc->npta) { + if (((llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO) + % LLC_2_SEQ_NBR_MODULO) >= llc->npta) { llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb); llc->ack_must_be_send = 0; llc->ack_pf = 0; @@ -994,8 +995,8 @@ static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb) llc->dec_step = 0; llc->dec_cntr = llc->inc_cntr = 2; ++llc->npta; - if (llc->npta > 127) - llc->npta = 127 ; + if (llc->npta > ~LLC_2_SEQ_NBR_MODULO) + llc->npta = ~LLC_2_SEQ_NBR_MODULO ; } else --llc->inc_cntr; return 0; @@ -1065,9 +1066,10 @@ int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q); - llc->k -= unacked_pdu; - if (llc->k < 2) - llc->k = 2; + if (llc->k - unacked_pdu < 1) + llc->k = 1; + else + llc->k -= unacked_pdu; return 0; } @@ -1084,8 +1086,8 @@ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); llc->k += 1; - if (llc->k > 128) - llc->k = 128 ; + if (llc->k > ~LLC_2_SEQ_NBR_MODULO) + llc->k = ~LLC_2_SEQ_NBR_MODULO ; return 0; } @@ -1309,7 +1311,7 @@ int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb) static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb) { - llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % 128; + llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % LLC_2_SEQ_NBR_MODULO; return 0; } -- cgit v1.2.3 From 451677c46feb5fb39cb7f71035b8716064fcbd57 Mon Sep 17 00:00:00 2001 From: Jochen Friedrich Date: Mon, 14 Nov 2005 21:57:46 -0800 Subject: [LLC]: Make core block on remote busy. Signed-off-by: Jochen Friedrich Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/llc/af_llc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 59d02cbbeb9..c3f0b078345 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -116,7 +116,9 @@ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) struct llc_sock* llc = llc_sk(sk); int rc = 0; - if (unlikely(llc_data_accept_state(llc->state) || llc->p_flag)) { + if (unlikely(llc_data_accept_state(llc->state) || + llc->remote_busy_flag || + llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); rc = llc_ui_wait_for_busy_core(sk, timeout); @@ -542,6 +544,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) if (sk_wait_event(sk, &timeout, (sk->sk_shutdown & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && + !llc->remote_busy_flag && !llc->p_flag))) break; rc = -ERESTARTSYS; -- cgit v1.2.3 From 1887b93529410633b5529a7c2d304897dbed5b3e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 15 Nov 2005 00:09:10 -0800 Subject: [PATCH] knfsd: make sure nfsd doesn't hog a cpu forever Being kernel-threads, nfsd servers don't get pre-empted (depending on CONFIG). If there is a steady stream of NFS requests that can be served from cache, an nfsd thread may hold on to a cpu indefinitely, which isn't very friendly. 
So it is good to have a cond_resched in there (just before looking for a new request to serve), to make sure we play nice. Signed-off-by: Neil Brown Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/svcsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e50e7cf4373..c6a51911e71 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1178,6 +1178,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) arg->tail[0].iov_len = 0; try_to_freeze(); + cond_resched(); if (signalled()) return -EINTR; -- cgit v1.2.3 From 31f3426904e066f17e3f88c468a2f7c869ad4aac Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 15 Nov 2005 15:17:10 -0800 Subject: [TCP]: More spelling fixes. From Joe Perches Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 40a26b7157b..bf2e23086bc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -367,7 +367,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the - * non-timestamp case, we do not smoother things out + * non-timestamp case, we do not smooth things out * else with timestamps disabled convergence takes too * long. */ @@ -546,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be increased fastly, decrease too fastly + * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) -- cgit v1.2.3 From 96479376c89e5be92c85bd350e3e2e8f0e7e3b52 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Tue, 15 Nov 2005 16:47:09 -0800 Subject: [NETFILTER] Remove nf_conntrack stat proc file when cleaning up Fix nf_conntrack statistics proc file removal. Looks like the old bug was forward-ported from ip_conntrack. :-] Signed-off-by: KOVACS Krisztian Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_standalone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 45224db4fe2..5af381f9fe3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -694,7 +694,7 @@ static int init_or_cleanup(int init) cleanup_proc_stat: #endif #ifdef CONFIG_PROC_FS - proc_net_remove("nf_conntrack_stat"); + remove_proc_entry("nf_conntrack", proc_net_stat); cleanup_proc_exp: proc_net_remove("nf_conntrack_expect"); cleanup_proc: -- cgit v1.2.3 From 5a6f294e43e432bd207a702fea49ebb303ef9b23 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Tue, 15 Nov 2005 16:47:34 -0800 Subject: [NETFILTER] Free layer-3 specific protocol tables at cleanup Although the comment around the allocation code tells us that the layer-3 specific protocol tables will be freed when cleaning up, they aren't. And this makes nfsim complain loudly... Signed-off-by: KOVACS Krisztian Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/netfilter/nf_conntrack_core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9a67c796b38..ea094b231d6 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1395,6 +1395,13 @@ void nf_conntrack_cleanup(void) kmem_cache_destroy(nf_conntrack_expect_cachep); free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, nf_conntrack_htable_size); + + /* free l3proto protocol tables */ + for (i = 0; i < PF_MAX; i++) + if (nf_ct_protos[i]) { + kfree(nf_ct_protos[i]); + nf_ct_protos[i] = NULL; + } } static struct list_head *alloc_hashtable(int size, int *vmalloced) -- cgit v1.2.3 From e7c8a41e817f381ac5c2a59ecc81b483bd68a7df Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 16 Nov 2005 12:55:37 -0800 Subject: [IPV4,IPV6]: replace handmade list with hlist in IPv{4,6} reassembly Both of ipq and frag_queue have *next and **prev, and they can be replaced with hlist. Thanks Arnaldo Carvalho de Melo for the suggestion. Signed-off-by: Yasuyuki Kozakai Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 40 ++++++++++++++-------------------------- net/ipv6/reassembly.c | 41 ++++++++++++++++------------------------- 2 files changed, 30 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e7d26d9943c..8ce0ce2ee48 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -71,7 +71,7 @@ struct ipfrag_skb_cb /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { - struct ipq *next; /* linked list pointers */ + struct hlist_node list; struct list_head lru_list; /* lru list member */ u32 user; u32 saddr; @@ -89,7 +89,6 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - struct ipq **pprev; int iif; struct timeval stamp; }; @@ -99,7 +98,7 @@ struct ipq { #define IPQ_HASHSZ 64 /* Per-bucket lock is easy to add now. */ -static struct ipq *ipq_hash[IPQ_HASHSZ]; +static struct hlist_head ipq_hash[IPQ_HASHSZ]; static DEFINE_RWLOCK(ipfrag_lock); static u32 ipfrag_hash_rnd; static LIST_HEAD(ipq_lru_list); @@ -107,9 +106,7 @@ int ip_frag_nqueues = 0; static __inline__ void __ipq_unlink(struct ipq *qp) { - if(qp->next) - qp->next->pprev = qp->pprev; - *qp->pprev = qp->next; + hlist_del(&qp->list); list_del(&qp->lru_list); ip_frag_nqueues--; } @@ -139,27 +136,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy) get_random_bytes(&ipfrag_hash_rnd, sizeof(u32)); for (i = 0; i < IPQ_HASHSZ; i++) { struct ipq *q; + struct hlist_node *p, *n; - q = ipq_hash[i]; - while (q) { - struct ipq *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) { unsigned int hval = ipqhashfn(q->id, q->saddr, q->daddr, q->protocol); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. 
*/ - if ((q->next = ipq_hash[hval]) != NULL) - q->next->pprev = &q->next; - ipq_hash[hval] = q; - q->pprev = &ipq_hash[hval]; + hlist_add_head(&q->list, &ipq_hash[hval]); } - - q = next; } } write_unlock(&ipfrag_lock); @@ -310,14 +298,16 @@ out: static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) { struct ipq *qp; - +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&ipfrag_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we * promoted read lock to write lock. */ - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == qp_in->id && qp->saddr == qp_in->saddr && qp->daddr == qp_in->daddr && @@ -337,10 +327,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); - if((qp->next = ipq_hash[hash]) != NULL) - qp->next->pprev = &qp->next; - ipq_hash[hash] = qp; - qp->pprev = &ipq_hash[hash]; + hlist_add_head(&qp->list, &ipq_hash[hash]); INIT_LIST_HEAD(&qp->lru_list); list_add_tail(&qp->lru_list, &ipq_lru_list); ip_frag_nqueues++; @@ -392,9 +379,10 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) __u8 protocol = iph->protocol; unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); struct ipq *qp; + struct hlist_node *n; read_lock(&ipfrag_lock); - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == id && qp->saddr == saddr && qp->daddr == daddr && diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e4fe9ee484d..5d316cb72ec 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -74,7 +74,7 @@ struct ip6frag_skb_cb struct frag_queue { - struct frag_queue *next; + struct hlist_node list; struct list_head lru_list; /* lru list member */ __u32 id; /* fragment id */ @@ -95,14 +95,13 @@ struct frag_queue #define FIRST_IN 2 #define LAST_IN 1 __u16 nhoffset; - struct frag_queue **pprev; }; /* Hash table. */ #define IP6Q_HASHSZ 64 -static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ]; +static struct hlist_head ip6_frag_hash[IP6Q_HASHSZ]; static DEFINE_RWLOCK(ip6_frag_lock); static u32 ip6_frag_hash_rnd; static LIST_HEAD(ip6_frag_lru_list); @@ -110,9 +109,7 @@ int ip6_frag_nqueues = 0; static __inline__ void __fq_unlink(struct frag_queue *fq) { - if(fq->next) - fq->next->pprev = fq->pprev; - *fq->pprev = fq->next; + hlist_del(&fq->list); list_del(&fq->lru_list); ip6_frag_nqueues--; } @@ -163,28 +160,21 @@ static void ip6_frag_secret_rebuild(unsigned long dummy) get_random_bytes(&ip6_frag_hash_rnd, sizeof(u32)); for (i = 0; i < IP6Q_HASHSZ; i++) { struct frag_queue *q; + struct hlist_node *p, *n; - q = ip6_frag_hash[i]; - while (q) { - struct frag_queue *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ip6_frag_hash[i], list) { unsigned int hval = ip6qhashfn(q->id, &q->saddr, &q->daddr); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. 
*/ - if ((q->next = ip6_frag_hash[hval]) != NULL) - q->next->pprev = &q->next; - ip6_frag_hash[hval] = q; - q->pprev = &ip6_frag_hash[hval]; - } + hlist_add_head(&q->list, + &ip6_frag_hash[hval]); - q = next; + } } } write_unlock(&ip6_frag_lock); @@ -337,10 +327,13 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash, struct frag_queue *fq_in) { struct frag_queue *fq; +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&ip6_frag_lock); #ifdef CONFIG_SMP - for (fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { if (fq->id == fq_in->id && ipv6_addr_equal(&fq_in->saddr, &fq->saddr) && ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) { @@ -358,10 +351,7 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash, atomic_inc(&fq->refcnt); atomic_inc(&fq->refcnt); - if((fq->next = ip6_frag_hash[hash]) != NULL) - fq->next->pprev = &fq->next; - ip6_frag_hash[hash] = fq; - fq->pprev = &ip6_frag_hash[hash]; + hlist_add_head(&fq->list, &ip6_frag_hash[hash]); INIT_LIST_HEAD(&fq->lru_list); list_add_tail(&fq->lru_list, &ip6_frag_lru_list); ip6_frag_nqueues++; @@ -401,10 +391,11 @@ static __inline__ struct frag_queue * fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) { struct frag_queue *fq; + struct hlist_node *n; unsigned int hash = ip6qhashfn(id, src, dst); read_lock(&ip6_frag_lock); - for(fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { if (fq->id == id && ipv6_addr_equal(src, &fq->saddr) && ipv6_addr_equal(dst, &fq->daddr)) { -- cgit v1.2.3 From 4a59a810513d5f7aa76515908b8e3620fa1b9b69 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 16 Nov 2005 23:14:19 -0800 Subject: [NETFILTER]: Fix nf_conntrack compilation with CONFIG_NETFILTER_DEBUG CC [M] net/netfilter/nf_conntrack_core.o net/netfilter/nf_conntrack_core.c: In function 'nf_ct_unlink_expect': net/netfilter/nf_conntrack_core.c:390: error: 'exp_timeout' undeclared (first use in this function) net/netfilter/nf_conntrack_core.c:390: error: (Each undeclared identifier is reported only once net/netfilter/nf_conntrack_core.c:390: error: for each function it appears in.) Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ea094b231d6..1da678303d7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -387,7 +387,7 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { ASSERT_WRITE_LOCK(&nf_conntrack_lock); - NF_CT_ASSERT(!timer_pending(&exp_timeout)); + NF_CT_ASSERT(!timer_pending(&exp->timeout)); list_del(&exp->list); NF_CT_STAT_INC(expect_delete); exp->master->expecting--; -- cgit v1.2.3 From bd6af700a7191f483f41706467033588f28c8877 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 17 Nov 2005 14:11:18 -0800 Subject: [TCP]: TCP highspeed build error There is a compile error that crept in with the last patch of TCP patches. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp_highspeed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 82b3c189bd7..63cf7e54084 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -111,7 +111,7 @@ static void hstcp_init(struct sock *sk) } static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, - u32 in_flight, u32 pkts_acked) + u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); -- cgit v1.2.3 From 2fce76afdb067fa3e7f8ee33c9fe366bd65887ea Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Thu, 17 Nov 2005 15:06:47 -0800 Subject: [NETFILTER] ip_conntrack: fix ftp/irc/tftp helpers on ports >= 32768 Since we've converted the ftp/irc/tftp helpers to use the new module_parm_array() some time ago, we ware accidentially using signed data types - thus preventing those modules from being used on ports >= 32768. This patch fixes it by using 'ushort' module parameters. Thanks to Jan Nijs for reporting this bug. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_ftp.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_irc.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_tftp.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index d77d6b3f5f8..59e12b02b22 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -29,9 +29,9 @@ static char *ftp_buffer; static DEFINE_SPINLOCK(ip_ftp_lock); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); static int loose; module_param(loose, int, 0600); diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 15457415a4f..2dea1db1440 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -34,7 +34,7 @@ #include #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; static int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; @@ -52,7 +52,7 @@ EXPORT_SYMBOL_GPL(ip_nat_irc_hook); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_LICENSE("GPL"); -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); module_param(max_dcc_channels, int, 0400); MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c index a78736b8525..d3c5a371f99 100644 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -26,9 +26,9 @@ MODULE_DESCRIPTION("tftp connection tracking helper"); MODULE_LICENSE("GPL"); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of tftp servers"); #if 0 -- cgit v1.2.3 From 381998241fd1fc635596f4e8ae835f0d64ca1ba2 Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Thu, 17 Nov 2005 15:17:42 -0800 Subject: [LLC]: Fix compiler warnings introduced by TX window scaling changes. Noticed by Olaf Hering. The comparisons want a u8 here (the data type on the left-hand branch is a u8 structure member, and the constant on the right-hand branch is "~((u8) 128)"), but C turns it into an integer so we get: net/llc/llc_c_ac.c: In function `llc_conn_ac_inc_npta_value': net/llc/llc_c_ac.c:998: warning: comparison is always true due to limited range of data type net/llc/llc_c_ac.c:999: warning: large integer implicitly truncated to unsigned type Fix this up by explicitly recasting the right-hand branch constant into a "u8" once more. Signed-off-by: David S. Miller --- net/llc/llc_c_ac.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index 91fb6bc1b11..8169f24ed33 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -995,8 +995,8 @@ static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb) llc->dec_step = 0; llc->dec_cntr = llc->inc_cntr = 2; ++llc->npta; - if (llc->npta > ~LLC_2_SEQ_NBR_MODULO) - llc->npta = ~LLC_2_SEQ_NBR_MODULO ; + if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO) + llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO; } else --llc->inc_cntr; return 0; @@ -1086,8 +1086,8 @@ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); llc->k += 1; - if (llc->k > ~LLC_2_SEQ_NBR_MODULO) - llc->k = ~LLC_2_SEQ_NBR_MODULO ; + if (llc->k > (u8) ~LLC_2_SEQ_NBR_MODULO) + llc->k = (u8) ~LLC_2_SEQ_NBR_MODULO; return 0; } -- cgit v1.2.3 From 05b8b0fafd4cac75d205ecd5ad40992e2cc5934d Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Thu, 17 Nov 2005 15:22:39 -0800 Subject: [NET]: Sanitize NET_SCHED protection in /net/sched/Kconfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Thu, 17 Nov 2005, David Gómez wrote: > I found out that if i select NET_CLS_ROUTE4, save my changes and exit > menuconfig, execute again make menuconfig and go to QoS options, then the new > available options are visible. So menuconfig has some problem refreshing > contents :? No, they were there before too, but you have to go up one level to see them. It's better in 2.6.15-rc1-git5, but the menu structure is still a little messed up, the patch below properly indents all menu entries. Signed-off-by: Roman Zippel Signed-off-by: David S. Miller --- net/sched/Kconfig | 37 ++++++++----------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 7f34e7fd767..55cd5327fbd 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -40,9 +40,10 @@ config NET_SCHED The available schedulers are listed in the following questions; you can say Y to as many as you like. If unsure, say N now. +if NET_SCHED + choice prompt "Packet scheduler clock source" - depends on NET_SCHED default NET_SCH_CLK_JIFFIES ---help--- Packet schedulers need a monotonic clock that increments at a static @@ -98,11 +99,9 @@ config NET_SCH_CLK_CPU endchoice comment "Queueing/Scheduling" - depends on NET_SCHED config NET_SCH_CBQ tristate "Class Based Queueing (CBQ)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Class-Based Queueing (CBQ) packet scheduling algorithm. 
This algorithm classifies the waiting packets @@ -120,7 +119,6 @@ config NET_SCH_CBQ config NET_SCH_HTB tristate "Hierarchical Token Bucket (HTB)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Hierarchical Token Buckets (HTB) packet scheduling algorithm. See @@ -135,7 +133,6 @@ config NET_SCH_HTB config NET_SCH_HFSC tristate "Hierarchical Fair Service Curve (HFSC)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Hierarchical Fair Service Curve (HFSC) packet scheduling algorithm. @@ -145,7 +142,7 @@ config NET_SCH_HFSC config NET_SCH_ATM tristate "ATM Virtual Circuits (ATM)" - depends on NET_SCHED && ATM + depends on ATM ---help--- Say Y here if you want to use the ATM pseudo-scheduler. This provides a framework for invoking classifiers, which in turn @@ -159,7 +156,6 @@ config NET_SCH_ATM config NET_SCH_PRIO tristate "Multi Band Priority Queueing (PRIO)" - depends on NET_SCHED ---help--- Say Y here if you want to use an n-band priority queue packet scheduler. @@ -169,7 +165,6 @@ config NET_SCH_PRIO config NET_SCH_RED tristate "Random Early Detection (RED)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Random Early Detection (RED) packet scheduling algorithm. @@ -181,7 +176,6 @@ config NET_SCH_RED config NET_SCH_SFQ tristate "Stochastic Fairness Queueing (SFQ)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) packet scheduling algorithm . @@ -193,7 +187,6 @@ config NET_SCH_SFQ config NET_SCH_TEQL tristate "True Link Equalizer (TEQL)" - depends on NET_SCHED ---help--- Say Y here if you want to use the True Link Equalizer (TLE) packet scheduling algorithm. This queueing discipline allows the combination @@ -206,7 +199,6 @@ config NET_SCH_TEQL config NET_SCH_TBF tristate "Token Bucket Filter (TBF)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Token Bucket Filter (TBF) packet scheduling algorithm. @@ -218,7 +210,6 @@ config NET_SCH_TBF config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Generic Random Early Detection (GRED) packet scheduling algorithm for some of your network devices @@ -230,7 +221,6 @@ config NET_SCH_GRED config NET_SCH_DSMARK tristate "Differentiated Services marker (DSMARK)" - depends on NET_SCHED ---help--- Say Y if you want to schedule packets according to the Differentiated Services architecture proposed in RFC 2475. @@ -242,7 +232,6 @@ config NET_SCH_DSMARK config NET_SCH_NETEM tristate "Network emulator (NETEM)" - depends on NET_SCHED ---help--- Say Y if you want to emulate network delay, loss, and packet re-ordering. This is often useful to simulate networks when @@ -255,7 +244,6 @@ config NET_SCH_NETEM config NET_SCH_INGRESS tristate "Ingress Qdisc" - depends on NET_SCHED ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. @@ -264,14 +252,12 @@ config NET_SCH_INGRESS module will be called sch_ingress. 
comment "Classification" - depends on NET_SCHED config NET_CLS boolean config NET_CLS_BASIC tristate "Elementary classification (BASIC)" - depends NET_SCHED select NET_CLS ---help--- Say Y here if you want to be able to classify packets using @@ -282,7 +268,6 @@ config NET_CLS_BASIC config NET_CLS_TCINDEX tristate "Traffic-Control Index (TCINDEX)" - depends NET_SCHED select NET_CLS ---help--- Say Y here if you want to be able to classify packets based on @@ -294,7 +279,6 @@ config NET_CLS_TCINDEX config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" - depends NET_SCHED select NET_CLS_ROUTE select NET_CLS ---help--- @@ -306,11 +290,9 @@ config NET_CLS_ROUTE4 config NET_CLS_ROUTE bool - default n config NET_CLS_FW tristate "Netfilter mark (FW)" - depends NET_SCHED select NET_CLS ---help--- If you say Y here, you will be able to classify packets @@ -321,7 +303,6 @@ config NET_CLS_FW config NET_CLS_U32 tristate "Universal 32bit comparisons w/ hashing (U32)" - depends NET_SCHED select NET_CLS ---help--- Say Y here to be able to classify packetes using a universal @@ -345,7 +326,6 @@ config CLS_U32_MARK config NET_CLS_RSVP tristate "IPv4 Resource Reservation Protocol (RSVP)" - depends on NET_SCHED select NET_CLS select NET_ESTIMATOR ---help--- @@ -361,7 +341,6 @@ config NET_CLS_RSVP config NET_CLS_RSVP6 tristate "IPv6 Resource Reservation Protocol (RSVP6)" - depends on NET_SCHED select NET_CLS select NET_ESTIMATOR ---help--- @@ -377,7 +356,6 @@ config NET_CLS_RSVP6 config NET_EMATCH bool "Extended Matches" - depends NET_SCHED select NET_CLS ---help--- Say Y here if you want to use extended matches on top of classifiers @@ -456,7 +434,7 @@ config NET_EMATCH_TEXT config NET_CLS_ACT bool "Actions" - depends on EXPERIMENTAL && NET_SCHED + depends on EXPERIMENTAL select NET_ESTIMATOR ---help--- Say Y here if you want to use traffic control actions. Actions @@ -539,7 +517,7 @@ config NET_ACT_SIMP config NET_CLS_POLICE bool "Traffic Policing (obsolete)" - depends on NET_SCHED && NET_CLS_ACT!=y + depends on NET_CLS_ACT!=y select NET_ESTIMATOR ---help--- Say Y here if you want to do traffic policing, i.e. strict @@ -549,7 +527,7 @@ config NET_CLS_POLICE config NET_CLS_IND bool "Incoming device classification" - depends on NET_SCHED && (NET_CLS_U32 || NET_CLS_FW) + depends on NET_CLS_U32 || NET_CLS_FW ---help--- Say Y here to extend the u32 and fw classifier to support classification based on the incoming device. This option is @@ -557,11 +535,12 @@ config NET_CLS_IND config NET_ESTIMATOR bool "Rate estimator" - depends on NET_SCHED ---help--- Say Y here to allow using rate estimators to estimate the current rate-of-flow for network devices, queues, etc. This module is automaticaly selected if needed but can be selected manually for statstical purposes. +endif # NET_SCHED + endmenu -- cgit v1.2.3 From 9e147a1cfce5ec6308b024abe425d5b4e1884a03 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 17 Nov 2005 16:52:51 -0800 Subject: [IPV6]: Fib dump really needs GFP_ATOMIC. Revert: 8225ccbaf01b459cf1e462047a51b2851e756bc1 Based upon a report by Yan Zheng. Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9a71a8d1078..a7a537b5059 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1732,7 +1732,7 @@ int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) /* * 2. allocate and initialize walker. 
*/ - w = kmalloc(sizeof(*w), GFP_KERNEL); + w = kmalloc(sizeof(*w), GFP_ATOMIC); if (w == NULL) return -ENOMEM; RT6_TRACE("dump<%p", w); -- cgit v1.2.3 From 8b8aa4b5a66ecf90f0a7033c8cbc49cfd97c4347 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Sun, 20 Nov 2005 12:18:17 +0900 Subject: [IPV6]: Fix memory management error during setting up new advapi sockopts. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/exthdrs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 922549581ab..748577b76d7 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -628,6 +628,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, if (!tot_len) return NULL; + tot_len += sizeof(*opt2); opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC); if (!opt2) return ERR_PTR(-ENOBUFS); @@ -668,7 +669,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, return opt2; out: - sock_kfree_s(sk, p, tot_len); + sock_kfree_s(sk, opt2, opt2->tot_len); return ERR_PTR(err); } -- cgit v1.2.3 From a305989386e402f48b216786a5c8cf440b33bdad Mon Sep 17 00:00:00 2001 From: Ville Nuorvala Date: Sun, 20 Nov 2005 12:21:59 +0900 Subject: [IPV6]: Fix calculation of AH length during filling ancillary data. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index cc518405b3e..c4a3a993acb 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -437,7 +437,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) break; case IPPROTO_AH: nexthdr = ptr[0]; - len = (ptr[1] + 1) << 2; + len = (ptr[1] + 2) << 2; break; default: nexthdr = ptr[0]; -- cgit v1.2.3 From df9890c31a1a447254f39e40c3fd81ad6547945b Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Sun, 20 Nov 2005 12:23:18 +0900 Subject: [IPV6]: Fix sending extension headers before and including routing header. Based on suggestion from Masahide Nakamura . Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/exthdrs.c | 19 +++++++++++++++++++ net/ipv6/ip6_flowlabel.c | 16 ++++++---------- net/ipv6/raw.c | 4 +++- net/ipv6/udp.c | 4 +++- 4 files changed, 31 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 748577b76d7..be6faf31138 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -673,3 +673,22 @@ out: return ERR_PTR(err); } +struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, + struct ipv6_txoptions *opt) +{ + /* + * ignore the dest before srcrt unless srcrt is being included. + * --yoshfuji + */ + if (opt && opt->dst0opt && !opt->srcrt) { + if (opt_space != opt) { + memcpy(opt_space, opt, sizeof(*opt_space)); + opt = opt_space; + } + opt->opt_nflen -= ipv6_optlen(opt->dst0opt); + opt->dst0opt = NULL; + } + + return opt; +} + diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index bbbe80cdaf7..1cf02765fb5 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -225,20 +225,16 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, struct ip6_flowlabel * fl, struct ipv6_txoptions * fopt) { - struct ipv6_txoptions * fl_opt = fl ? 
fl->opt : NULL; - - if (fopt == NULL || fopt->opt_flen == 0) { - if (!fl_opt || !fl_opt->dst0opt || fl_opt->srcrt) - return fl_opt; - } - + struct ipv6_txoptions * fl_opt = fl->opt; + + if (fopt == NULL || fopt->opt_flen == 0) + return fl_opt; + if (fl_opt != NULL) { opt_space->hopopt = fl_opt->hopopt; - opt_space->dst0opt = fl_opt->srcrt ? fl_opt->dst0opt : NULL; + opt_space->dst0opt = fl_opt->dst0opt; opt_space->srcrt = fl_opt->srcrt; opt_space->opt_nflen = fl_opt->opt_nflen; - if (fl_opt->dst0opt && !fl_opt->srcrt) - opt_space->opt_nflen -= ipv6_optlen(fl_opt->dst0opt); } else { if (fopt->opt_nflen == 0) return fopt; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a1265a320b1..d77d3352c96 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -756,7 +756,9 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, } if (opt == NULL) opt = np->opt; - opt = fl6_merge_options(&opt_space, flowlabel, opt); + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + opt = ipv6_fixup_options(&opt_space, opt); fl.proto = proto; rawv6_probe_proto_opt(&fl, msg); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bf9519341fd..e2b87cc68b7 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -778,7 +778,9 @@ do_udp_sendmsg: } if (opt == NULL) opt = np->opt; - opt = fl6_merge_options(&opt_space, flowlabel, opt); + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + opt = ipv6_fixup_options(&opt_space, opt); fl->proto = IPPROTO_UDP; ipv6_addr_copy(&fl->fl6_dst, daddr); -- cgit v1.2.3 From aa8751667dcd757dd9a711b51140adf181501c44 Mon Sep 17 00:00:00 2001 From: Andrea Bittau Date: Sun, 20 Nov 2005 13:41:05 -0800 Subject: [PKT_SCHED]: sch_netem: correctly order packets to be sent simultaneously If two packets were queued to be sent at the same time in the future, their order would be reversed. This would occur because the queue is traversed back to front, and a position is found by checking whether the new packet needs to be sent before the packet being examined. If the new packet is to be sent at the same time of a previous packet, it would end up before the old packet in the queue. This patch places packets in the correct order when they are queued to be sent at a same time in the future. Signed-off-by: Andrea Bittau Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index cdc8d283791..82fb07aa06a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -464,7 +464,7 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) const struct netem_skb_cb *cb = (const struct netem_skb_cb *)skb->cb; - if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send)) + if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send)) break; } -- cgit v1.2.3 From fb0d366b0803571f06a5b838f02c6706fc287995 Mon Sep 17 00:00:00 2001 From: Kris Katterjohn Date: Sun, 20 Nov 2005 13:41:34 -0800 Subject: [NET]: Reject socket filter if division by constant zero is attempted. This way we don't have to check it in sk_run_filter(). Signed-off-by: Kris Katterjohn Signed-off-by: David S. 
Miller --- net/core/filter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 079c2edff78..2841bfce29d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -116,8 +116,6 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) A /= X; continue; case BPF_ALU|BPF_DIV|BPF_K: - if (fentry->k == 0) - return 0; A /= fentry->k; continue; case BPF_ALU|BPF_AND|BPF_X: @@ -320,6 +318,10 @@ int sk_chk_filter(struct sock_filter *filter, int flen) } } + /* check for division by zero -Kris Katterjohn 2005-10-30 */ + if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0) + return -EINVAL; + /* check that memory operations use valid addresses. */ if (ftest->k >= BPF_MEMWORDS) { /* but it might not be a memory operation... */ -- cgit v1.2.3 From 5d5780df23537ad0db72267fe11103d2c23d9b2a Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Sun, 20 Nov 2005 13:42:20 -0800 Subject: [IPV6]: Acquire addrconf_hash_lock for read in addrconf_verify(...) addrconf_verify(...) only traverse address hash table when addrconf_hash_lock is held for writing, and it may hold addrconf_hash_lock for a long time. So I think it's better to acquire addrconf_hash_lock for reading instead of writing Signed-off-by: Yan Zheng Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 56a09a4ac41..a16064ba0ca 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2627,7 +2627,7 @@ static void addrconf_verify(unsigned long foo) for (i=0; i < IN6_ADDR_HSIZE; i++) { restart: - write_lock(&addrconf_hash_lock); + read_lock(&addrconf_hash_lock); for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) { unsigned long age; #ifdef CONFIG_IPV6_PRIVACY @@ -2649,7 +2649,7 @@ restart: if (age >= ifp->valid_lft) { spin_unlock(&ifp->lock); in6_ifa_hold(ifp); - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); ipv6_del_addr(ifp); goto restart; } else if (age >= ifp->prefered_lft) { @@ -2668,7 +2668,7 @@ restart: if (deprecate) { in6_ifa_hold(ifp); - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); @@ -2686,7 +2686,7 @@ restart: in6_ifa_hold(ifp); in6_ifa_hold(ifpub); spin_unlock(&ifp->lock); - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); ipv6_create_tempaddr(ifpub, ifp); in6_ifa_put(ifpub); in6_ifa_put(ifp); @@ -2703,7 +2703,7 @@ restart: spin_unlock(&ifp->lock); } } - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); } addr_chk_timer.expires = time_before(next, jiffies + HZ) ? jiffies + HZ : next; -- cgit v1.2.3 From c9e53cbe7ad6eabb3c7c5140b6127b4e5f9ee840 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 20 Nov 2005 21:09:00 -0800 Subject: [FIB_TRIE]: Don't show local table in /proc/net/route output Don't show local table to behave similar to fib_hash. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/fib_trie.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 66247f38b37..705e3ce86df 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2378,6 +2378,7 @@ static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi) */ static int fib_route_seq_show(struct seq_file *seq, void *v) { + const struct fib_trie_iter *iter = seq->private; struct leaf *l = v; int i; char bf[128]; @@ -2389,6 +2390,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } + if (iter->trie == trie_local) + return 0; if (IS_TNODE(l)) return 0; -- cgit v1.2.3 From 2b8f2ff6f4c11fff9c3016b54fa261f522a54b70 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Sun, 20 Nov 2005 21:09:55 -0800 Subject: [NETFILTER]: fixed dependencies between modules related with ip_conntrack - IP_NF_CONNTRACK_MARK is bool and depends on only IP_NF_CONNTRACK which is tristate. If a variable depends on IP_NF_CONNTRACK_MARK and doesn't care about IP_NF_CONNTRACK, it can be y. This must be avoided. - IP_NF_CT_ACCT has same problem. - IP_NF_TARGET_CLUSTERIP also depends on IP_NF_MANGLE. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 9d3c8b5f327..0bc00528d88 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -440,7 +440,7 @@ config IP_NF_MATCH_COMMENT config IP_NF_MATCH_CONNMARK tristate 'Connection mark match support' depends on IP_NF_IPTABLES - depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `connmark' match, which allows you to match the connection mark value previously set for the session by `CONNMARK'. @@ -452,7 +452,7 @@ config IP_NF_MATCH_CONNMARK config IP_NF_MATCH_CONNBYTES tristate 'Connection byte/packet counter match support' depends on IP_NF_IPTABLES - depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4) + depends on (IP_NF_CONNTRACK && IP_NF_CT_ACCT) || (NF_CT_ACCT && NF_CONNTRACK_IPV4) help This option adds a `connbytes' match, which allows you to match the number of bytes and/or packets for each direction within a connection. @@ -767,7 +767,7 @@ config IP_NF_TARGET_TTL config IP_NF_TARGET_CONNMARK tristate 'CONNMARK target support' depends on IP_NF_MANGLE - depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `CONNMARK' target, which allows one to manipulate the connection mark value. 
Similar to the MARK target, but @@ -779,8 +779,8 @@ config IP_NF_TARGET_CONNMARK config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_IPTABLES && EXPERIMENTAL - depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) + depends on IP_NF_MANGLE && EXPERIMENTAL + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing -- cgit v1.2.3 From a516b04950f62d36b33db00c0d3c69d09dae99a9 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 20 Nov 2005 21:16:13 -0800 Subject: [DCCP]: Add missing no_policy flag to struct net_protocol Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/dccp/proto.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dccp/proto.c b/net/dccp/proto.c index e0ace7cbb99..8a6b2a9e458 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -46,6 +46,7 @@ atomic_t dccp_orphan_count = ATOMIC_INIT(0); static struct net_protocol dccp_protocol = { .handler = dccp_v4_rcv, .err_handler = dccp_v4_err, + .no_policy = 1, }; const char *dccp_packet_name(const int type) -- cgit v1.2.3 From c27bd492fd84c590767a3c0f9f74e637b17af138 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Nov 2005 14:41:50 -0800 Subject: [NETLINK]: Use tgid instead of pid for nlmsg_pid Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 8c38ee6d255..96020d7087e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -476,7 +476,7 @@ static int netlink_autobind(struct socket *sock) struct hlist_head *head; struct sock *osk; struct hlist_node *node; - s32 pid = current->pid; + s32 pid = current->tgid; int err; static s32 rover = -4097; -- cgit v1.2.3 From 0ff60a45678e67b2547256a636fd00c1667ce4fa Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Tue, 22 Nov 2005 14:47:37 -0800 Subject: [IPV4]: Fix secondary IP addresses after promotion This patch fixes the problem with promoting aliases when: a) a single primary and > 1 secondary addresses b) multiple primary addresses each with at least one secondary address Based on earlier efforts from Brian Pomerantz , Patrick McHardy and Thomas Graf Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/ipv4/devinet.c | 40 ++++++++++++++++++++++++++++++---------- net/ipv4/fib_frontend.c | 2 +- 2 files changed, 31 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 4ec4b2ca6ab..04a6fe3e95a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -234,7 +234,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) { struct in_ifaddr *promote = NULL; - struct in_ifaddr *ifa1 = *ifap; + struct in_ifaddr *ifa, *ifa1 = *ifap; + struct in_ifaddr *last_prim = in_dev->ifa_list; + struct in_ifaddr *prev_prom = NULL; + int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); @@ -243,18 +246,22 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { - struct in_ifaddr *ifa; struct in_ifaddr **ifap1 = &ifa1->ifa_next; while ((ifa = *ifap1) != NULL) { + if (!(ifa->ifa_flags & IFA_F_SECONDARY) && + ifa1->ifa_scope <= ifa->ifa_scope) + last_prim = ifa; + if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; + prev_prom = ifa; continue; } - if (!IN_DEV_PROMOTE_SECONDARIES(in_dev)) { + if (!do_promote) { *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa); @@ -283,18 +290,31 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, */ rtmsg_ifa(RTM_DELADDR, ifa1); notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); - if (destroy) { - inet_free_ifa(ifa1); - if (!in_dev->ifa_list) - inetdev_destroy(in_dev); - } + if (promote) { + + if (prev_prom) { + prev_prom->ifa_next = promote->ifa_next; + promote->ifa_next = last_prim->ifa_next; + last_prim->ifa_next = promote; + } - if (promote && IN_DEV_PROMOTE_SECONDARIES(in_dev)) { - /* not sure if we should send a delete notify first? */ promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote); notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); + for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { + if (ifa1->ifa_mask != ifa->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, ifa)) + continue; + fib_add_ifaddr(ifa); + } + + } + if (destroy) { + inet_free_ifa(ifa1); + + if (!in_dev->ifa_list) + inetdev_destroy(in_dev); } } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 2267c1fad87..882f88f6d13 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -407,7 +407,7 @@ static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); } -static void fib_add_ifaddr(struct in_ifaddr *ifa) +void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; -- cgit v1.2.3 From 00cb277a4a1fb76aafb2fb28aa99f30546e619c5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 22 Nov 2005 14:54:34 -0800 Subject: [NETFILTER] ctnetlink: Fix refcount leak ip_conntrack/nat_proto Remove proto == NULL checking since ip_conntrack_[nat_]proto_find_get always returns a valid pointer. Fix missing ip_conntrack_proto_put in some paths. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index de9f4464438..6c18a2b6d5c 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -59,11 +59,13 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + /* If no protocol helper is found, this function will return the + * generic protocol helper, so proto won't *ever* be NULL */ proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - if (likely(proto && proto->tuple_to_nfattr)) { + if (likely(proto->tuple_to_nfattr)) ret = proto->tuple_to_nfattr(skb, tuple); - ip_conntrack_proto_put(proto); - } + + ip_conntrack_proto_put(proto); return ret; @@ -128,9 +130,11 @@ ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) struct nfattr *nest_proto; int ret; - - if (!proto || !proto->to_nfattr) + + if (!proto->to_nfattr) { + ip_conntrack_proto_put(proto); return 0; + } nest_proto = NFA_NEST(skb, CTA_PROTOINFO); @@ -527,10 +531,10 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - if (likely(proto && proto->nfattr_to_tuple)) { + if (likely(proto->nfattr_to_tuple)) ret = proto->nfattr_to_tuple(tb, tuple); - ip_conntrack_proto_put(proto); - } + + ip_conntrack_proto_put(proto); return ret; } @@ -596,8 +600,6 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, return -EINVAL; npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - if (!npt) - return 0; if (!npt->nfattr_to_range) { ip_nat_proto_put(npt); @@ -957,8 +959,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); proto = ip_conntrack_proto_find_get(npt); - if (!proto) - return -EINVAL; if (proto->from_nfattr) err = proto->from_nfattr(tb, ct); -- cgit v1.2.3 From de919820cf7fe6674cdf47f8f47d2af284e4309f Mon Sep 17 00:00:00 2001 From: Benoit Boissinot Date: Wed, 23 Nov 2005 19:03:46 -0800 Subject: [NETFILTER]: ip_conntrack_netlink.c needs linux/interrupt.h net/ipv4/netfilter/ip_conntrack_netlink.c: In function 'ctnetlink_dump_table': net/ipv4/netfilter/ip_conntrack_netlink.c:409: warning: implicit declaration of function 'local_bh_disable' net/ipv4/netfilter/ip_conntrack_netlink.c:427: warning: implicit declaration of function 'local_bh_enable' Signed-off-by: Benoit Boissinot Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 6c18a2b6d5c..3fce91bcc0b 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 133747e8d1e912863edfb3869e36b97b9939d4fc Mon Sep 17 00:00:00 2001 From: Olaf Rempel Date: Wed, 23 Nov 2005 19:04:08 -0800 Subject: [BRIDGE]: recompute features when adding a new device We must recompute bridge features everytime the list of underlying devices changes, or we might end up with features that are not supported by all devices (eg. NETIF_F_TSO) This patch adds the missing recompute when adding a device to the bridge. 
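
A note on why the missing recompute mattered: a bridge can only advertise an offload that every enslaved device supports, so its effective feature set is the intersection (bitwise AND) of the ports' feature masks and has to be re-derived whenever the port list changes. The standalone C sketch below shows that intersection with simplified, hypothetical flags and types; it is not the kernel's br_features_recompute().

/*
 * Illustrative sketch only: how an aggregate device such as a bridge can
 * derive its usable feature set as the bitwise AND of its member ports.
 * The flag values, types and names here are hypothetical; this is not the
 * br_features_recompute() implementation.
 */
#include <stdio.h>

#define F_CSUM	0x1
#define F_SG	0x2
#define F_TSO	0x4

struct port {
	unsigned int features;		/* what this NIC can offload */
};

static unsigned int recompute_features(const struct port *ports, int nports)
{
	unsigned int mask = ~0u;	/* start from "everything supported" */

	for (int i = 0; i < nports; i++)
		mask &= ports[i].features;	/* keep only the common subset */

	return mask;
}

int main(void)
{
	struct port ports[] = {
		{ F_CSUM | F_SG | F_TSO },	/* NIC with TSO */
		{ F_CSUM | F_SG },		/* NIC without TSO */
	};

	/* TSO must drop out as soon as the second port is enslaved */
	printf("bridge features: %#x\n", recompute_features(ports, 2));
	return 0;
}

Starting from an all-ones mask and AND-ing in each port means a single TSO-less NIC removes TSO from the whole bridge, which is why the recompute has to run when a port is added and not only when one is removed.
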
Signed-off-by: Olaf Rempel Signed-off-by: David S. Miller --- net/bridge/br_if.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index defcf6a8607..975abe254b7 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -366,6 +366,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) spin_lock_bh(&br->lock); br_stp_recalculate_bridge_id(br); + br_features_recompute(br); if ((br->dev->flags & IFF_UP) && (dev->flags & IFF_UP) && netif_carrier_ok(dev)) br_stp_enable_port(p); -- cgit v1.2.3 From b3eb67a2ab4e14fc6cc035907400b86462d174db Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 25 Nov 2005 17:10:11 -0500 Subject: SUNRPC: Funny looking code in __rpc_purge_upcall In __rpc_purge_upcall (net/sunrpc/rpc_pipe.c), the newer code to clean up the in_upcall list has a typo. Thanks to Vince Busam for spotting this! Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 81e00a6c19d..e3b242daf53 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -39,23 +39,27 @@ static kmem_cache_t *rpc_inode_cachep __read_mostly; #define RPC_UPCALL_TIMEOUT (30*HZ) static void -__rpc_purge_upcall(struct inode *inode, int err) +__rpc_purge_list(struct rpc_inode *rpci, struct list_head *head, int err) { - struct rpc_inode *rpci = RPC_I(inode); struct rpc_pipe_msg *msg; + void (*destroy_msg)(struct rpc_pipe_msg *); - while (!list_empty(&rpci->pipe)) { - msg = list_entry(rpci->pipe.next, struct rpc_pipe_msg, list); + destroy_msg = rpci->ops->destroy_msg; + while (!list_empty(head)) { + msg = list_entry(head->next, struct rpc_pipe_msg, list); list_del_init(&msg->list); msg->errno = err; - rpci->ops->destroy_msg(msg); - } - while (!list_empty(&rpci->in_upcall)) { - msg = list_entry(rpci->pipe.next, struct rpc_pipe_msg, list); - list_del_init(&msg->list); - msg->errno = err; - rpci->ops->destroy_msg(msg); + destroy_msg(msg); } +} + +static void +__rpc_purge_upcall(struct inode *inode, int err) +{ + struct rpc_inode *rpci = RPC_I(inode); + + __rpc_purge_list(rpci, &rpci->pipe, err); + __rpc_purge_list(rpci, &rpci->in_upcall, err); rpci->pipelen = 0; wake_up(&rpci->waitq); } -- cgit v1.2.3 From 220bbd748335f73aafb472a97716762a42cb0d58 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Mon, 28 Nov 2005 22:27:11 -0800 Subject: [IPV6]: Implement appropriate dummy rule 4 in ipv6_dev_get_saddr(). Ensure to update hiscore.rule in dummy rule 4 in ipv6_dev_get_saddr(). Pointed out by Yan Zheng . Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a16064ba0ca..76ff9f4fe89 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -985,6 +985,8 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, } /* Rule 4: Prefer home address -- not implemented yet */ + if (hiscore.rule < 4) + hiscore.rule++; /* Rule 5: Prefer outgoing interface */ if (hiscore.rule < 5) { -- cgit v1.2.3 From 18955cfcb2a5d75a08e0cb297f13ccfb6904de48 Mon Sep 17 00:00:00 2001 From: Mike Stroyan Date: Tue, 29 Nov 2005 16:12:55 -0800 Subject: [IPV4] tcp/route: Another look at hash table sizes The tcp_ehash hash table gets too big on systems with really big memory. It is worse on systems with pages larger than 4KB. 
It wastes memory that could be better used. It also makes the netstat command slow because reading /proc/net/tcp and /proc/net/tcp6 needs to go through the full hash table. The default value should not be larger for larger page sizes. It seems that the effect of page size is an unintended error dating back a long time. I also wonder if the default value really should be a larger fraction of memory for systems with more memory. While systems with really big ram can afford more space for hash tables, it is not clear to me that they benefit from increasing the allocation ratio for this table. The amount of memory allocated is determined by net/ipv4/tcp.c:tcp_init and mm/page_alloc.c:alloc_large_system_hash. tcp_init calls alloc_large_system_hash passing parameters- bucketsize=sizeof(struct tcp_ehash_bucket) numentries=thash_entries scale=(num_physpages >= 128 * 1024) ? (25-PAGE_SHIFT) : (27-PAGE_SHIFT) limit=0 On i386, PAGE_SHIFT is 12 for a page size of 4K On ia64, PAGE_SHIFT defaults to 14 for a page size of 16K The num_physpages test above makes the allocation take a larger fraction of the total memory on systems with larger memory. The threshold size for a i386 system is 512MB. For an ia64 system with 16KB pages the threshold is 2GB. For smaller memory systems- On i386, scale = (27 - 12) = 15 On ia64, scale = (27 - 14) = 13 For larger memory systems- On i386, scale = (25 - 12) = 13 On ia64, scale = (25 - 14) = 11 For the rest of this discussion, I'll just track the larger memory case. The default behavior has numentries=thash_entries=0, so the allocated size is determined by either scale or by the default limit of 1/16 of total memory. In alloc_large_system_hash- | numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | numentries >>= 20 - PAGE_SHIFT; | numentries <<= 20 - PAGE_SHIFT; At this point, numentries is pages for all of memory, rounded up to the nearest megabyte boundary. | /* limit to 1 bucket per 2^scale bytes of low memory */ | if (scale > PAGE_SHIFT) | numentries >>= (scale - PAGE_SHIFT); | else | numentries <<= (PAGE_SHIFT - scale); On i386, numentries >>= (13 - 12), so numentries is 1/8196 of bytes of total memory. On ia64, numentries <<= (14 - 11), so numentries is 1/2048 of bytes of total memory. | log2qty = long_log2(numentries); | | do { | size = bucketsize << log2qty; bucketsize is 16, so size is 16 times numentries, rounded down to a power of two. On i386, size is 1/512 of bytes of total memory. On ia64, size is 1/128 of bytes of total memory. For smaller systems the results are On i386, size is 1/2048 of bytes of total memory. On ia64, size is 1/512 of bytes of total memory. The large page effect can be removed by just replacing the use of PAGE_SHIFT with a constant of 12 in the calls to alloc_large_system_hash. That makes them more like the other uses of that function from fs/inode.c and fs/dcache.c Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 +-- net/ipv4/tcp.c | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 381dd6a6aeb..e9c14f4a2eb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3149,8 +3149,7 @@ int __init ip_rt_init(void) sizeof(struct rt_hash_bucket), rhash_entries, (num_physpages >= 128 * 1024) ? 
- (27 - PAGE_SHIFT) : - (29 - PAGE_SHIFT), + 15 : 17, HASH_HIGHMEM, &rt_hash_log, &rt_hash_mask, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9ac7a4f46bd..5e6bc4b3287 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2065,8 +2065,7 @@ void __init tcp_init(void) sizeof(struct inet_ehash_bucket), thash_entries, (num_physpages >= 128 * 1024) ? - (25 - PAGE_SHIFT) : - (27 - PAGE_SHIFT), + 13 : 15, HASH_HIGHMEM, &tcp_hashinfo.ehash_size, NULL, @@ -2082,8 +2081,7 @@ void __init tcp_init(void) sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_size, (num_physpages >= 128 * 1024) ? - (25 - PAGE_SHIFT) : - (27 - PAGE_SHIFT), + 13 : 15, HASH_HIGHMEM, &tcp_hashinfo.bhash_size, NULL, -- cgit v1.2.3 From c9933d0856d6d0ede6b4b30e5e7330614f5203af Mon Sep 17 00:00:00 2001 From: Mitchell Blank Jr Date: Tue, 29 Nov 2005 16:13:32 -0800 Subject: [ATM]: always return the first interface for ATM_ITF_ANY From: Mitchell Blank Jr Signed-off-by: Chas Williams Signed-off-by: David S. Miller --- net/atm/common.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/atm/common.c b/net/atm/common.c index 63feea49fb1..83454e12317 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -423,33 +423,23 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci) if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS || vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) return -EINVAL; - if (itf != ATM_ITF_ANY) { + if (likely(itf != ATM_ITF_ANY)) { dev = atm_dev_lookup(itf); - if (!dev) - return -ENODEV; - error = __vcc_connect(vcc, dev, vpi, vci); - if (error) { - atm_dev_put(dev); - return error; - } } else { - struct list_head *p, *next; - dev = NULL; spin_lock(&atm_dev_lock); - list_for_each_safe(p, next, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); + if (!list_empty(&atm_devs)) { + dev = list_entry(atm_devs.next, struct atm_dev, dev_list); atm_dev_hold(dev); - spin_unlock(&atm_dev_lock); - if (!__vcc_connect(vcc, dev, vpi, vci)) - break; - atm_dev_put(dev); - dev = NULL; - spin_lock(&atm_dev_lock); } spin_unlock(&atm_dev_lock); - if (!dev) - return -ENODEV; + } + if (!dev) + return -ENODEV; + error = __vcc_connect(vcc, dev, vpi, vci); + if (error) { + atm_dev_put(dev); + return error; } if (vpi == ATM_VPI_UNSPEC || vci == ATM_VCI_UNSPEC) set_bit(ATM_VF_PARTIAL,&vcc->flags); -- cgit v1.2.3 From c219750b2e667f4f79f4d8faca5057dad793db87 Mon Sep 17 00:00:00 2001 From: Mitchell Blank Jr Date: Tue, 29 Nov 2005 16:13:55 -0800 Subject: [ATM]: atm_pcr_goal() doesn't modify its argument's contents -- mark it as const Signed-off-by: Mitchell Blank Jr Signed-off-by: Chas Williams Signed-off-by: David S. 
Miller --- net/atm/atm_misc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c index 223c7ad5bd0..02cc7e71efe 100644 --- a/net/atm/atm_misc.c +++ b/net/atm/atm_misc.c @@ -74,11 +74,14 @@ struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size, */ -int atm_pcr_goal(struct atm_trafprm *tp) +int atm_pcr_goal(const struct atm_trafprm *tp) { - if (tp->pcr && tp->pcr != ATM_MAX_PCR) return -tp->pcr; - if (tp->min_pcr && !tp->pcr) return tp->min_pcr; - if (tp->max_pcr != ATM_MAX_PCR) return -tp->max_pcr; + if (tp->pcr && tp->pcr != ATM_MAX_PCR) + return -tp->pcr; + if (tp->min_pcr && !tp->pcr) + return tp->min_pcr; + if (tp->max_pcr != ATM_MAX_PCR) + return -tp->max_pcr; return 0; } -- cgit v1.2.3 From 50accc9c428273501dd2a6295c84a533dd1fe645 Mon Sep 17 00:00:00 2001 From: Mitchell Blank Jr Date: Tue, 29 Nov 2005 16:15:18 -0800 Subject: [ATM]: attempt to autoload atm drivers From: Mitchell Blank Jr Signed-off-by: Chas Williams Signed-off-by: David S. Miller --- net/atm/common.c | 2 +- net/atm/resources.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/atm/common.c b/net/atm/common.c index 83454e12317..db9318fc603 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -424,7 +424,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci) vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) return -EINVAL; if (likely(itf != ATM_ITF_ANY)) { - dev = atm_dev_lookup(itf); + dev = try_then_request_module(atm_dev_lookup(itf), "atm-device-%d", itf); } else { dev = NULL; spin_lock(&atm_dev_lock); diff --git a/net/atm/resources.c b/net/atm/resources.c index 415d2615d47..35f3ceb7686 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -245,7 +245,8 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg) if (get_user(number, &sioc->number)) return -EFAULT; - if (!(dev = atm_dev_lookup(number))) + if (!(dev = try_then_request_module(atm_dev_lookup(number), + "atm-device-%d", number))) return -ENODEV; switch (cmd) { -- cgit v1.2.3 From aaaaaadbe7a663d110814db50fcbe7d320eb4c32 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 29 Nov 2005 16:16:21 -0800 Subject: [ATM]: avoid race conditions related to atm_devs list Use semaphore to protect atm_devs list, as no one need access to it from interrupt context. Avoid race conditions between atm_dev_register(), atm_dev_lookup() and atm_dev_deregister(). Fix double spin_unlock() bug. Signed-off-by: Stanislaw Gruszka Signed-off-by: Chas Williams Signed-off-by: David S. 
Miller --- net/atm/common.c | 4 ++-- net/atm/resources.c | 36 +++++++++++++++++------------------- net/atm/resources.h | 3 +-- 3 files changed, 20 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/atm/common.c b/net/atm/common.c index db9318fc603..9e016f404e1 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -427,12 +427,12 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci) dev = try_then_request_module(atm_dev_lookup(itf), "atm-device-%d", itf); } else { dev = NULL; - spin_lock(&atm_dev_lock); + down(&atm_dev_mutex); if (!list_empty(&atm_devs)) { dev = list_entry(atm_devs.next, struct atm_dev, dev_list); atm_dev_hold(dev); } - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); } if (!dev) return -ENODEV; diff --git a/net/atm/resources.c b/net/atm/resources.c index 35f3ceb7686..ad533b02b84 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -25,7 +25,7 @@ LIST_HEAD(atm_devs); -DEFINE_SPINLOCK(atm_dev_lock); +DECLARE_MUTEX(atm_dev_mutex); static struct atm_dev *__alloc_atm_dev(const char *type) { @@ -52,7 +52,7 @@ static struct atm_dev *__atm_dev_lookup(int number) list_for_each(p, &atm_devs) { dev = list_entry(p, struct atm_dev, dev_list); - if ((dev->ops) && (dev->number == number)) { + if (dev->number == number) { atm_dev_hold(dev); return dev; } @@ -64,9 +64,9 @@ struct atm_dev *atm_dev_lookup(int number) { struct atm_dev *dev; - spin_lock(&atm_dev_lock); + down(&atm_dev_mutex); dev = __atm_dev_lookup(number); - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); return dev; } @@ -81,11 +81,11 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, type); return NULL; } - spin_lock(&atm_dev_lock); + down(&atm_dev_mutex); if (number != -1) { if ((inuse = __atm_dev_lookup(number))) { atm_dev_put(inuse); - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); kfree(dev); return NULL; } @@ -105,19 +105,17 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, memset(&dev->flags, 0, sizeof(dev->flags)); memset(&dev->stats, 0, sizeof(dev->stats)); atomic_set(&dev->refcnt, 1); - list_add_tail(&dev->dev_list, &atm_devs); - spin_unlock(&atm_dev_lock); if (atm_proc_dev_register(dev) < 0) { printk(KERN_ERR "atm_dev_register: " "atm_proc_dev_register failed for dev %s\n", type); - spin_lock(&atm_dev_lock); - list_del(&dev->dev_list); - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); kfree(dev); return NULL; } + list_add_tail(&dev->dev_list, &atm_devs); + up(&atm_dev_mutex); return dev; } @@ -129,9 +127,9 @@ void atm_dev_deregister(struct atm_dev *dev) atm_proc_dev_deregister(dev); - spin_lock(&atm_dev_lock); + down(&atm_dev_mutex); list_del(&dev->dev_list); - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); warning_time = jiffies; while (atomic_read(&dev->refcnt) != 1) { @@ -211,16 +209,16 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg) return -EFAULT; if (get_user(len, &iobuf->length)) return -EFAULT; - spin_lock(&atm_dev_lock); + down(&atm_dev_mutex); list_for_each(p, &atm_devs) size += sizeof(int); if (size > len) { - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); return -E2BIG; } tmp_buf = kmalloc(size, GFP_ATOMIC); if (!tmp_buf) { - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); return -ENOMEM; } tmp_p = tmp_buf; @@ -228,7 +226,7 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg) dev = list_entry(p, struct atm_dev, dev_list); *tmp_p++ = dev->number; } - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); error = ((copy_to_user(buf, tmp_buf, size)) || put_user(size, 
&iobuf->length)) ? -EFAULT : 0; @@ -415,13 +413,13 @@ static __inline__ void *dev_get_idx(loff_t left) void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos) { - spin_lock(&atm_dev_lock); + down(&atm_dev_mutex); return *pos ? dev_get_idx(*pos) : (void *) 1; } void atm_dev_seq_stop(struct seq_file *seq, void *v) { - spin_unlock(&atm_dev_lock); + up(&atm_dev_mutex); } void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) diff --git a/net/atm/resources.h b/net/atm/resources.h index 12910619dbb..b7fb82a93b4 100644 --- a/net/atm/resources.h +++ b/net/atm/resources.h @@ -11,8 +11,7 @@ extern struct list_head atm_devs; -extern spinlock_t atm_dev_lock; - +extern struct semaphore atm_dev_mutex; int atm_dev_ioctl(unsigned int cmd, void __user *arg); -- cgit v1.2.3 From 64bf69ddff7637b7ed7acf9b2a823cc0ee519439 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 29 Nov 2005 16:16:41 -0800 Subject: [ATM]: deregistration removes device from atm_devs list immediately atm_dev_deregister() removes device from atm_dev list immediately to prevent operations on a phantom device. Decision to free device based only on ->refcnt now. Remove shutdown_atm_dev() use atm_dev_deregister() instead. atm_dev_deregister() also asynchronously releases all vccs related to device. Signed-off-by: Stanislaw Gruszka Signed-off-by: Chas Williams Signed-off-by: David S. Miller --- net/atm/common.c | 30 +++++++++++++++++++++++++++--- net/atm/common.h | 2 ++ net/atm/resources.c | 39 ++++++++++++--------------------------- 3 files changed, 41 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/atm/common.c b/net/atm/common.c index 9e016f404e1..6656b111cc0 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -221,6 +221,29 @@ void vcc_release_async(struct atm_vcc *vcc, int reply) EXPORT_SYMBOL(vcc_release_async); +void atm_dev_release_vccs(struct atm_dev *dev) +{ + int i; + + write_lock_irq(&vcc_sklist_lock); + for (i = 0; i < VCC_HTABLE_SIZE; i++) { + struct hlist_head *head = &vcc_hash[i]; + struct hlist_node *node, *tmp; + struct sock *s; + struct atm_vcc *vcc; + + sk_for_each_safe(s, node, tmp, head) { + vcc = atm_sk(s); + if (vcc->dev == dev) { + vcc_release_async(vcc, -EPIPE); + sk_del_node_init(s); + } + } + } + write_unlock_irq(&vcc_sklist_lock); +} + + static int adjust_tp(struct atm_trafprm *tp,unsigned char aal) { int max_sdu; @@ -332,12 +355,13 @@ static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi, return -EINVAL; if (vci > 0 && vci < ATM_NOT_RSV_VCI && !capable(CAP_NET_BIND_SERVICE)) return -EPERM; - error = 0; + error = -ENODEV; if (!try_module_get(dev->ops->owner)) - return -ENODEV; + return error; vcc->dev = dev; write_lock_irq(&vcc_sklist_lock); - if ((error = find_ci(vcc, &vpi, &vci))) { + if (test_bit(ATM_DF_REMOVED, &dev->flags) || + (error = find_ci(vcc, &vpi, &vci))) { write_unlock_irq(&vcc_sklist_lock); goto fail_module_put; } diff --git a/net/atm/common.h b/net/atm/common.h index e49ed41c0e3..4887c317cef 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -47,4 +47,6 @@ static inline void atm_proc_exit(void) /* SVC */ int svc_change_qos(struct atm_vcc *vcc,struct atm_qos *qos); +void atm_dev_release_vccs(struct atm_dev *dev); + #endif diff --git a/net/atm/resources.c b/net/atm/resources.c index ad533b02b84..c8c459fcb03 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -70,6 +70,7 @@ struct atm_dev *atm_dev_lookup(int number) return dev; } + struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, int 
number, unsigned long *flags) { @@ -123,37 +124,22 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, void atm_dev_deregister(struct atm_dev *dev) { - unsigned long warning_time; - - atm_proc_dev_deregister(dev); - + BUG_ON(test_bit(ATM_DF_REMOVED, &dev->flags)); + set_bit(ATM_DF_REMOVED, &dev->flags); + + /* + * if we remove current device from atm_devs list, new device + * with same number can appear, such we need deregister proc, + * release async all vccs and remove them from vccs list too + */ down(&atm_dev_mutex); list_del(&dev->dev_list); up(&atm_dev_mutex); - warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 1) { - msleep(250); - if ((jiffies - warning_time) > 10 * HZ) { - printk(KERN_EMERG "atm_dev_deregister: waiting for " - "dev %d to become free. Usage count = %d\n", - dev->number, atomic_read(&dev->refcnt)); - warning_time = jiffies; - } - } - - kfree(dev); -} + atm_dev_release_vccs(dev); + atm_proc_dev_deregister(dev); -void shutdown_atm_dev(struct atm_dev *dev) -{ - if (atomic_read(&dev->refcnt) > 1) { - set_bit(ATM_DF_CLOSE, &dev->flags); - return; - } - if (dev->ops->dev_close) - dev->ops->dev_close(dev); - atm_dev_deregister(dev); + atm_dev_put(dev); } @@ -433,4 +419,3 @@ void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) EXPORT_SYMBOL(atm_dev_register); EXPORT_SYMBOL(atm_dev_deregister); EXPORT_SYMBOL(atm_dev_lookup); -EXPORT_SYMBOL(shutdown_atm_dev); -- cgit v1.2.3 From 9b5b5cff9a6655dbb6d2e2be365bb95eec3950eb Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 29 Nov 2005 16:21:38 -0800 Subject: [NET]: Add const markers to various variables. the patch below marks various variables const in net/; the goal is to move them to the .rodata section so that they can't false-share cachelines with things that get written to, as well as potentially helping gcc a bit with optimisations. (these were found using a gcc patch to warn about such variables) Signed-off-by: Arjan van de Ven Signed-off-by: David S. 
Miller --- net/ipv4/fib_hash.c | 2 +- net/ipv4/fib_semantics.c | 2 +- net/ipv4/icmp.c | 4 ++-- net/ipv4/ipvs/ip_vs_conn.c | 2 +- net/ipv4/ipvs/ip_vs_ctl.c | 4 ++-- net/ipv4/ipvs/ip_vs_proto_tcp.c | 2 +- net/ipv4/netfilter/ip_conntrack_amanda.c | 2 +- net/ipv4/netfilter/ip_conntrack_ftp.c | 2 +- net/ipv4/netfilter/ip_conntrack_irc.c | 2 +- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 6 +++--- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv4/netfilter/ipt_LOG.c | 2 +- net/ipv4/proc.c | 10 +++++----- net/ipv4/route.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv6/icmp.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- 19 files changed, 29 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 2a8c9afc369..7ea0209cb16 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -975,7 +975,7 @@ static void fib_seq_stop(struct seq_file *seq, void *v) static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi) { - static unsigned type2flags[RTN_MAX + 1] = { + static const unsigned type2flags[RTN_MAX + 1] = { [7] = RTF_REJECT, [8] = RTF_REJECT, }; unsigned flags = type2flags[type]; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 186f20c4a45..6d2a6ac070e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -83,7 +83,7 @@ for (nhsel=0; nhsel < 1; nhsel++) #define endfor_nexthops(fi) } -static struct +static const struct { int error; u8 scope; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e3eceecd049..92e23b2ad4d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -220,7 +220,7 @@ struct icmp_control { short error; /* This ICMP is classed as an error message */ }; -static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; +static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; /* * The ICMP socket(s). This is the most convenient way to flow control @@ -994,7 +994,7 @@ error: /* * This table is the definition of how we handle ICMP. */ -static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { +static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = { .output_entry = ICMP_MIB_OUTECHOREPS, .input_entry = ICMP_MIB_INECHOREPS, diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index f828fa2eb7d..2a3a8c59c65 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -771,7 +771,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp) * The drop rate array needs tuning for real environments. 
* Called from timer bh only => no locking */ - static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; static char todrop_counter[9] = {0}; int i; diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 2d66848e7aa..9bdcf31b760 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1909,7 +1909,7 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) #define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) #define MAX_ARG_LEN SVCDEST_ARG_LEN -static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { +static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, @@ -2180,7 +2180,7 @@ __ip_vs_get_timeouts(struct ip_vs_timeout_user *u) #define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) #define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) -static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { +static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index c19408973c0..0e878fd6215 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -251,7 +251,7 @@ tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) #define TCP_DIR_OUTPUT 4 #define TCP_DIR_INPUT_ONLY 8 -static int tcp_state_off[IP_VS_DIR_LAST] = { +static const int tcp_state_off[IP_VS_DIR_LAST] = { [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index fa3f914117e..e52847fa10f 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -37,7 +37,7 @@ MODULE_LICENSE("GPL"); module_param(master_timeout, int, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); -static char *conns[] = { "DATA ", "MESG ", "INDEX " }; +static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; /* This is slow, but it's simple. 
--RR */ static char *amanda_buffer; diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 59e12b02b22..68b173bcda6 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -55,7 +55,7 @@ static int try_rfc959(const char *, size_t, u_int32_t [], char); static int try_eprt(const char *, size_t, u_int32_t [], char); static int try_epsv_response(const char *, size_t, u_int32_t [], char); -static struct ftp_search { +static const struct ftp_search { enum ip_conntrack_dir dir; const char *pattern; size_t plen; diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 2dea1db1440..d7c40421d0d 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -59,7 +59,7 @@ MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC module_param(dcc_timeout, int, 0400); MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); -static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; +static const char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; #define MINMATCHLEN 5 #if 0 diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index e4d6b268e8c..5f9925db608 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -51,7 +51,7 @@ static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple, const struct ip_conntrack_tuple *orig) { /* Add 1; spaces filled with 0. */ - static u_int8_t invmap[] + static const u_int8_t invmap[] = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, [ICMP_ECHOREPLY] = ICMP_ECHO + 1, [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, @@ -110,7 +110,7 @@ static int icmp_packet(struct ip_conntrack *ct, return NF_ACCEPT; } -static u_int8_t valid_new[] = { +static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, [ICMP_INFO_REQUEST] = 1, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 59a4a0111dd..977fb59d456 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -65,7 +65,7 @@ static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; -static unsigned long * sctp_timeouts[] +static const unsigned long * sctp_timeouts[] = { NULL, /* SCTP_CONNTRACK_NONE */ &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ @@ -118,7 +118,7 @@ cookie echoed to closed. */ /* SCTP conntrack state transitions */ -static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { +static const enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index ee3b7d6c4d2..62598167677 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -99,7 +99,7 @@ unsigned long ip_ct_tcp_timeout_close = 10 SECS; to ~13-30min depending on RTO. 
*/ unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS; -static unsigned long * tcp_timeouts[] +static const unsigned long * tcp_timeouts[] = { NULL, /* TCP_CONNTRACK_NONE */ &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ @@ -170,7 +170,7 @@ enum tcp_bit_set { * if they are invalid * or we do not support the request (simultaneous open) */ -static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { +static const enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ @@ -817,7 +817,7 @@ void ip_conntrack_tcp_update(struct sk_buff *skb, #define TH_CWR 0x80 /* table of valid flag combinations - ECE and CWR are always valid */ -static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 75c27e92f6a..45886c8475e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1892,7 +1892,7 @@ static int ipt_get_matches(char *buffer, char **start, off_t offset, int length) return pos; } -static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] = +static const struct { char *name; get_info_t *get_info; } ipt_proc_entry[] = { { "ip_tables_names", ipt_get_tables }, { "ip_tables_targets", ipt_get_targets }, { "ip_tables_matches", ipt_get_matches }, diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 92ed050fac6..180d6ca04af 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -197,7 +197,7 @@ static void dump_packet(const struct nf_loginfo *info, } case IPPROTO_ICMP: { struct icmphdr _icmph, *ich; - static size_t required_len[NR_ICMP_TYPES+1] + static const size_t required_len[NR_ICMP_TYPES+1] = { [ICMP_ECHOREPLY] = 4, [ICMP_DEST_UNREACH] = 8 + sizeof(struct iphdr), diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index a65e508fbd4..0d7dc668db4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -98,7 +98,7 @@ fold_field(void *mib[], int offt) } /* snmp items */ -static struct snmp_mib snmp4_ipstats_list[] = { +static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES), SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), @@ -119,7 +119,7 @@ static struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_icmp_list[] = { +static const struct snmp_mib snmp4_icmp_list[] = { SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS), SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS), SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS), @@ -149,7 +149,7 @@ static struct snmp_mib snmp4_icmp_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_tcp_list[] = { +static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX), @@ -167,7 +167,7 @@ static struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_udp_list[] = { +static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS), @@ -175,7 +175,7 @@ static struct snmp_mib snmp4_udp_list[] 
= { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_net_list[] = { +static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT), SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV), SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e9c14f4a2eb..f701a136a6a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1371,7 +1371,7 @@ out: kfree_skb(skb); * are needed for AMPRnet AX.25 paths. */ -static unsigned short mtu_plateau[] = +static const unsigned short mtu_plateau[] = {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; static __inline__ unsigned short guess_mtu(unsigned short old_mtu) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5e6bc4b3287..ef98b14ac56 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1413,7 +1413,7 @@ recv_urg: * closed. */ -static unsigned char new_state[16] = { +static const unsigned char new_state[16] = { /* current state: new state: action: */ /* (Invalid) */ TCP_CLOSE, /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 1bdf0fb8bf8..34a332225c1 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -751,7 +751,7 @@ void icmpv6_cleanup(void) inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); } -static struct icmp6_err { +static const struct icmp6_err { int err; int fatal; } tab_unreach[] = { diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7d492226c16..95d469271c4 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1972,7 +1972,7 @@ static int ip6t_get_matches(char *buffer, char **start, off_t offset, int length return pos; } -static struct { char *name; get_info_t *get_info; } ip6t_proc_entry[] = +static const struct { char *name; get_info_t *get_info; } ip6t_proc_entry[] = { { "ip6_tables_names", ip6t_get_tables }, { "ip6_tables_targets", ip6t_get_targets }, { "ip6_tables_matches", ip6t_get_matches }, -- cgit v1.2.3 From 4b30b1c6a3e58dc74f2dbb0aa39f16a23cfcdd56 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 29 Nov 2005 16:27:20 -0800 Subject: [IPV4]: make two functions static This patch makes two needlessly global functions static. Signed-off-by: Adrian Bunk Signed-off-by: David S. 
Miller --- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4e9c74b54b1..a4c347c3b8e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1217,7 +1217,7 @@ static int ipgre_tunnel_init(struct net_device *dev) return 0; } -int __init ipgre_fb_tunnel_init(struct net_device *dev) +static int __init ipgre_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; struct iphdr *iph = &tunnel->parms.iph; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 11c2f68254f..eba64e2bd39 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -690,7 +690,7 @@ csum_page(struct page *page, int offset, int copy) return csum; } -inline int ip_ufo_append_data(struct sock *sk, +static inline int ip_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, -- cgit v1.2.3 From d127e94a5cf1c9c996b8c67cd2595b96c3e35e4c Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 29 Nov 2005 16:28:18 -0800 Subject: [NETFILTER] ipv4: small cleanups This patch contains the following cleanups: - make needlessly global code static - ip_conntrack_core.c: ip_conntrack_flush() -> ip_conntrack_flush(void) Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 4 ++-- net/ipv4/netfilter/ip_nat_core.c | 2 +- net/ipv4/netfilter/ipt_LOG.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 422ab68ee7f..7a4ecddd597 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1354,7 +1354,7 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) get_order(sizeof(struct list_head) * size)); } -void ip_conntrack_flush() +void ip_conntrack_flush(void) { /* This makes sure all current packets have passed through netfilter framework. 
Roll on, two-stage module @@ -1408,7 +1408,7 @@ static struct list_head *alloc_hashtable(int size, int *vmalloced) return hash; } -int set_hashsize(const char *val, struct kernel_param *kp) +static int set_hashsize(const char *val, struct kernel_param *kp) { int i, bucket, hashsize, vmalloced; int old_vmalloced, old_size; diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 762f4d93936..c1a61462507 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -49,7 +49,7 @@ static unsigned int ip_nat_htable_size; static struct list_head *bysource; #define MAX_IP_NAT_PROTO 256 -struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; +static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; static inline struct ip_nat_protocol * __ip_nat_proto_find(u_int8_t protonum) diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 180d6ca04af..30be0f1dae3 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -351,7 +351,7 @@ static void dump_packet(const struct nf_loginfo *info, /* maxlen = 230+ 91 + 230 + 252 = 803 */ } -struct nf_loginfo default_loginfo = { +static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { -- cgit v1.2.3 From 34a0b3cdc078746788ffc49e56da0db62b8b6ea4 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 29 Nov 2005 16:28:56 -0800 Subject: [IPV6]: make two functions static This patch makes two needlessly global functions static. Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 3 ++- net/ipv6/ipv6_sockglue.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c1fa693511a..8523c76ebf7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -774,7 +774,8 @@ out_err_release: *dst = NULL; return err; } -inline int ip6_ufo_append_data(struct sock *sk, + +static inline int ip6_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 25757ade989..3620718defe 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -628,8 +628,8 @@ e_inval: return -EINVAL; } -int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_opt_hdr *hdr, - char __user *optval, int len) +static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_opt_hdr *hdr, + char __user *optval, int len) { if (!hdr) return 0; -- cgit v1.2.3 From 73f306024c15bd12e59677d6eaf43ecced614f04 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Thu, 1 Dec 2005 14:28:58 -0800 Subject: [NETFILTER]: Ignore ACKs ACKs on half open connections in TCP conntrack Mounting NFS file systems after a (warm) reboot could take a long time if firewalling and connection tracking was enabled. The reason is that the NFS clients tends to use the same ports (800 and counting down). Now on reboot, the server would still have a TCB for an existing TCP connection client:800 -> server:2049. The client sends a SYN from port 800 to server:2049, which elicits an ACK from the server. The firewall on the client drops the ACK because (from its point of view) the connection is still in half-open state, and it expects to see a SYNACK. The client will eventually time out after several minutes. The following patch corrects this, by accepting ACKs on half open connections as well. 
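
A toy sketch of the behavioural change, with the conntrack state table reduced to a few enum values (none of this is the real tcp_conntracks[] layout): an ACK seen in the reply direction while the originator is still in SYN_SENT used to be classed invalid and dropped, whereas it is now merely ignored and let through, so the client can reset the server's stale connection and retry the handshake.

/* Hypothetical simplification of the verdict for a reply-direction ACK. */
#include <stdio.h>

enum state   { S_NONE, S_SYN_SENT, S_SYN_RECV, S_ESTABLISHED };
enum verdict { V_INVALID, V_IGNORE, V_ACCEPT };

static enum verdict reply_ack_verdict(enum state s, int patched)
{
    switch (s) {
    case S_SYN_SENT:
        /* old table: invalid (dropped); new table: ignored (passed on),
         * so the half-open peer's answer no longer wedges the retry */
        return patched ? V_IGNORE : V_INVALID;
    case S_SYN_RECV:
    case S_ESTABLISHED:
        return V_ACCEPT;
    default:
        return V_INVALID;
    }
}

int main(void)
{
    printf("ACK in reply while SYN_SENT: old=%d new=%d\n",
           reply_ack_verdict(S_SYN_SENT, 0),
           reply_ack_verdict(S_SYN_SENT, 1));
    return 0;
}
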
Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 29 ++++++++++++++++++++--------- net/netfilter/nf_conntrack_proto_tcp.c | 29 ++++++++++++++++++++--------- 2 files changed, 40 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 62598167677..aeb7353d477 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -272,9 +272,9 @@ static const enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* - * sSS -> sIV Might be a half-open connection. + * sSS -> sIG Might be a half-open connection. * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -917,8 +917,12 @@ static int tcp_packet(struct ip_conntrack *conntrack, switch (new_state) { case TCP_CONNTRACK_IGNORE: - /* Either SYN in ORIGINAL - * or SYN/ACK in REPLY. */ + /* Ignored packets: + * + * a) SYN in ORIGINAL + * b) SYN/ACK in REPLY + * c) ACK in reply direction after initial SYN in original. + */ if (index == TCP_SYNACK_SET && conntrack->proto.tcp.last_index == TCP_SYN_SET && conntrack->proto.tcp.last_dir != dir @@ -985,13 +989,20 @@ static int tcp_packet(struct ip_conntrack *conntrack, } case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET - && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET) + || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { - /* RST sent to invalid SYN we had let trough - * SYN was in window then, tear down connection. + /* RST sent to invalid SYN or ACK we had let trough + * at a) and c) above: + * + * a) SYN was in window then + * c) we hold a half-open connection. + * + * Delete our connection entry. * We skip window checking, because packet might ACK - * segments we ignored in the SYN. */ + * segments we ignored. */ goto in_window; } /* Just fall trough */ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 5a6fcf349bd..6035633d822 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -280,9 +280,9 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* - * sSS -> sIV Might be a half-open connection. + * sSS -> sIG Might be a half-open connection. * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -912,8 +912,12 @@ static int tcp_packet(struct nf_conn *conntrack, switch (new_state) { case TCP_CONNTRACK_IGNORE: - /* Either SYN in ORIGINAL - * or SYN/ACK in REPLY. */ + /* Ignored packets: + * + * a) SYN in ORIGINAL + * b) SYN/ACK in REPLY + * c) ACK in reply direction after initial SYN in original. 
+ */ if (index == TCP_SYNACK_SET && conntrack->proto.tcp.last_index == TCP_SYN_SET && conntrack->proto.tcp.last_dir != dir @@ -979,13 +983,20 @@ static int tcp_packet(struct nf_conn *conntrack, } case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET - && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET) + || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { - /* RST sent to invalid SYN we had let trough - * SYN was in window then, tear down connection. + /* RST sent to invalid SYN or ACK we had let trough + * at a) and c) above: + * + * a) SYN was in window then + * c) we hold a half-open connection. + * + * Delete our connection entry. * We skip window checking, because packet might ACK - * segments we ignored in the SYN. */ + * segments we ignored. */ goto in_window; } /* Just fall trough */ -- cgit v1.2.3 From 2a43c4af3fa2e701008d51c28365e26fccf9cbb0 Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Thu, 1 Dec 2005 14:29:24 -0800 Subject: [NETFILTER]: Fix recent match jiffies wrap mismatches Around jiffies wrap time (i.e. within first 5 mins after boot), recent match rules which contain both --seconds and --hitcount arguments experience false matches. This is because the last_pkts array is filled with zeros on creation, and when comparing 'now' to 0 (+ --seconds argument), time_before_eq thinks it has found a hit. Below patch adds a break if the packet value is zero. This has the unfortunate side effect of causing mismatches if a packet was received when jiffies really was equal to zero. The odds of that happening are slim compared to the problems caused by not adding the break however. Plus, the author used this same method just below, so it is "good enough". This fixes netfilter bugs #383 and #395. Signed-off-by: Phil Oester Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 2d44b07688a..261cbb4d4c4 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -532,6 +532,7 @@ match(const struct sk_buff *skb, } if(info->seconds && info->hit_count) { for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(r_list[location].last_pkts[pkt_count] == 0) break; if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; } if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; -- cgit v1.2.3 From ea86575eaf99a9262a969309d934318028dbfacb Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 1 Dec 2005 14:30:00 -0800 Subject: [NETLINK]: Fix processing of fib_lookup netlink messages The receive path for fib_lookup netlink messages is lacking sanity checks for header and payload and is thus vulnerable to malformed netlink messages causing illegal memory references. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- net/ipv4/fib_frontend.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 882f88f6d13..19b1b984d68 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -544,12 +544,16 @@ static void nl_fib_input(struct sock *sk, int len) struct sk_buff *skb = NULL; struct nlmsghdr *nlh = NULL; struct fib_result_nl *frn; - int err; u32 pid; struct fib_table *tb; - skb = skb_recv_datagram(sk, 0, 0, &err); + skb = skb_dequeue(&sk->sk_receive_queue); nlh = (struct nlmsghdr *)skb->data; + if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || + nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) { + kfree_skb(skb); + return; + } frn = (struct fib_result_nl *) NLMSG_DATA(nlh); tb = fib_get_table(frn->tb_id_in); -- cgit v1.2.3 From 6736dc35e9e1b9c8084d5c362a429a3e8189af6b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 2 Dec 2005 20:30:06 -0800 Subject: [SCTP]: Return socket errors only if the receive queue is empty. This patch fixes an issue where it is possible to get valid data after a ENOTCONN error. It returns socket errors only after data queued on socket receive queue is consumed. Signed-off-by: Neil Horman Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/socket.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index abab81f3818..d890dfa8818 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4743,11 +4743,6 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, struct sk_buff *skb; long timeo; - /* Caller is allowed not to check sk->sk_err before calling. */ - error = sock_error(sk); - if (error) - goto no_packet; - timeo = sock_rcvtimeo(sk, noblock); SCTP_DEBUG_PRINTK("Timeout: timeo: %ld, MAX: %ld.\n", @@ -4774,6 +4769,11 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, if (skb) return skb; + /* Caller is allowed not to check sk->sk_err before calling. */ + error = sock_error(sk); + if (error) + goto no_packet; + if (sk->sk_shutdown & RCV_SHUTDOWN) break; -- cgit v1.2.3 From bf031fff1fac77775b2cd2c72ad8b017f4c0af13 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 2 Dec 2005 20:32:29 -0800 Subject: [SCTP]: Fix getsockname for sctp when an ipv6 socket accepts a connection from an ipv4 socket. Signed-off-by: Neil Horman Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 6bc27200e6c..268ddaf2dc0 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -261,7 +261,8 @@ void sctp_transport_route(struct sctp_transport *transport, * association's active path for getsockname(). */ if (asoc && (transport == asoc->peer.active_path)) - af->to_sk_saddr(&transport->saddr, asoc->base.sk); + opt->pf->af->to_sk_saddr(&transport->saddr, + asoc->base.sk); } else transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; } -- cgit v1.2.3 From 24c6927505ca77ee4ac25fb31dcd56f6506979ed Mon Sep 17 00:00:00 2001 From: David Stevens Date: Fri, 2 Dec 2005 20:32:59 -0800 Subject: [IGMP]: workaround for IGMP v1/v2 bug From: David Stevens As explained at: http://www.cs.ucsb.edu/~krishna/igmp_dos/ With IGMP version 1 and 2 it is possible to inject a unicast report to a client which will make it ignore multicast reports sent later by the router. 
The fix is to only accept the report if is was sent to a multicast or unicast address. Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 5 ++++- net/ipv6/mcast.c | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c04607b4921..4a195c724f0 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -897,7 +897,10 @@ int igmp_rcv(struct sk_buff *skb) /* Is it our report looped back? */ if (((struct rtable*)skb->dst)->fl.iif == 0) break; - igmp_heard_report(in_dev, ih->group); + /* don't rely on MC router hearing unicast reports */ + if (skb->pkt_type == PACKET_MULTICAST || + skb->pkt_type == PACKET_BROADCAST) + igmp_heard_report(in_dev, ih->group); break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index f15e04ad026..fd939da090c 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1231,6 +1231,11 @@ int igmp6_event_report(struct sk_buff *skb) if (skb->pkt_type == PACKET_LOOPBACK) return 0; + /* send our report if the MC router may not have heard this report */ + if (skb->pkt_type != PACKET_MULTICAST && + skb->pkt_type != PACKET_BROADCAST) + return 0; + if (!pskb_may_pull(skb, sizeof(struct in6_addr))) return -EINVAL; -- cgit v1.2.3 From 86c8f9d158f68538a971a47206a46a22c7479bac Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2005 20:43:26 -0800 Subject: [IPV4] Fix EPROTONOSUPPORT error in inet_create There is a coding error in inet_create that causes it to always return ESOCKTNOSUPPORT. It should return EPROTONOSUPPORT when there are protocols registered for a given socket type but none of them match the requested protocol. This is based on a patch by Jayachandran C. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eaa150c33b0..d368cf24900 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -228,13 +228,14 @@ static int inet_create(struct socket *sock, int protocol) unsigned char answer_flags; char answer_no_check; int try_loading_module = 0; - int err = -ESOCKTNOSUPPORT; + int err; sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ answer = NULL; lookup_protocol: + err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_rcu(p, &inetsw[sock->type]) { answer = list_entry(p, struct inet_protosw, list); @@ -252,6 +253,7 @@ lookup_protocol: if (IPPROTO_IP == answer->protocol) break; } + err = -EPROTONOSUPPORT; answer = NULL; } @@ -280,9 +282,6 @@ lookup_protocol: err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; - err = -EPROTONOSUPPORT; - if (!protocol) - goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; -- cgit v1.2.3 From af1afe866297448ad8a1da99fa8a6af86c43c909 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Fri, 2 Dec 2005 20:56:57 -0800 Subject: [IPV6]: Load protocol module dynamically. [ Modified to match inet_create() bug fix by Herbert Xu -DaveM ] Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. 
Miller --- net/ipv6/af_inet6.c | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c63b8ce0e1b..d9546380fa0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -92,10 +92,13 @@ static int inet6_create(struct socket *sock, int protocol) struct proto *answer_prot; unsigned char answer_flags; char answer_no_check; - int rc; + int try_loading_module = 0; + int err; /* Look for the requested type/protocol pair. */ answer = NULL; +lookup_protocol: + err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_rcu(p, &inetsw6[sock->type]) { answer = list_entry(p, struct inet_protosw, list); @@ -113,21 +116,37 @@ static int inet6_create(struct socket *sock, int protocol) if (IPPROTO_IP == answer->protocol) break; } + err = -EPROTONOSUPPORT; answer = NULL; } - rc = -ESOCKTNOSUPPORT; - if (!answer) - goto out_rcu_unlock; - rc = -EPERM; + if (!answer) { + if (try_loading_module < 2) { + rcu_read_unlock(); + /* + * Be more specific, e.g. net-pf-10-proto-132-type-1 + * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM) + */ + if (++try_loading_module == 1) + request_module("net-pf-%d-proto-%d-type-%d", + PF_INET6, protocol, sock->type); + /* + * Fall back to generic, e.g. net-pf-10-proto-132 + * (net-pf-PF_INET6-proto-IPPROTO_SCTP) + */ + else + request_module("net-pf-%d-proto-%d", + PF_INET6, protocol); + goto lookup_protocol; + } else + goto out_rcu_unlock; + } + + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; - rc = -EPROTONOSUPPORT; - if (!protocol) - goto out_rcu_unlock; sock->ops = answer->ops; - answer_prot = answer->prot; answer_no_check = answer->no_check; answer_flags = answer->flags; @@ -135,14 +154,14 @@ static int inet6_create(struct socket *sock, int protocol) BUG_TRAP(answer_prot->slab != NULL); - rc = -ENOBUFS; + err = -ENOBUFS; sk = sk_alloc(PF_INET6, GFP_KERNEL, answer_prot, 1); if (sk == NULL) goto out; sock_init_data(sock, sk); - rc = 0; + err = 0; sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = 1; @@ -202,14 +221,14 @@ static int inet6_create(struct socket *sock, int protocol) sk->sk_prot->hash(sk); } if (sk->sk_prot->init) { - rc = sk->sk_prot->init(sk); - if (rc) { + err = sk->sk_prot->init(sk); + if (err) { sk_common_release(sk); goto out; } } out: - return rc; + return err; out_rcu_unlock: rcu_read_unlock(); goto out; -- cgit v1.2.3 From bb184f3356adbbb1605df2f7c570b4c92231fa3a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Dec 2005 15:20:10 -0500 Subject: SUNRPC: Fix Oopsable condition in rpc_pipefs The elements on rpci->in_upcall are tracked by the filp->private_data, which will ensure that they get released when the file is closed. The exception is if rpc_close_pipes() gets called first, since that sets rpci->ops to NULL. 
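
A rough user-space sketch of the ownership rule being enforced here, with invented types and names standing in for the SUNRPC structures: entries on the in_upcall list still belong to the reader that holds them via its open file, so the generic purge path must leave that list alone, and only the close path (the one place where ops is still set before being cleared) may drain it.

/* Hypothetical sketch, not the real rpc_pipe.c code paths. */
#include <stdio.h>
#include <stdlib.h>

struct msg { struct msg *next; };

struct pipe_state {
    struct msg *queued;     /* plays the role of rpci->pipe      */
    struct msg *in_upcall;  /* plays the role of rpci->in_upcall */
    int has_ops;            /* cleared once the pipe is closed   */
};

static void purge(struct msg **head)
{
    while (*head) {
        struct msg *m = *head;
        *head = m->next;
        free(m);            /* stands in for ops->destroy_msg() */
    }
}

/* Generic error path: a reader may still reference in_upcall entries
 * through its file, so only the queued list is drained here. */
static void purge_upcall(struct pipe_state *p)
{
    purge(&p->queued);
}

/* Close path: runs while ops is still set, so this is the one place
 * allowed to drain in_upcall as well. */
static void close_pipe(struct pipe_state *p)
{
    if (p->has_ops) {
        purge(&p->in_upcall);
        purge_upcall(p);
        p->has_ops = 0;
    }
}

int main(void)
{
    struct pipe_state p = { calloc(1, sizeof(struct msg)),
                            calloc(1, sizeof(struct msg)), 1 };
    purge_upcall(&p);   /* in_upcall survives the error path  */
    close_pipe(&p);     /* everything is freed exactly once   */
    printf("queued=%p in_upcall=%p\n", (void *)p.queued, (void *)p.in_upcall);
    return 0;
}
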
Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index e3b242daf53..c76ea221798 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -59,7 +59,6 @@ __rpc_purge_upcall(struct inode *inode, int err) struct rpc_inode *rpci = RPC_I(inode); __rpc_purge_list(rpci, &rpci->pipe, err); - __rpc_purge_list(rpci, &rpci->in_upcall, err); rpci->pipelen = 0; wake_up(&rpci->waitq); } @@ -119,6 +118,7 @@ rpc_close_pipes(struct inode *inode) down(&inode->i_sem); if (rpci->ops != NULL) { rpci->nreaders = 0; + __rpc_purge_list(rpci, &rpci->in_upcall, -EPIPE); __rpc_purge_upcall(inode, -EPIPE); rpci->nwriters = 0; if (rpci->ops->release_pipe) -- cgit v1.2.3 From 8d1ca69984ed1e5930c0537b8f606c54007d7319 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 5 Dec 2005 13:32:14 -0800 Subject: [NETFILTER]: Fix incorrect argument to ip_nat_initialized() in ctnetlink ip_nat_initialized() takes enum ip_nat_manip_type as it's second argument, not a hook number. Noticed and initial patch by Marcus Sundberg . Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 3fce91bcc0b..70402e0ed00 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -877,7 +877,7 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) DEBUGP("NAT status: %lu\n", status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - if (ip_nat_initialized(ct, hooknum)) + if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) return -EEXIST; ip_nat_setup_info(ct, &range, hooknum); -- cgit v1.2.3 From f16c910724250c1af0f53111b4c76505000819f6 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Mon, 5 Dec 2005 13:32:50 -0800 Subject: [NETFILTER]: nf_conntrack: Fix missing check for ICMPv6 type This makes nf_conntrack_icmpv6 check that ICMPv6 type isn't < 128 to avoid accessing out of array valid_new[] and invmap[]. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index c0f1da5497a..a7e03cfacd0 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -68,8 +68,8 @@ static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 }; - __u8 type = orig->dst.u.icmp.type - 128; - if (type >= sizeof(invmap) || !invmap[type]) + int type = orig->dst.u.icmp.type - 128; + if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return 0; tuple->src.u.icmp.id = orig->src.u.icmp.id; @@ -129,12 +129,12 @@ static int icmpv6_new(struct nf_conn *conntrack, [ICMPV6_ECHO_REQUEST - 128] = 1, [ICMPV6_NI_QUERY - 128] = 1 }; + int type = conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128; - if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new) - || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) { + if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { /* Can't create a new ICMPv6 `conn' with this. */ - DEBUGP("icmp: can't create new conn with type %u\n", - conntrack->tuplehash[0].tuple.dst.u.icmp.type); + DEBUGP("icmpv6: can't create new conn with type %u\n", + type + 128); NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); return 0; } -- cgit v1.2.3 From 3ebbe0cdd41f62e5c79cb4d7aa42a1ca50d7d2f2 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Mon, 5 Dec 2005 13:33:26 -0800 Subject: [NETFILTER]: nfnetlink: Fix calculation of minimum message length At least, valid nfnetlink message should have nlmsghdr and nfgenmsg. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nfnetlink.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index a60c59b9763..95fdf04f1d8 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -162,7 +162,7 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, return -EINVAL; } - min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg)); + min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); if (unlikely(nlh->nlmsg_len < min_len)) return -EINVAL; @@ -236,8 +236,7 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, } /* All the messages must at least contain nfgenmsg */ - if (nlh->nlmsg_len < - NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) { + if (nlh->nlmsg_len < NLMSG_SPACE(sizeof(struct nfgenmsg))) { DEBUGP("received message was too short\n"); return 0; } -- cgit v1.2.3 From afe5c6bb034bfa5824f8e7def6a739653e8f4655 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:33:50 -0800 Subject: [NETFILTER]: Fix ip_conntrack_flush abuse in ctnetlink ip_conntrack_flush() used to be part of ip_conntrack_cleanup(), which needs to drop _all_ references on module unload. Table flushed using ctnetlink just needs to clean the table and doesn't need to flush the event cache or wait for any references attached to skbs. Move everything but pure table flushing back to ip_conntrack_cleanup(). Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 7a4ecddd597..84c66dbfeda 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1345,6 +1345,11 @@ static int kill_all(struct ip_conntrack *i, void *data) return 1; } +void ip_conntrack_flush(void) +{ + ip_ct_iterate_cleanup(kill_all, NULL); +} + static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) { if (vmalloced) @@ -1354,8 +1359,12 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) get_order(sizeof(struct list_head) * size)); } -void ip_conntrack_flush(void) +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) { + ip_ct_attach = NULL; + /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... */ @@ -1363,7 +1372,7 @@ void ip_conntrack_flush(void) ip_ct_event_cache_flush(); i_see_dead_people: - ip_ct_iterate_cleanup(kill_all, NULL); + ip_conntrack_flush(); if (atomic_read(&ip_conntrack_count) != 0) { schedule(); goto i_see_dead_people; @@ -1371,14 +1380,7 @@ void ip_conntrack_flush(void) /* wait until all references to ip_conntrack_untracked are dropped */ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) schedule(); -} -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void ip_conntrack_cleanup(void) -{ - ip_ct_attach = NULL; - ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, -- cgit v1.2.3 From 0be7fa92ca162bf5e7993c392e6f93909d617bbb Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:34:51 -0800 Subject: [NETFILTER]: Fix CTA_PROTO_NUM attribute size in ctnetlink CTA_PROTO_NUM is a u_int8_t. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 70402e0ed00..d058ac41bfd 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -503,7 +503,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) } static const size_t cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_NUM-1] = sizeof(u_int16_t), + [CTA_PROTO_NUM-1] = sizeof(u_int8_t), [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), @@ -528,7 +528,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, if (!tb[CTA_PROTO_NUM-1]) return -EINVAL; - tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); proto = ip_conntrack_proto_find_get(tuple->dst.protonum); -- cgit v1.2.3 From a79575633300adb5d3f1bd856cc518c45fefcb86 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:36:25 -0800 Subject: [NETFILTER]: Mark ctnetlink as EXPERIMENTAL Should have been marked EXPERIMENTAL from the beginning, as the current bunch of fixes show. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/Kconfig | 8 ++++---- net/netfilter/Kconfig | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 0bc00528d88..88a60650e6b 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -56,8 +56,8 @@ config IP_NF_CONNTRACK_MARK instead of the individual packets. config IP_NF_CONNTRACK_EVENTS - bool "Connection tracking events" - depends on IP_NF_CONNTRACK + bool "Connection tracking events (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_CONNTRACK help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code @@ -66,8 +66,8 @@ config IP_NF_CONNTRACK_EVENTS IF unsure, say `N'. config IP_NF_CONNTRACK_NETLINK - tristate 'Connection tracking netlink interface' - depends on IP_NF_CONNTRACK && NETFILTER_NETLINK + tristate 'Connection tracking netlink interface (EXPERIMENTAL)' + depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m help This option enables support for a netlink-based userspace interface diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index a84f9221e5f..794c41d19b2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -61,8 +61,8 @@ config NF_CONNTRACK_MARK instead of the individual packets. config NF_CONNTRACK_EVENTS - bool "Connection tracking events" - depends on NF_CONNTRACK + bool "Connection tracking events (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code -- cgit v1.2.3 From 6636568cf85ef5898a892e90fcc88b61cca9ca27 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:36:50 -0800 Subject: [NETFILTER]: Wait for untracked references in nf_conntrack module unload Noticed by Pablo Neira . Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1da678303d7..a7c7b490cf2 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1383,6 +1383,9 @@ void nf_conntrack_cleanup(void) schedule(); goto i_see_dead_people; } + /* wait until all references to nf_conntrack_untracked are dropped */ + while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) + schedule(); for (i = 0; i < NF_CT_F_NUM; i++) { if (nf_ct_cache[i].use == 0) -- cgit v1.2.3 From 266c8543480e2202ab63d1d604a5ca049f350cd8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:37:33 -0800 Subject: [NETFILTER]: Fix unbalanced read_unlock_bh in ctnetlink NFA_NEST calls NFA_PUT which jumps to nfattr_failure if the skb has no room left. We call read_unlock_bh at nfattr_failure for the NFA_PUT inside the locked section, so move NFA_NEST inside the locked section too. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index aeb7353d477..e7fa29e576d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -341,9 +341,10 @@ static int tcp_print_conntrack(struct seq_file *s, static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, const struct ip_conntrack *ct) { - struct nfattr *nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); + struct nfattr *nest_parms; read_lock_bh(&tcp_lock); + nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), &ct->proto.tcp.state); read_unlock_bh(&tcp_lock); -- cgit v1.2.3 From 2fdf1faa8e33082d691bcba18814276f2bd5a6f0 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:38:16 -0800 Subject: [NETFILTER]: Don't use conntrack entry after dropping the reference Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index d058ac41bfd..91fe8f2e38f 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -728,11 +728,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, return -ENOENT; } } - if (del_timer(&ct->timeout)) { - ip_conntrack_put(ct); + if (del_timer(&ct->timeout)) ct->timeout.function((unsigned long)ct); - return 0; - } + ip_conntrack_put(ct); DEBUGP("leaving\n"); -- cgit v1.2.3 From dab9630fb3d206f49658066a3ecf80ea120364db Mon Sep 17 00:00:00 2001 From: Martin Waitz Date: Mon, 5 Dec 2005 13:40:12 -0800 Subject: [NET]: make function pointer argument parseable by kernel-doc When a function takes a function pointer as argument it should use the 'return (*pointer)(params...)' syntax used everywhere else in the kernel as this is recognized by kernel-doc. Signed-off-by: Martin Waitz Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b7d13a4fff4..83fee37de38 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1725,7 +1725,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, * of the skb if any page alloc fails user this procedure returns -ENOMEM */ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, - int getfrag(void *from, char *to, int offset, + int (*getfrag)(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length) { -- cgit v1.2.3 From 1f12bcc9d1840fd26bf577065214f1ebeb2609ba Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 5 Dec 2005 13:42:06 -0800 Subject: [DECNET]: add memory buffer settings The patch (originally from Steve) simply adds memory buffer settings to DECnet similar to those in TCP. Signed-off-by: Patrick Caulfield Signed-off-by: David S. 
Miller --- net/decnet/af_decnet.c | 25 ++++++++++++++++++++++--- net/decnet/sysctl_net_decnet.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index f89e55f814d..d402e9020c6 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -153,6 +153,7 @@ static struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; +static atomic_t decnet_memory_allocated; static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen, int flags); static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); @@ -446,10 +447,26 @@ static void dn_destruct(struct sock *sk) dst_release(xchg(&sk->sk_dst_cache, NULL)); } +static int dn_memory_pressure; + +static void dn_enter_memory_pressure(void) +{ + if (!dn_memory_pressure) { + dn_memory_pressure = 1; + } +} + static struct proto dn_proto = { - .name = "DECNET", - .owner = THIS_MODULE, - .obj_size = sizeof(struct dn_sock), + .name = "NSP", + .owner = THIS_MODULE, + .enter_memory_pressure = dn_enter_memory_pressure, + .memory_pressure = &dn_memory_pressure, + .memory_allocated = &decnet_memory_allocated, + .sysctl_mem = sysctl_decnet_mem, + .sysctl_wmem = sysctl_decnet_wmem, + .sysctl_rmem = sysctl_decnet_rmem, + .max_header = DN_MAX_NSP_DATA_HEADER + 64, + .obj_size = sizeof(struct dn_sock), }; static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp) @@ -470,6 +487,8 @@ static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp) sk->sk_family = PF_DECnet; sk->sk_protocol = 0; sk->sk_allocation = gfp; + sk->sk_sndbuf = sysctl_decnet_wmem[1]; + sk->sk_rcvbuf = sysctl_decnet_rmem[1]; /* Initialization of DECnet Session Control Port */ scp = DN_SK(sk); diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 02bca49cb50..0e9d2c57116 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -10,6 +10,7 @@ * * Changes: * Steve Whitehouse - C99 changes and default device handling + * Steve Whitehouse - Memory buffer settings, like the tcp ones * */ #include @@ -37,6 +38,11 @@ int decnet_dr_count = 3; int decnet_log_martians = 1; int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; +/* Reasonable defaults, I hope, based on tcp's defaults */ +int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; +int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; +int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; + #ifdef CONFIG_SYSCTL extern int decnet_dst_gc_interval; static int min_decnet_time_wait[] = { 5 }; @@ -428,6 +434,33 @@ static ctl_table dn_table[] = { .extra1 = &min_decnet_no_fc_max_cwnd, .extra2 = &max_decnet_no_fc_max_cwnd }, + { + .ctl_name = NET_DECNET_MEM, + .procname = "decnet_mem", + .data = &sysctl_decnet_mem, + .maxlen = sizeof(sysctl_decnet_mem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = NET_DECNET_RMEM, + .procname = "decnet_rmem", + .data = &sysctl_decnet_rmem, + .maxlen = sizeof(sysctl_decnet_rmem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = NET_DECNET_WMEM, + .procname = "decnet_wmem", + .data = &sysctl_decnet_wmem, + .maxlen = sizeof(sysctl_decnet_wmem), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = 
&sysctl_intvec, + }, { .ctl_name = NET_DECNET_DEBUG_LEVEL, .procname = "debug", -- cgit v1.2.3 From 5b4956138173cb8b58d83d3173360e8e681a2b66 Mon Sep 17 00:00:00 2001 From: Thomas Young Date: Tue, 6 Dec 2005 16:16:34 -0800 Subject: [TCP] Vegas: stop resetting rtt every ack Move the resetting of rtt measurements to inside the once per RTT block of code. Signed-off-by: Thomas Young Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index b7d296a8ac6..8f06a479305 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -333,11 +333,11 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, else if (tp->snd_cwnd > tp->snd_cwnd_clamp) tp->snd_cwnd = tp->snd_cwnd_clamp; } - } - /* Wipe the slate clean for the next RTT. */ - vegas->cntRTT = 0; - vegas->minRTT = 0x7fffffff; + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } } /* Extract info for Tcp socket info provided via netlink. */ -- cgit v1.2.3 From 0d7bef600acab393898bd5553e167496587da3e1 Mon Sep 17 00:00:00 2001 From: Thomas Young Date: Tue, 6 Dec 2005 16:17:11 -0800 Subject: [TCP] Vegas: Remove extra call to tcp_vegas_rtt_calc Remove unneeded call to tcp_vegas_rtt_calc. The more accurate microsecond value has already been registered prior to calling tcp_vegas_cong_avoid. Signed-off-by: Thomas Young Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 8f06a479305..13e7e6e8df1 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -215,14 +215,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, vegas->beg_snd_nxt = tp->snd_nxt; vegas->beg_snd_cwnd = tp->snd_cwnd; - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - tcp_vegas_rtt_calc(sk, seq_rtt * 1000); - /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. -- cgit v1.2.3 From dfb4b9dceb35c567a595ae5e9d035cfda044a103 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 6 Dec 2005 16:24:52 -0800 Subject: [TCP] Vegas: timestamp before clone We have to store the congestion control timestamp on the SKB before we clone it, not after. Else we get no timestamping information at all. tcp_transmit_skb() has been reworked so that we can do the timestamp still in one spot, instead of at all the call sites. Problem discovered, and initial fix, from Tom Young . Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 233 +++++++++++++++++++++++++++----------------------- 1 file changed, 124 insertions(+), 109 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 029c70dfb58..b7325e0b406 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. 
*/ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { - if (skb != NULL) { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - int tcp_header_size = tp->tcp_header_len; - struct tcphdr *th; - int sysctl_flags; - int err; + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet; + struct tcp_sock *tp; + struct tcp_skb_cb *tcb; + int tcp_header_size; + struct tcphdr *th; + int sysctl_flags; + int err; + + BUG_ON(!skb || !tcp_skb_pcount(skb)); + + /* If congestion control is doing timestamping, we must + * take such a timestamp before we potentially clone/copy. + */ + if (icsk->icsk_ca_ops->rtt_sample) + __net_timestamp(skb); + + if (likely(clone_it)) { + if (unlikely(skb_cloned(skb))) + skb = pskb_copy(skb, gfp_mask); + else + skb = skb_clone(skb, gfp_mask); + if (unlikely(!skb)) + return -ENOBUFS; + } - BUG_ON(!tcp_skb_pcount(skb)); + inet = inet_sk(sk); + tp = tcp_sk(sk); + tcb = TCP_SKB_CB(skb); + tcp_header_size = tp->tcp_header_len; #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 - /* If congestion control is doing timestamping */ - if (icsk->icsk_ca_ops->rtt_sample) - __net_timestamp(skb); - - sysctl_flags = 0; - if (tcb->flags & TCPCB_FLAG_SYN) { - tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if(sysctl_tcp_timestamps) { - tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_TSTAMPS; - } - if(sysctl_tcp_window_scaling) { - tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_WSCALE; - } - if(sysctl_tcp_sack) { - sysctl_flags |= SYSCTL_FLAG_SACK; - if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; - } - } else if (tp->rx_opt.eff_sacks) { - /* A SACK is 2 pad bytes, a 2 byte header, plus - * 2 32-bit sequence numbers for each SACK block. - */ - tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); + sysctl_flags = 0; + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) { + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_TSTAMPS; } - - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - - th = (struct tcphdr *) skb_push(skb, tcp_header_size); - skb->h.th = th; - skb_set_owner_w(skb, sk); - - /* Build TCP header and checksum it. */ - th->source = inet->sport; - th->dest = inet->dport; - th->seq = htonl(tcb->seq); - th->ack_seq = htonl(tp->rcv_nxt); - *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); - if (tcb->flags & TCPCB_FLAG_SYN) { - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. 
- */ - th->window = htons(tp->rcv_wnd); - } else { - th->window = htons(tcp_select_window(sk)); + if (sysctl_tcp_window_scaling) { + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_WSCALE; } - th->check = 0; - th->urg_ptr = 0; - - if (tp->urg_mode && - between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) { - th->urg_ptr = htons(tp->snd_up-tcb->seq); - th->urg = 1; + if (sysctl_tcp_sack) { + sysctl_flags |= SYSCTL_FLAG_SACK; + if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; } + } else if (unlikely(tp->rx_opt.eff_sacks)) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. + */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK)); + } + + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); + + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th->source = inet->sport; + th->dest = inet->dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | + tcb->flags); + + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(tp->rcv_wnd); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = 0; - if (tcb->flags & TCPCB_FLAG_SYN) { - tcp_syn_build_options((__u32 *)(th + 1), - tcp_advertise_mss(sk), - (sysctl_flags & SYSCTL_FLAG_TSTAMPS), - (sysctl_flags & SYSCTL_FLAG_SACK), - (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rx_opt.rcv_wscale, - tcb->when, - tp->rx_opt.ts_recent); - } else { - tcp_build_and_update_options((__u32 *)(th + 1), - tp, tcb->when); + if (unlikely(tp->urg_mode && + between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) { + th->urg_ptr = htons(tp->snd_up-tcb->seq); + th->urg = 1; + } - TCP_ECN_send(sk, tp, skb, tcp_header_size); - } - tp->af_specific->send_check(sk, th, skb->len, skb); + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + tcp_syn_build_options((__u32 *)(th + 1), + tcp_advertise_mss(sk), + (sysctl_flags & SYSCTL_FLAG_TSTAMPS), + (sysctl_flags & SYSCTL_FLAG_SACK), + (sysctl_flags & SYSCTL_FLAG_WSCALE), + tp->rx_opt.rcv_wscale, + tcb->when, + tp->rx_opt.ts_recent); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, tcb->when); + TCP_ECN_send(sk, tp, skb, tcp_header_size); + } - if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + tp->af_specific->send_check(sk, th, skb->len, skb); - if (skb->len != tcp_header_size) - tcp_event_data_sent(tp, skb, sk); + if (likely(tcb->flags & TCPCB_FLAG_ACK)) + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb, sk); - err = tp->af_specific->queue_xmit(skb, 0); - if (err <= 0) - return err; + TCP_INC_STATS(TCP_MIB_OUTSEGS); - tcp_enter_cwr(sk); + err = tp->af_specific->queue_xmit(skb, 0); + if (unlikely(err <= 0)) + return err; + + tcp_enter_cwr(sk); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; - /* NET_XMIT_CN is special. It does not guarantee, - * that this packet is lost. 
It tells that device - * is about to start to drop packets or already - * drops some packets of the same priority and - * invokes us to send less aggressively. - */ - return err == NET_XMIT_CN ? 0 : err; - } - return -ENOBUFS; #undef SYSCTL_FLAG_TSTAMPS #undef SYSCTL_FLAG_WSCALE #undef SYSCTL_FLAG_SACK @@ -1036,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) + if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) break; /* Advance the send_head. This one is sent out. @@ -1109,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { update_send_head(sk, tp, skb); tcp_cwnd_validate(sk, tp); return; @@ -1429,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, (skb_cloned(skb) ? - pskb_copy(skb, GFP_ATOMIC): - skb_clone(skb, GFP_ATOMIC))); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (err == 0) { /* Update global TCP statistics. */ @@ -1665,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (tcp_transmit_skb(sk, skb)) + if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); } @@ -1700,7 +1715,7 @@ int tcp_send_synack(struct sock *sk) TCP_ECN_send_synack(tcp_sk(sk), skb); } TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /* @@ -1861,7 +1876,7 @@ int tcp_connect(struct sock *sk) __skb_queue_tail(&sk->sk_write_queue, buff); sk_charge_skb(sk, buff); tp->packets_out += tcp_skb_pcount(buff); - tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); + tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ @@ -1957,7 +1972,7 @@ void tcp_send_ack(struct sock *sk) /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff); + tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } } @@ -1997,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb); + return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } int tcp_write_wakeup(struct sock *sk) @@ -2030,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) { update_send_head(sk, tp, skb); } -- cgit v1.2.3 From 4ebf0ae2618fbbb0d365e5d295a30ccfcb91fe0b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 6 Dec 2005 16:38:35 -0800 Subject: [AF_PACKET]: Convert PACKET_MMAP over to vm_insert_page(). So we can properly use __GFP_COMP and avoid the use of PG_reserved pages. 
With extremely helpful review from Hugh Dickins. Signed-off-by: David S. Miller --- net/packet/af_packet.c | 115 ++++++++++++++++++++++++++----------------------- 1 file changed, 61 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 499ae3df4a4..3e246276041 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1587,23 +1587,47 @@ static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order) return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1); } -static void free_pg_vec(char **pg_vec, unsigned order, unsigned len) +static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len) { int i; - for (i=0; itp_block_nr; + char **pg_vec; + int i; + + pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL); + if (unlikely(!pg_vec)) + goto out; + + for (i = 0; i < block_nr; i++) { + pg_vec[i] = alloc_one_pg_vec_page(order); + if (unlikely(!pg_vec[i])) + goto out_free_pgvec; + } + +out: + return pg_vec; + +out_free_pgvec: + free_pg_vec(pg_vec, order, block_nr); + pg_vec = NULL; + goto out; +} static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing) { @@ -1617,64 +1641,46 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing /* Sanity tests and some calculations */ - if (po->pg_vec) + if (unlikely(po->pg_vec)) return -EBUSY; - if ((int)req->tp_block_size <= 0) + if (unlikely((int)req->tp_block_size <= 0)) return -EINVAL; - if (req->tp_block_size&(PAGE_SIZE-1)) + if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) return -EINVAL; - if (req->tp_frame_size < TPACKET_HDRLEN) + if (unlikely(req->tp_frame_size < TPACKET_HDRLEN)) return -EINVAL; - if (req->tp_frame_size&(TPACKET_ALIGNMENT-1)) + if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) return -EINVAL; po->frames_per_block = req->tp_block_size/req->tp_frame_size; - if (po->frames_per_block <= 0) + if (unlikely(po->frames_per_block <= 0)) return -EINVAL; - if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr) + if (unlikely((po->frames_per_block * req->tp_block_nr) != + req->tp_frame_nr)) return -EINVAL; - /* OK! 
*/ - - /* Allocate page vector */ - while ((PAGE_SIZE<tp_block_size) - order++; err = -ENOMEM; - - pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL); - if (pg_vec == NULL) + order = get_order(req->tp_block_size); + pg_vec = alloc_pg_vec(req, order); + if (unlikely(!pg_vec)) goto out; - memset(pg_vec, 0, req->tp_block_nr*sizeof(char **)); - - for (i=0; itp_block_nr; i++) { - struct page *page, *pend; - pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order); - if (!pg_vec[i]) - goto out_free_pgvec; - - pend = pg_vec_endpage(pg_vec[i], order); - for (page = virt_to_page(pg_vec[i]); page <= pend; page++) - SetPageReserved(page); - } - /* Page vector is allocated */ l = 0; - for (i=0; itp_block_nr; i++) { + for (i = 0; i < req->tp_block_nr; i++) { char *ptr = pg_vec[i]; struct tpacket_hdr *header; int k; - for (k=0; kframes_per_block; k++) { - - header = (struct tpacket_hdr*)ptr; + for (k = 0; k < po->frames_per_block; k++) { + header = (struct tpacket_hdr *) ptr; header->tp_status = TP_STATUS_KERNEL; ptr += req->tp_frame_size; } } /* Done */ } else { - if (req->tp_frame_nr) + if (unlikely(req->tp_frame_nr)) return -EINVAL; } @@ -1701,7 +1707,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing spin_lock_bh(&sk->sk_receive_queue.lock); pg_vec = XC(po->pg_vec, pg_vec); - po->frame_max = req->tp_frame_nr-1; + po->frame_max = (req->tp_frame_nr - 1); po->head = 0; po->frame_size = req->tp_frame_size; spin_unlock_bh(&sk->sk_receive_queue.lock); @@ -1728,7 +1734,6 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing release_sock(sk); -out_free_pgvec: if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: @@ -1755,17 +1760,19 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE) goto out; - atomic_inc(&po->mapped); start = vma->vm_start; - err = -EAGAIN; - for (i=0; ipg_vec_len; i++) { - if (remap_pfn_range(vma, start, - __pa(po->pg_vec[i]) >> PAGE_SHIFT, - po->pg_vec_pages*PAGE_SIZE, - vma->vm_page_prot)) - goto out; - start += po->pg_vec_pages*PAGE_SIZE; + for (i = 0; i < po->pg_vec_len; i++) { + struct page *page = virt_to_page(po->pg_vec[i]); + int pg_num; + + for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) { + err = vm_insert_page(vma, start, page); + if (unlikely(err)) + goto out; + start += PAGE_SIZE; + } } + atomic_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; -- cgit v1.2.3 From 246a421207007a034da9b8cfa578bc00d16a9553 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 8 Dec 2005 15:21:39 -0800 Subject: [NET]: Fix NULL pointer deref in checksum debugging. The problem I was seeing turned out to be that skb->dev is NULL when the checksum is being completed in user context. This happens because the reference to the device is dropped (to allow it to be released when packets are in the queue). Because skb->dev was NULL, the netdev_rx_csum_fault was panicing on deref of dev->name. How about this? Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0b48e294aaf..a5efc9ae010 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1113,7 +1113,8 @@ out: void netdev_rx_csum_fault(struct net_device *dev) { if (net_ratelimit()) { - printk(KERN_ERR "%s: hw csum failure.\n", dev->name); + printk(KERN_ERR "%s: hw csum failure.\n", + dev ? 
dev->name : ""); dump_stack(); } } -- cgit v1.2.3 From 73d4f84fd001b0be67fea46e84b75e6a7a5da08e Mon Sep 17 00:00:00 2001 From: Kazunori MIYAZAWA Date: Thu, 8 Dec 2005 23:11:42 -0800 Subject: [IPv6] IPsec: fix pmtu calculation of esp It is a simple bug which uses the wrong member. This bug does not seriously affect ordinary use of IPsec. But it is important to pass IPv6 ready logo phase-2 conformance test of IPsec SGW. Signed-off-by: Kazunori MIYAZAWA Signed-off-by: David S. Miller --- net/ipv6/esp6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 40d9a1935ab..8bfbe997079 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -248,7 +248,7 @@ static u32 esp6_get_max_size(struct xfrm_state *x, int mtu) if (esp->conf.padlen) mtu = ALIGN(mtu, esp->conf.padlen); - return mtu + x->props.header_len + esp->auth.icv_full_len; + return mtu + x->props.header_len + esp->auth.icv_trunc_len; } static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, -- cgit v1.2.3 From ecc51b6d5ca04bb6346c9ad6b37d6ca8bace12b3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 Dec 2005 14:38:10 -0800 Subject: [TCPv6]: Fix skb leak Spotted by Francois Romieu, thanks! Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 62c0e5bd931..8827389abaf 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -992,13 +992,12 @@ static void tcp_v6_send_reset(struct sk_buff *skb) /* sk = NULL, but it is safe for now. RST socket required. */ if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { - if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) + if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, buff, &fl, NULL, 0); + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); return; - - ip6_xmit(NULL, buff, &fl, NULL, 0); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); - return; + } } kfree_skb(buff); @@ -1057,11 +1056,11 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 fl.fl_ip_sport = t1->source; if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { - if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) + if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, buff, &fl, NULL, 0); + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); return; - ip6_xmit(NULL, buff, &fl, NULL, 0); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - return; + } } kfree_skb(buff); -- cgit v1.2.3 From 2f9616d4c44349c903bc1b54fe46ab0ce0210b74 Mon Sep 17 00:00:00 2001 From: Marcus Sundberg Date: Mon, 12 Dec 2005 15:02:48 -0800 Subject: [NETFILTER]: ip_nat_tftp: Fix expectation NAT When a TFTP client is SNATed so that the port is also changed, the port is never changed back for the expected connection. Signed-off-by: Marcus Sundberg Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_nat_tftp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c index 2215317c76b..43c3bd7c118 100644 --- a/net/ipv4/netfilter/ip_nat_tftp.c +++ b/net/ipv4/netfilter/ip_nat_tftp.c @@ -42,7 +42,10 @@ static unsigned int help(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, struct ip_conntrack_expect *exp) { - exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; + struct ip_conntrack *ct = exp->master; + + exp->saved_proto.udp.port + = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; exp->dir = IP_CT_DIR_REPLY; exp->expectfn = ip_nat_follow_master; if (ip_conntrack_expect_related(exp) != 0) -- cgit v1.2.3 From 1cf9e8a7865c0ac216034e519cf6b8505055ea50 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Thu, 1 Dec 2005 21:22:37 +0100 Subject: [PATCH] ieee80211_crypt_tkip depends on NET_RADIO *** Warning: ".wireless_send_event" [net/ieee80211/ieee80211_crypt_tkip.ko] undefined! Signed-off-by: Olaf Hering net/ieee80211/Kconfig | 2 +- 1 files changed, 1 insertion(+), 1 deletion(-) Signed-off-by: Jeff Garzik --- net/ieee80211/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee80211/Kconfig b/net/ieee80211/Kconfig index 91b16fbf91f..d18ccba3ea9 100644 --- a/net/ieee80211/Kconfig +++ b/net/ieee80211/Kconfig @@ -55,7 +55,7 @@ config IEEE80211_CRYPT_CCMP config IEEE80211_CRYPT_TKIP tristate "IEEE 802.11i TKIP encryption" - depends on IEEE80211 + depends on IEEE80211 && NET_RADIO select CRYPTO select CRYPTO_MICHAEL_MIC ---help--- -- cgit v1.2.3 From a1493d9cd1aaed06860d128a37df1bdfbc61f7c8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 13 Dec 2005 22:59:36 -0800 Subject: [IPV6] addrconf: Do not print device pointer in privacy log message. Noticed by Andi Kleen, it is pointless to emit the device structure pointer in the kernel logs like this. Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 76ff9f4fe89..73a23b4130a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -379,8 +379,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) dev->type == ARPHRD_NONE || dev->type == ARPHRD_SIT) { printk(KERN_INFO - "Disabled Privacy Extensions on device %p(%s)\n", - dev, dev->name); + "%s: Disabled Privacy Extensions\n", + dev->name); ndev->cnf.use_tempaddr = -1; } else { in6_dev_hold(ndev); -- cgit v1.2.3 From 2edc2689f8183dd21c45621a01580b340ac420ba Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 13 Dec 2005 22:59:50 -0800 Subject: [PKT_SCHED]: Disable debug tracing logs by default in packet action API. Noticed by Andi Kleen. Signed-off-by: David S. Miller --- net/sched/act_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 8aebe8f6d27..2ce1cb2aa2e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -34,7 +34,7 @@ #include #include -#if 1 /* control */ +#if 0 /* control */ #define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args) #else #define DPRINTK(format, args...) -- cgit v1.2.3 From 1542272a60ab9c0655a13ead8b7d7a661365f9fb Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 14 Dec 2005 12:55:24 -0800 Subject: [GRE]: Fix hardware checksum modification The skb_postpull_rcsum introduced a bug to the checksum modification. 
Although the length pulled is offset bytes, the origin of the pulling is the GRE header, not the IP header. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a4c347c3b8e..46f9d9cf7a5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -618,7 +618,7 @@ static int ipgre_rcv(struct sk_buff *skb) skb->mac.raw = skb->nh.raw; skb->nh.raw = __pskb_pull(skb, offset); - skb_postpull_rcsum(skb, skb->mac.raw, offset); + skb_postpull_rcsum(skb, skb->h.raw, offset); memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); skb->pkt_type = PACKET_HOST; #ifdef CONFIG_NET_IPGRE_BROADCAST -- cgit v1.2.3 From a388442c3798a345d131ff8b9d6dea0bfda3fefc Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 14 Dec 2005 16:23:16 -0800 Subject: [VLAN]: Fix hardware rx csum errors Receiving VLAN packets over a device (without VLAN assist) that is doing hardware checksumming (CHECKSUM_HW), causes errors because the VLAN code forgets to adjust the hardware checksum. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index b7486488967..f2a8750bbf1 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -165,6 +165,9 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, skb_pull(skb, VLAN_HLEN); /* take off the VLAN header (4 bytes currently) */ + /* Need to correct hardware checksum */ + skb_postpull_rcsum(skb, vhdr, VLAN_HLEN); + /* Ok, lets check to make sure the device (dev) we * came in on is what this VLAN is attached to. */ -- cgit v1.2.3 From d3a880e1ff6713b4c846e4d2526a8c7e6ad8469c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Dec 2005 09:18:30 +0000 Subject: [PATCH] Address of void __user * is void __user * *, not void * __user * Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- net/sctp/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d890dfa8818..1f7f244806b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3425,7 +3425,7 @@ static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_add } static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, - void * __user *to, size_t space_left) + void __user **to, size_t space_left) { struct list_head *pos; struct sctp_sockaddr_entry *addr; -- cgit v1.2.3 From 0476f171affa6eca62021fca2ae9f5140acc3713 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 19 Dec 2005 13:53:09 -0800 Subject: [NETFILTER]: Fix NAT init order As noticed by Phil Oester, the GRE NAT protocol helper is initialized before the NAT core, which makes registration fail. Change the linking order to make NAT be initialized first. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 058c48e258f..d0a447e520a 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -12,6 +12,7 @@ ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +obj-$(CONFIG_IP_NF_NAT) += ip_nat.o # conntrack netlink interface obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o @@ -41,7 +42,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o -obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o ip_nat.o +obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o # matches -- cgit v1.2.3 From 31cb5bd4dc89ba14e6347b094e15a2f6778a01fc Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 19 Dec 2005 13:53:26 -0800 Subject: [NETFILTER]: Fix incorrect dependency for IP6_NF_TARGET_NFQUEUE IP6_NF_TARGET_NFQUEUE depends on IP6_NF_IPTABLES, not IP_NF_IPTABLES. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/netfilter/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 060d6120241..04912f9b35c 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -211,7 +211,7 @@ config IP6_NF_TARGET_REJECT config IP6_NF_TARGET_NFQUEUE tristate "NFQUEUE Target Support" - depends on IP_NF_IPTABLES + depends on IP6_NF_IPTABLES help This Target replaced the old obsolete QUEUE target. -- cgit v1.2.3 From b03664869aa6f84c3c98a06ac9d6905b195909bc Mon Sep 17 00:00:00 2001 From: Bart De Schuymer Date: Mon, 19 Dec 2005 14:00:08 -0800 Subject: [BRIDGE-NF]: Fix bridge-nf ipv6 length check A typo caused some bridged IPv6 packets to get dropped randomly, as reported by Sebastien Chaumontet. The patch below fixes this (using skb->nh.raw instead of raw) and also makes the jumbo packet length checking up-to-date with the code in net/ipv6/exthdrs.c::ipv6_hop_jumbo. Signed-off-by: Bart De Schuymer Signed-off-by: David S. 
Miller --- net/bridge/br_netfilter.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index d8e36b77512..43a0b35dfe6 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -295,7 +295,7 @@ static int check_hbh_len(struct sk_buff *skb) len -= 2; while (len > 0) { - int optlen = raw[off+1]+2; + int optlen = skb->nh.raw[off+1]+2; switch (skb->nh.raw[off]) { case IPV6_TLV_PAD0: @@ -308,18 +308,15 @@ static int check_hbh_len(struct sk_buff *skb) case IPV6_TLV_JUMBO: if (skb->nh.raw[off+1] != 4 || (off&3) != 2) goto bad; - pkt_len = ntohl(*(u32*)(skb->nh.raw+off+2)); - + if (pkt_len <= IPV6_MAXPLEN || + skb->nh.ipv6h->payload_len) + goto bad; if (pkt_len > skb->len - sizeof(struct ipv6hdr)) goto bad; - if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { - if (__pskb_trim(skb, - pkt_len + sizeof(struct ipv6hdr))) - goto bad; - if (skb->ip_summed == CHECKSUM_HW) - skb->ip_summed = CHECKSUM_NONE; - } + if (pskb_trim_rcsum(skb, + pkt_len+sizeof(struct ipv6hdr))) + goto bad; break; default: if (optlen > len) -- cgit v1.2.3 From 3dd4bc68fac5df16b6d3ed6ed3c29cf05f29a47e Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Mon, 19 Dec 2005 14:02:45 -0800 Subject: [IPV6]: Fix route lifetime. The route expiration time is stored in rt6i_expires in jiffies. The argument of rt6_route_add() for adding a route is not the expiration time in jiffies nor in clock_t, but the lifetime (or time left before expiration) in clock_t. Because of the confusion, we sometimes saw several strange errors (FAILs) in TAHI IPv6 Ready Logo Phase-2 Self Test. The symptoms were analyzed by Mitsuru Chinen . Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 16 ++++++++++++---- net/ipv6/route.c | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 73a23b4130a..4ea8cf7c0cc 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1596,9 +1596,17 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) not good. */ if (valid_lft >= 0x7FFFFFFF/HZ) - rt_expires = 0; + rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ); else - rt_expires = jiffies + valid_lft * HZ; + rt_expires = valid_lft * HZ; + + /* + * We convert this (in jiffies) to clock_t later. + * Avoid arithmetic overflow there as well. + * Overflow can happen only if HZ < USER_HZ. 
+ */ + if (HZ < USER_HZ && rt_expires > 0x7FFFFFFF / USER_HZ) + rt_expires = 0x7FFFFFFF / USER_HZ; if (pinfo->onlink) { struct rt6_info *rt; @@ -1610,12 +1618,12 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) ip6_del_rt(rt, NULL, NULL, NULL); rt = NULL; } else { - rt->rt6i_expires = rt_expires; + rt->rt6i_expires = jiffies + rt_expires; } } } else if (valid_lft) { addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); + dev, jiffies_to_clock_t(rt_expires), RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); } if (rt) dst_release(&rt->u.dst); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a7a537b5059..7c68bfbee36 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -829,7 +829,7 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, } rt->u.dst.obsolete = -1; - rt->rt6i_expires = clock_t_to_jiffies(rtmsg->rtmsg_info); + rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info); if (nlh && (r = NLMSG_DATA(nlh))) { rt->rt6i_protocol = r->rtm_protocol; } else { -- cgit v1.2.3 From 9e999993c71e1506378d26d81f842277aff8a250 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 19 Dec 2005 14:03:46 -0800 Subject: [XFRM]: Handle DCCP in xfrm{4,6}_decode_session Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/xfrm4_policy.c | 1 + net/ipv6/xfrm6_policy.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b2b60f3e9cd..42196ba3b0b 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -182,6 +182,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) case IPPROTO_UDP: case IPPROTO_TCP: case IPPROTO_SCTP: + case IPPROTO_DCCP: if (pskb_may_pull(skb, xprth + 4 - skb->data)) { u16 *ports = (u16 *)xprth; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index cf1d91e74c8..69bd957380e 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -214,6 +214,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) case IPPROTO_UDP: case IPPROTO_TCP: case IPPROTO_SCTP: + case IPPROTO_DCCP: if (pskb_may_pull(skb, skb->nh.raw + offset + 4 - skb->data)) { u16 *ports = (u16 *)exthdr; -- cgit v1.2.3 From 399c180ac5f0cb66ef9479358e0b8b6bafcbeafe Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 19 Dec 2005 14:23:23 -0800 Subject: [IPSEC]: Perform SA switchover immediately. When we insert a new xfrm_state which potentially subsumes an existing one, make sure all cached bundles are flushed so that the new SA is used immediately. Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 19 ++++++++++++++----- net/xfrm/xfrm_state.c | 5 +++++ 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0db9e57013f..54a4be6a7d2 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1014,13 +1014,12 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } EXPORT_SYMBOL(__xfrm_route_forward); -/* Optimize later using cookies and generation ids. */ - static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { - if (!stale_bundle(dst)) - return dst; - + /* If it is marked obsolete, which is how we even get here, + * then we have purged it from the policy bundle list and we + * did that for a good reason. 
+ */ return NULL; } @@ -1104,6 +1103,16 @@ int xfrm_flush_bundles(void) return 0; } +static int always_true(struct dst_entry *dst) +{ + return 1; +} + +void xfrm_flush_all_bundles(void) +{ + xfrm_prune_bundles(always_true); +} + void xfrm_init_pmtu(struct dst_entry *dst) { do { diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 7cf48aa6c95..479effc9766 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -431,6 +431,8 @@ void xfrm_state_insert(struct xfrm_state *x) spin_lock_bh(&xfrm_state_lock); __xfrm_state_insert(x); spin_unlock_bh(&xfrm_state_lock); + + xfrm_flush_all_bundles(); } EXPORT_SYMBOL(xfrm_state_insert); @@ -478,6 +480,9 @@ out: spin_unlock_bh(&xfrm_state_lock); xfrm_state_put_afinfo(afinfo); + if (!err) + xfrm_flush_all_bundles(); + if (x1) { xfrm_state_delete(x1); xfrm_state_put(x1); -- cgit v1.2.3 From 9bffc4ace1ed875667dbe5b29065d96bec558c62 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 19 Dec 2005 14:24:40 -0800 Subject: [SCTP]: Fix sctp to not return erroneous POLLOUT events. Make sctp_writeable() use sk_wmem_alloc rather than sk_wmem_queued to determine the sndbuf space available. It also removes all the modifications to sk_wmem_queued as it is not currently used in SCTP. Signed-off-by: Neil Horman Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/socket.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 1f7f244806b..9df888e932c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -156,10 +156,6 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk) sizeof(struct sk_buff) + sizeof(struct sctp_chunk); - sk->sk_wmem_queued += SCTP_DATA_SNDSIZE(chunk) + - sizeof(struct sk_buff) + - sizeof(struct sctp_chunk); - atomic_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); } @@ -4426,7 +4422,7 @@ cleanup: * tcp_poll(). Note that, based on these implementations, we don't * lock the socket in this function, even though it seems that, * ideally, locking or some other mechanisms can be used to ensure - * the integrity of the counters (sndbuf and wmem_queued) used + * the integrity of the counters (sndbuf and wmem_alloc) used * in this place. We assume that we don't need locks either until proven * otherwise. * @@ -4833,10 +4829,6 @@ static void sctp_wfree(struct sk_buff *skb) sizeof(struct sk_buff) + sizeof(struct sctp_chunk); - sk->sk_wmem_queued -= SCTP_DATA_SNDSIZE(chunk) + - sizeof(struct sk_buff) + - sizeof(struct sctp_chunk); - atomic_sub(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); sock_wfree(skb); @@ -4920,7 +4912,7 @@ void sctp_write_space(struct sock *sk) /* Is there any sndbuf space available on the socket? * - * Note that wmem_queued is the sum of the send buffers on all of the + * Note that sk_wmem_alloc is the sum of the send buffers on all of the * associations on the same socket. For a UDP-style socket with * multiple associations, it is possible for it to be "unwriteable" * prematurely. 
I assume that this is acceptable because @@ -4933,7 +4925,7 @@ static int sctp_writeable(struct sock *sk) { int amt = 0; - amt = sk->sk_sndbuf - sk->sk_wmem_queued; + amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); if (amt < 0) amt = 0; return amt; -- cgit v1.2.3 From b079fa7baa86b47579f3f60f86d03d21c76159b8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 13 Dec 2005 16:13:52 -0500 Subject: RPC: Do not block on skb allocation If we get something like the following, [ 125.300636] [] schedule_timeout+0x54/0xa5 [ 125.305931] [] io_schedule_timeout+0x29/0x33 [ 125.311495] [] blk_congestion_wait+0x70/0x85 [ 125.317058] [] throttle_vm_writeout+0x69/0x7d [ 125.322720] [] shrink_zone+0xe0/0xfa [ 125.327560] [] shrink_caches+0x6d/0x6f [ 125.332581] [] try_to_free_pages+0xd0/0x1b5 [ 125.338056] [] __alloc_pages+0x135/0x2e8 [ 125.343258] [] tcp_sendmsg+0xaa0/0xb78 [ 125.348281] [] inet_sendmsg+0x48/0x53 [ 125.353212] [] sock_sendmsg+0xb8/0xd3 [ 125.358147] [] kernel_sendmsg+0x42/0x4f [ 125.363259] [] sock_no_sendpage+0x5e/0x77 [ 125.368556] [] xs_tcp_send_request+0x2af/0x375 then the socket is blocked until memory is reclaimed, and no progress can ever be made. Try to access the emergency pools by using GFP_ATOMIC. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0a51fd46a84..77e8800d412 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -990,6 +990,7 @@ static void xs_udp_connect_worker(void *args) sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_no_check = UDP_CSUM_NORCV; + sk->sk_allocation = GFP_ATOMIC; xprt_set_connected(xprt); @@ -1074,6 +1075,7 @@ static void xs_tcp_connect_worker(void *args) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; + sk->sk_allocation = GFP_ATOMIC; /* socket options */ sk->sk_userlocks |= SOCK_BINDPORT_LOCK; -- cgit v1.2.3 From 48e49187753ec3b4fa84a7165c9b7a59f3875b56 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 19 Dec 2005 17:11:22 -0500 Subject: SUNRPC: Fix "EPIPE" error on mount of rpcsec_gss-protected partitions gss_create_upcall() should not error just because rpc.gssd closed the pipe on its end. Instead, it should requeue the pending requests and then retry. 
Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 6 ++++-- net/sunrpc/rpc_pipe.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index f44f46f1d8e..8d782282ec1 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -638,7 +638,7 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) gss_msg); atomic_inc(&gss_msg->count); gss_unhash_msg(gss_msg); - if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + if (msg->errno == -ETIMEDOUT) { unsigned long now = jiffies; if (time_after(now, ratelimit)) { printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" @@ -786,7 +786,9 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int taskflags) cred->gc_flags = 0; cred->gc_base.cr_ops = &gss_credops; cred->gc_service = gss_auth->service; - err = gss_create_upcall(gss_auth, cred); + do { + err = gss_create_upcall(gss_auth, cred); + } while (err == -EAGAIN); if (err < 0) goto out_err; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index c76ea221798..16a2458f38f 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -174,7 +174,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) goto out; msg = (struct rpc_pipe_msg *)filp->private_data; if (msg != NULL) { - msg->errno = -EPIPE; + msg->errno = -EAGAIN; list_del_init(&msg->list); rpci->ops->destroy_msg(msg); } @@ -183,7 +183,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (filp->f_mode & FMODE_READ) rpci->nreaders --; if (!rpci->nreaders) - __rpc_purge_upcall(inode, -EPIPE); + __rpc_purge_upcall(inode, -EAGAIN); if (rpci->ops->release_pipe) rpci->ops->release_pipe(inode); out: -- cgit v1.2.3 From 58c4fb86eabcbc385d954843a635b7f4327be6b0 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 21 Dec 2005 22:56:42 +0900 Subject: [IPV6]: Flag RTF_ANYCAST for anycast routes. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/route.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7c68bfbee36..66140f13d11 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -413,11 +413,14 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, rt = ip6_rt_copy(ort); if (rt) { - ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); - - if (!(rt->rt6i_flags&RTF_GATEWAY)) + if (!(rt->rt6i_flags&RTF_GATEWAY)) { + if (rt->rt6i_dst.plen != 128 && + ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)) + rt->rt6i_flags |= RTF_ANYCAST; ipv6_addr_copy(&rt->rt6i_gateway, daddr); + } + ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; rt->u.dst.flags |= DST_HOST; @@ -1413,7 +1416,9 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->u.dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; - if (!anycast) + if (anycast) + rt->rt6i_flags |= RTF_ANYCAST; + else rt->rt6i_flags |= RTF_LOCAL; rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); if (rt->rt6i_nexthop == NULL) { -- cgit v1.2.3 From 8de3351e6e0a1081fbf6864ae37839e327699a08 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 21 Dec 2005 22:57:06 +0900 Subject: [IPV6]: Try not to send icmp to anycast address. 
Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/icmp.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 34a332225c1..6ec6a2b549b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -328,8 +328,10 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, iif = skb->dev->ifindex; /* - * Must not send if we know that source is Anycast also. - * for now we don't know that. + * Must not send error if the source does not uniquely + * identify a single node (RFC2463 Section 2.4). + * We check unspecified / multicast addresses here, + * and anycast addresses will be checked later. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); @@ -373,6 +375,16 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, err = ip6_dst_lookup(sk, &dst, &fl); if (err) goto out; + + /* + * We won't send icmp if the destination is known + * anycast. + */ + if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n"); + goto out_dst_release; + } + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) goto out; -- cgit v1.2.3 From 3c21edbd113788b110116141c8078623a0900b6a Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 21 Dec 2005 22:57:24 +0900 Subject: [IPV6]: Defer IPv6 device initialization until the link becomes ready. NETDEV_UP might be sent even if the link attached to the interface was not ready. DAD does not make sense in such case, so we won't do so. After interface Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/addrconf.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4ea8cf7c0cc..d012f6ac704 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -388,6 +388,9 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) } #endif + if (netif_carrier_ok(dev)) + ndev->if_flags |= IF_READY; + write_lock_bh(&addrconf_lock); dev->ip6_ptr = ndev; write_unlock_bh(&addrconf_lock); @@ -1215,10 +1218,8 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) /* Gets referenced address, destroys ifaddr */ -void addrconf_dad_failure(struct inet6_ifaddr *ifp) +void addrconf_dad_stop(struct inet6_ifaddr *ifp) { - if (net_ratelimit()) - printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); if (ifp->flags&IFA_F_PERMANENT) { spin_lock_bh(&ifp->lock); addrconf_del_timer(ifp); @@ -1244,6 +1245,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ipv6_del_addr(ifp); } +void addrconf_dad_failure(struct inet6_ifaddr *ifp) +{ + if (net_ratelimit()) + printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); + addrconf_dad_stop(ifp); +} /* Join to solicited addr multicast group. */ @@ -2136,6 +2143,37 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, switch(event) { case NETDEV_UP: + case NETDEV_CHANGE: + if (event == NETDEV_UP) { + if (!netif_carrier_ok(dev)) { + /* device is not ready yet. */ + printk(KERN_INFO + "ADDRCONF(NETDEV_UP): %s: " + "link is not ready\n", + dev->name); + break; + } + } else { + if (!netif_carrier_ok(dev)) { + /* device is still not ready. */ + break; + } + + if (idev) { + if (idev->if_flags & IF_READY) { + /* device is already configured. 
*/ + break; + } + idev->if_flags |= IF_READY; + } + + printk(KERN_INFO + "ADDRCONF(NETDEV_CHANGE): %s: " + "link becomes ready\n", + dev->name); + + } + switch(dev->type) { case ARPHRD_SIT: addrconf_sit_config(dev); @@ -2186,8 +2224,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, */ addrconf_ifdown(dev, event != NETDEV_DOWN); break; - case NETDEV_CHANGE: - break; + case NETDEV_CHANGENAME: #ifdef CONFIG_SYSCTL if (idev) { @@ -2268,7 +2305,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) /* Step 3: clear flags for stateless addrconf */ if (how != 1) - idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD); + idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); /* Step 4: clear address list */ #ifdef CONFIG_IPV6_PRIVACY @@ -2377,11 +2414,20 @@ out: /* * Duplicate Address Detection */ +static void addrconf_dad_kick(struct inet6_ifaddr *ifp) +{ + unsigned long rand_num; + struct inet6_dev *idev = ifp->idev; + + rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); + ifp->probes = idev->cnf.dad_transmits; + addrconf_mod_timer(ifp, AC_DAD, rand_num); +} + static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; - unsigned long rand_num; addrconf_join_solict(dev, &ifp->addr); @@ -2390,7 +2436,6 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) flags); net_srandom(ifp->addr.s6_addr32[3]); - rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); read_lock_bh(&idev->lock); if (ifp->dead) @@ -2407,8 +2452,17 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) return; } - ifp->probes = idev->cnf.dad_transmits; - addrconf_mod_timer(ifp, AC_DAD, rand_num); + if (idev->if_flags & IF_READY) + addrconf_dad_kick(ifp); + else { + /* + * If the defice is not ready: + * - keep it tentative if it is a permanent address. + * - otherwise, kill it. + */ + in6_ifa_hold(ifp); + addrconf_dad_stop(ifp); + } spin_unlock_bh(&ifp->lock); out: -- cgit v1.2.3 From c5e33bddd3c798f681f8f3027270127be6b61a3b Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 21 Dec 2005 22:57:44 +0900 Subject: [IPV6]: Run DAD when the link becomes ready. If the link was not available when the interface was created, run DAD for pending tentative addresses when the link becomes ready. 
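The condition the kernel tests with netif_carrier_ok() has a user-space reflection: Linux exposes per-interface carrier and operstate attributes under /sys/class/net/<ifname>/. The short program below reads them for an interface named on the command line (defaulting to "eth0"); the sysfs paths and the behaviour of "carrier" on an administratively down interface, where the read may fail, are assumptions about a typical setup rather than anything this patch defines.

/* Read the carrier/operstate attributes that roughly correspond to
 * netif_carrier_ok().  Paths and the default interface name are
 * assumptions for illustration.
 */
#include <stdio.h>
#include <string.h>

static int read_attr(const char *ifname, const char *attr,
                     char *buf, size_t len)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/class/net/%s/%s", ifname, attr);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (!fgets(buf, (int)len, f)) {
                fclose(f);
                return -1;
        }
        fclose(f);
        buf[strcspn(buf, "\n")] = '\0';
        return 0;
}

int main(int argc, char **argv)
{
        const char *ifname = argc > 1 ? argv[1] : "eth0";
        char carrier[16], operstate[32];

        if (read_attr(ifname, "operstate", operstate, sizeof(operstate)) == 0)
                printf("%s operstate: %s\n", ifname, operstate);
        if (read_attr(ifname, "carrier", carrier, sizeof(carrier)) == 0)
                printf("%s carrier:   %s\n", ifname, carrier);
        else
                printf("%s carrier:   unreadable (interface down?)\n", ifname);
        return 0;
}

Watching operstate flip to "up" is roughly the moment at which, with this patch, the deferred DAD for tentative addresses gets kicked off.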
Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/addrconf.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d012f6ac704..f6ead6a843e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -137,6 +137,7 @@ static int addrconf_ifdown(struct net_device *dev, int how); static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); static void addrconf_dad_timer(unsigned long data); static void addrconf_dad_completed(struct inet6_ifaddr *ifp); +static void addrconf_dad_run(struct inet6_dev *idev); static void addrconf_rs_timer(unsigned long data); static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); @@ -418,6 +419,7 @@ static struct inet6_dev * ipv6_find_idev(struct net_device *dev) if ((idev = ipv6_add_dev(dev)) == NULL) return NULL; } + if (dev->flags&IFF_UP) ipv6_mc_up(idev); return idev; @@ -2140,6 +2142,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, { struct net_device *dev = (struct net_device *) data; struct inet6_dev *idev = __in6_dev_get(dev); + int run_pending = 0; switch(event) { case NETDEV_UP: @@ -2172,6 +2175,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, "link becomes ready\n", dev->name); + run_pending = 1; } switch(dev->type) { @@ -2190,6 +2194,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; }; if (idev) { + if (run_pending) + addrconf_dad_run(idev); + /* If the MTU changed during the interface down, when the interface up, the changed MTU must be reflected in the idev as well as routers. @@ -2546,6 +2553,22 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) } } +static void addrconf_dad_run(struct inet6_dev *idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) { + spin_lock_bh(&ifp->lock); + if (!(ifp->flags & IFA_F_TENTATIVE)) { + spin_unlock_bh(&ifp->lock); + continue; + } + spin_unlock_bh(&ifp->lock); + addrconf_dad_kick(ifp); + } + read_unlock_bh(&idev->lock); +} + #ifdef CONFIG_PROC_FS struct if6_iter_state { int bucket; -- cgit v1.2.3 From 6b3ae80a63e47f6e97d68a1ddd520e3509e62821 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 21 Dec 2005 22:58:01 +0900 Subject: [IPV6]: Don't select a tentative address as a source address. A tentative address is not considered "assigned to an interface" in the traditional sense (RFC2462 Section 4). Don't try to select such an address for the source address. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/addrconf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f6ead6a843e..fd03c394436 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -908,11 +908,18 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, score.addr_type = __ipv6_addr_type(&ifa->addr); - /* Rule 0: Candidate Source Address (section 4) + /* Rule 0: + * - Tentative Address (RFC2462 section 5.4) + * - A tentative address is not considered + * "assigned to an interface" in the traditional + * sense. + * - Candidate Source Address (section 4) * - In any case, anycast addresses, multicast * addresses, and the unspecified address MUST * NOT be included in a candidate set. 
*/ + if (ifa->flags & IFA_F_TENTATIVE) + continue; if (unlikely(score.addr_type == IPV6_ADDR_ANY || score.addr_type & IPV6_ADDR_MULTICAST)) { LIMIT_NETDEBUG(KERN_DEBUG -- cgit v1.2.3 From 0d77d59f6293438f25e0560172699c0d3e4ef5ac Mon Sep 17 00:00:00 2001 From: Mika Kukkonen Date: Wed, 21 Dec 2005 18:38:26 -0800 Subject: [NETROM]: Fix three if-statements in nr_state1_machine() I found these while compiling with extra gcc warnings; considering the indenting surely they are not intentional? Signed-off-by: Mika Kukkonen Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller --- net/netrom/nr_in.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 004e8599b8f..a7d88b5ad75 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -99,7 +99,7 @@ static int nr_state1_machine(struct sock *sk, struct sk_buff *skb, break; case NR_RESET: - if (sysctl_netrom_reset_circuit); + if (sysctl_netrom_reset_circuit) nr_disconnect(sk, ECONNRESET); break; @@ -130,7 +130,7 @@ static int nr_state2_machine(struct sock *sk, struct sk_buff *skb, break; case NR_RESET: - if (sysctl_netrom_reset_circuit); + if (sysctl_netrom_reset_circuit) nr_disconnect(sk, ECONNRESET); break; @@ -265,7 +265,7 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype break; case NR_RESET: - if (sysctl_netrom_reset_circuit); + if (sysctl_netrom_reset_circuit) nr_disconnect(sk, ECONNRESET); break; -- cgit v1.2.3 From 7eb1b3d372a53fe9220b9e3b579886db0fe2f897 Mon Sep 17 00:00:00 2001 From: Mika Kukkonen Date: Wed, 21 Dec 2005 18:39:49 -0800 Subject: [VLAN]: Add two missing checks to vlan_ioctl_handler() In vlan_ioctl_handler() the code misses couple checks for error return values. Signed-off-by: Mika Kukkonen Signed-off-by: David S. Miller --- net/8021q/vlan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 91e412b0ab0..67465b65abe 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -753,6 +753,8 @@ static int vlan_ioctl_handler(void __user *arg) break; case GET_VLAN_REALDEV_NAME_CMD: err = vlan_dev_get_realdev_name(args.device1, args.u.device2); + if (err) + goto out; if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) { err = -EFAULT; @@ -761,6 +763,8 @@ static int vlan_ioctl_handler(void __user *arg) case GET_VLAN_VID_CMD: err = vlan_dev_get_vid(args.device1, &vid); + if (err) + goto out; args.u.VID = vid; if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) { @@ -774,7 +778,7 @@ static int vlan_ioctl_handler(void __user *arg) __FUNCTION__, args.cmd); return -EINVAL; }; - +out: return err; } -- cgit v1.2.3 From 1d1428045c54ef3d172d480806e2066dde0b4b76 Mon Sep 17 00:00:00 2001 From: Kristian Slavov Date: Wed, 21 Dec 2005 18:47:24 -0800 Subject: [IPV6]: Fix address deletion If you add more than one IPv6 address belonging to the same prefix and delete the address that was last added, routing table entry for that prefix is also deleted. Tested on 2.6.14.4 To reproduce: ip addr add 3ffe::1/64 dev eth0 ip addr add 3ffe::2/64 dev eth0 /* wait DAD */ sleep 1 ip addr del 3ffe::2/64 dev eth0 ip -6 route (route to 3ffe::/64 should be gone) In ipv6_del_addr(), if ifa == ifp, we set ifa->if_next to NULL, and later assign ifap = &ifa->if_next, effectively terminating the for-loop. This prevents us from checking if there are other addresses using the same prefix that are valid, and thus resulting in deletion of the prefix. 
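The mechanism described above is a common pointer-to-pointer list-walk pitfall: if the cursor is advanced through an entry that has just been unlinked (and whose next pointer has been cleared), the walk ends prematurely instead of examining the rest of the list. The fix moves the advance to the bottom of the loop and continues after the unlink. The same shape can be shown with generic nodes; the struct and values below are invented for illustration and are not the kernel's inet6_ifaddr chain.

/* Corrected pointer-to-pointer walk: advance the cursor only when the
 * current entry is kept; after unlinking, *pp already points at the
 * next entry, so just continue.
 */
#include <stdio.h>
#include <stdlib.h>

struct node {
        int val;
        struct node *next;
};

static void del_val(struct node **head, int val)
{
        struct node **pp = head;
        struct node *n;

        while ((n = *pp) != NULL) {
                if (n->val == val) {
                        *pp = n->next;   /* unlink */
                        free(n);
                        continue;        /* *pp is already the next node */
                }
                pp = &n->next;           /* keep this node, advance */
        }
}

static struct node *push(struct node *head, int val)
{
        struct node *n = malloc(sizeof(*n));

        n->val = val;
        n->next = head;
        return n;
}

int main(void)
{
        struct node *head = NULL, *n;

        for (int i = 0; i < 6; i++)
                head = push(head, i % 3);       /* list: 2 1 0 2 1 0 */
        del_val(&head, 2);
        for (n = head; n; n = n->next)
                printf("%d ", n->val);
        printf("\n");                           /* expect: 1 0 1 0 */
        return 0;
}

With the old placement of the increment the walk never got past the deleted entry, so the remaining addresses sharing the prefix were never checked.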
This applies only if the first entry in idev->addr_list is the address to be deleted. Signed-off-by: Kristian Slavov Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4ea8cf7c0cc..e717a034c95 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -634,8 +634,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) } #endif - for (ifap = &idev->addr_list; (ifa=*ifap) != NULL; - ifap = &ifa->if_next) { + for (ifap = &idev->addr_list; (ifa=*ifap) != NULL;) { if (ifa == ifp) { *ifap = ifa->if_next; __in6_ifa_put(ifp); @@ -643,6 +642,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) break; deleted = 1; + continue; } else if (ifp->flags & IFA_F_PERMANENT) { if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, ifp->prefix_len)) { @@ -666,6 +666,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) } } } + ifap = &ifa->if_next; } write_unlock_bh(&idev->lock); -- cgit v1.2.3 From 4c7e6895027362889422e5dc437dc3238b6b4745 Mon Sep 17 00:00:00 2001 From: Ian McDonald Date: Wed, 21 Dec 2005 19:02:39 -0800 Subject: [DCCP]: Comment typo I hope to actually change this behaviour shortly but this will help anybody grepping code at present. Signed-off-by: Ian McDonald Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ca03521112c..656e13e38cf 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1251,7 +1251,7 @@ static int dccp_v4_destroy_sock(struct sock *sk) struct dccp_sock *dp = dccp_sk(sk); /* - * DCCP doesn't use sk_qrite_queue, just sk_send_head + * DCCP doesn't use sk_write_queue, just sk_send_head * for retransmissions */ if (sk->sk_send_head != NULL) { -- cgit v1.2.3 From 9b78a82c1cf19aa813bdaa184fa840a3ba811750 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 22 Dec 2005 07:39:48 -0800 Subject: [IPSEC]: Fix policy updates missed by sockets The problem is that when new policies are inserted, sockets do not see the update (but all new route lookups do). This bug is related to the SA insertion stale route issue solved recently, and this policy visibility problem can be fixed in a similar way. The fix is to flush out the bundles of all policies deeper than the policy being inserted. Consider beginning state of "outgoing" direction policy list: policy A --> policy B --> policy C --> policy D First, realize that inserting a policy into a list only potentially changes IPSEC routes for that direction. Therefore we need not bother considering the policies for other directions. We need only consider the existing policies in the list we are doing the inserting. Consider new policy "B'", inserted after B. policy A --> policy B --> policy B' --> policy C --> policy D Two rules: 1) If policy A or policy B matched before the insertion, they appear before B' and thus would still match after inserting B' 2) Policy C and D, now "shadowed" and after policy B', potentially contain stale routes because policy B' might be selected instead of them. Therefore we only need flush routes assosciated with policies appearing after a newly inserted policy, if any. Signed-off-by: David S. 
Miller --- net/xfrm/xfrm_policy.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 54a4be6a7d2..d19e274b9c4 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -346,6 +346,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct xfrm_policy *pol, **p; struct xfrm_policy *delpol = NULL; struct xfrm_policy **newpos = NULL; + struct dst_entry *gc_list; write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) { @@ -381,9 +382,36 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_pol_hold(policy); write_unlock_bh(&xfrm_policy_lock); - if (delpol) { + if (delpol) xfrm_policy_kill(delpol); + + read_lock_bh(&xfrm_policy_lock); + gc_list = NULL; + for (policy = policy->next; policy; policy = policy->next) { + struct dst_entry *dst; + + write_lock(&policy->lock); + dst = policy->bundles; + if (dst) { + struct dst_entry *tail = dst; + while (tail->next) + tail = tail->next; + tail->next = gc_list; + gc_list = dst; + + policy->bundles = NULL; + } + write_unlock(&policy->lock); } + read_unlock_bh(&xfrm_policy_lock); + + while (gc_list) { + struct dst_entry *dst = gc_list; + + gc_list = dst->next; + dst_free(dst); + } + return 0; } EXPORT_SYMBOL(xfrm_policy_insert); -- cgit v1.2.3 From 3dd3bf83574e38578fc9741c0e23e4fa7f7ff96e Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Fri, 23 Dec 2005 11:23:21 -0800 Subject: [IPV6]: Fix dead lock. We need to relesae ifp->lock before we call addrconf_dad_stop(), which will hold ifp->lock. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 510220f2ae8..d805241e439 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2467,9 +2467,11 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) return; } - if (idev->if_flags & IF_READY) + if (idev->if_flags & IF_READY) { addrconf_dad_kick(ifp); - else { + spin_unlock_bh(&ifp->lock); + } else { + spin_unlock_bh(&ifp->lock); /* * If the defice is not ready: * - keep it tentative if it is a permanent address. @@ -2478,8 +2480,6 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) in6_ifa_hold(ifp); addrconf_dad_stop(ifp); } - - spin_unlock_bh(&ifp->lock); out: read_unlock_bh(&idev->lock); } -- cgit v1.2.3 From 291d809ba5c8d4d6d8812e3f185bdf57d539f594 Mon Sep 17 00:00:00 2001 From: Hiroyuki YAMAMORI Date: Fri, 23 Dec 2005 11:24:05 -0800 Subject: [IPV6]: Fix Temporary Address Generation From: Hiroyuki YAMAMORI Since regen_count is stored in the public address, we need to reset it when we start renewing temporary address. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d805241e439..2a6439e3c91 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2782,6 +2782,9 @@ restart: in6_ifa_hold(ifpub); spin_unlock(&ifp->lock); read_unlock(&addrconf_hash_lock); + spin_lock(&ifpub->lock); + ifpub->regen_count = 0; + spin_unlock(&ifpub->lock); ipv6_create_tempaddr(ifpub, ifp); in6_ifa_put(ifpub); in6_ifa_put(ifp); -- cgit v1.2.3 From 6f4353d891b5e477528cd0b996e0263fecdf5d5f Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Mon, 26 Dec 2005 17:03:46 -0800 Subject: [IPV6]: Increase default MLD_MAX_MSF to 64. The existing default of 10 is just way too low. Signed-off-by: David L Stevens Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index fd939da090c..057d8619ba1 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -170,7 +170,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, #define MLDV2_QQIC(value) MLDV2_EXP(0x80, 4, 3, value) #define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value) -#define IPV6_MLD_MAX_MSF 10 +#define IPV6_MLD_MAX_MSF 64 int sysctl_mld_max_msf = IPV6_MLD_MAX_MSF; -- cgit v1.2.3 From 79cac2a221ce18642550a13bed0f0203514923ea Mon Sep 17 00:00:00 2001 From: David Kimdon Date: Mon, 26 Dec 2005 17:27:10 -0800 Subject: [BR_NETFILTER]: Fix leak if skb traverses > 1 bridge Call nf_bridge_put() before allocating a new nf_bridge structure and potentially overwriting the pointer to a previously allocated one. This fixes a memory leak which can occur when the bridge topology allows for an skb to traverse more than one bridge. Signed-off-by: David Kimdon Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 43a0b35dfe6..23422bd53a5 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -369,6 +369,7 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) goto inhdr_error; + nf_bridge_put(skb->nf_bridge); if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) return NF_DROP; setup_pre_routing(skb); @@ -452,6 +453,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, skb->ip_summed = CHECKSUM_NONE; } + nf_bridge_put(skb->nf_bridge); if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) return NF_DROP; setup_pre_routing(skb); -- cgit v1.2.3 From 6732badee0dad467fcc9dd0168af8677b2b1bc2f Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 27 Dec 2005 13:35:15 -0800 Subject: [IPV6]: Fix addrconf dead lock. We need to release idev->lcok before we call addrconf_dad_stop(). It calls ipv6_addr_del(), which will hold idev->lock. Bug spotted by Yasuyuki KOZAKAI . Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2a6439e3c91..a60585fd85a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2467,11 +2467,9 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) return; } - if (idev->if_flags & IF_READY) { - addrconf_dad_kick(ifp); - spin_unlock_bh(&ifp->lock); - } else { + if (!(idev->if_flags & IF_READY)) { spin_unlock_bh(&ifp->lock); + read_unlock_bh(&idev->lock); /* * If the defice is not ready: * - keep it tentative if it is a permanent address. @@ -2479,7 +2477,10 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) */ in6_ifa_hold(ifp); addrconf_dad_stop(ifp); + return; } + addrconf_dad_kick(ifp); + spin_unlock_bh(&ifp->lock); out: read_unlock_bh(&idev->lock); } -- cgit v1.2.3 From 1b93ae64cabe5e28dd5a1f35f96f938ca4f6ae20 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Dec 2005 13:57:59 -0800 Subject: [NET]: Validate socket filters against BPF_MAXINSNS in one spot. Currently the checks are scattered all over and this leads to inconsistencies and even cases where the check is not made. Based upon a patch from Kris Katterjohn. Signed-off-by: David S. Miller --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 2841bfce29d..3a10e0bc90e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -293,7 +293,7 @@ int sk_chk_filter(struct sock_filter *filter, int flen) struct sock_filter *ftest; int pc; - if (((unsigned int)flen >= (~0U / sizeof(struct sock_filter))) || flen == 0) + if (flen == 0 || flen > BPF_MAXINSNS) return -EINVAL; /* check the filter code now */ @@ -360,7 +360,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) int err; /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS) + if (fprog->filter == NULL) return -EINVAL; fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); -- cgit v1.2.3 From 5ab4a6c81eb3dbe32361791d1535f9153f79b0ed Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Tue, 27 Dec 2005 14:03:00 -0800 Subject: [IPV6] mcast: Fix multiple issues in MLDv2 reports. The below "jumbo" patch fixes the following problems in MLDv2. 1) Add necessary "ntohs" to recent "pskb_may_pull" check [breaks all nonzero source queries on little-endian (!)] 2) Add locking to source filter list [resend of prior patch] 3) fix "mld_marksources()" to a) send nothing when all queried sources are excluded b) send full exclude report when source queried sources are not excluded c) don't schedule a timer when there's nothing to report NOTE: RFC 3810 specifies the source list should be saved and each source reported individually as an IS_IN. This is an obvious DOS path, requiring the host to store and then multicast as many sources as are queried (e.g., millions...). This alternative sends a full, relevant report that's limited to number of sources present on the machine. 4) fix "add_grec()" to send empty-source records when it should The original check doesn't account for a non-empty source list with all sources inactive; the new code keeps that short-circuit case, and also generates the group header with an empty list if needed. 5) fix mca_crcount decrement to be after add_grec(), which needs its original value These issues (other than item #1 ;-) ) were all found by Yan Zheng, much thanks! 
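Item 1 is a plain byte-order bug, and it is worth seeing why it only bites little-endian machines: the query's source count travels in network (big-endian) order, so using the raw field as a host integer gives the right number on big-endian hosts and a wildly inflated one everywhere else, which then makes the pskb_may_pull()-style length check fail for every query that lists sources. The stand-alone program below illustrates the difference; the value is made up, and the real field is mlh2->nsrcs.

/* Why the missing ntohs() only shows up on little-endian hosts: a
 * 16-bit count stored in network byte order must be converted before
 * it is used for length arithmetic.
 */
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint16_t nsrcs_wire = htons(3);   /* "3 sources" as seen on the wire */

        size_t wrong = nsrcs_wire * sizeof(struct in6_addr);
        size_t right = ntohs(nsrcs_wire) * sizeof(struct in6_addr);

        printf("raw field used directly : %zu bytes\n", wrong);
        printf("after ntohs()           : %zu bytes\n", right);
        /* On a little-endian host the first number is 768 * 16 = 12288,
         * far more than the 48 bytes actually carried, so a length
         * check using it would reject valid queries.  On big-endian
         * both lines print 48.
         */
        return 0;
}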
Signed-off-by: David L Stevens Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 140 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 110 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 057d8619ba1..f829a4ad3cc 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -224,6 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = MCAST_EXCLUDE; + mc_lst->sflock = RW_LOCK_UNLOCKED; mc_lst->sflist = NULL; /* @@ -360,6 +361,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, struct ip6_sf_socklist *psl; int i, j, rv; int leavegroup = 0; + int pmclocked = 0; int err; if (pgsr->gsr_group.ss_family != AF_INET6 || @@ -403,6 +405,9 @@ int ip6_mc_source(int add, int omode, struct sock *sk, pmc->sfmode = omode; } + write_lock_bh(&pmc->sflock); + pmclocked = 1; + psl = pmc->sflist; if (!add) { if (!psl) @@ -475,6 +480,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk, /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: + if (pmclocked) + write_unlock_bh(&pmc->sflock); read_unlock_bh(&ipv6_sk_mc_lock); read_unlock_bh(&idev->lock); in6_dev_put(idev); @@ -510,6 +517,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) dev = idev->dev; err = 0; + read_lock_bh(&ipv6_sk_mc_lock); + if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { leavegroup = 1; goto done; @@ -549,6 +558,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) newpsl = NULL; (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); } + + write_lock_bh(&pmc->sflock); psl = pmc->sflist; if (psl) { (void) ip6_mc_del_src(idev, group, pmc->sfmode, @@ -558,8 +569,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sflist = newpsl; pmc->sfmode = gsf->gf_fmode; + write_unlock_bh(&pmc->sflock); err = 0; done: + read_unlock_bh(&ipv6_sk_mc_lock); read_unlock_bh(&idev->lock); in6_dev_put(idev); dev_put(dev); @@ -592,6 +605,11 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, dev = idev->dev; err = -EADDRNOTAVAIL; + /* + * changes to the ipv6_mc_list require the socket lock and + * a read lock on ip6_sk_mc_lock. We have the socket lock, + * so reading the list is safe. + */ for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { if (pmc->ifindex != gsf->gf_interface) @@ -614,6 +632,10 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { return -EFAULT; } + /* changes to psl require the socket lock, a read lock on + * on ipv6_sk_mc_lock and a write lock on pmc->sflock. We + * have the socket lock, so reading here is safe. 
+ */ for (i=0; isflock); psl = mc->sflist; if (!psl) { rv = mc->sfmode == MCAST_EXCLUDE; @@ -665,6 +688,7 @@ int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr, if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) rv = 0; } + read_unlock(&mc->sflock); read_unlock(&ipv6_sk_mc_lock); return rv; @@ -1068,7 +1092,8 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) ma->mca_flags |= MAF_TIMER_RUNNING; } -static void mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, +/* mark EXCLUDE-mode sources */ +static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, struct in6_addr *srcs) { struct ip6_sf_list *psf; @@ -1078,13 +1103,53 @@ static void mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { if (scount == nsrcs) break; - for (i=0; imca_sfcount[MCAST_INCLUDE] || + pmc->mca_sfcount[MCAST_EXCLUDE] != + psf->sf_count[MCAST_EXCLUDE]) + continue; + if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { + scount++; + break; + } + } + } + pmc->mca_flags &= ~MAF_GSQUERY; + if (scount == nsrcs) /* all sources excluded */ + return 0; + return 1; +} + +static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, + struct in6_addr *srcs) +{ + struct ip6_sf_list *psf; + int i, scount; + + if (pmc->mca_sfmode == MCAST_EXCLUDE) + return mld_xmarksources(pmc, nsrcs, srcs); + + /* mark INCLUDE-mode sources */ + + scount = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; isf_addr)) { psf->sf_gsresp = 1; scount++; break; } + } + } + if (!scount) { + pmc->mca_flags &= ~MAF_GSQUERY; + return 0; } + pmc->mca_flags |= MAF_GSQUERY; + return 1; } int igmp6_event_query(struct sk_buff *skb) @@ -1167,7 +1232,7 @@ int igmp6_event_query(struct sk_buff *skb) /* mark sources to include, if group & source-specific */ if (mlh2->nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + - mlh2->nsrcs * sizeof(struct in6_addr))) { + ntohs(mlh2->nsrcs) * sizeof(struct in6_addr))) { in6_dev_put(idev); return -EINVAL; } @@ -1203,10 +1268,9 @@ int igmp6_event_query(struct sk_buff *skb) else ma->mca_flags &= ~MAF_GSQUERY; } - if (ma->mca_flags & MAF_GSQUERY) - mld_marksources(ma, ntohs(mlh2->nsrcs), - mlh2->srcs); - igmp6_group_queried(ma, max_delay); + if (!(ma->mca_flags & MAF_GSQUERY) || + mld_marksources(ma, ntohs(mlh2->nsrcs), mlh2->srcs)) + igmp6_group_queried(ma, max_delay); spin_unlock_bh(&ma->mca_lock); if (group_type != IPV6_ADDR_ANY) break; @@ -1281,7 +1345,18 @@ static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, case MLD2_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return 0; - return !((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp); + if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { + if (pmc->mca_sfmode == MCAST_INCLUDE) + return 1; + /* don't include if this source is excluded + * in all filters + */ + if (psf->sf_count[MCAST_INCLUDE]) + return 0; + return pmc->mca_sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + } + return 0; case MLD2_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return 0; @@ -1450,7 +1525,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_report *pmr; struct mld2_grec *pgr = NULL; struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; - int scount, first, isquery, truncate; + int scount, stotal, first, isquery, truncate; if (pmc->mca_flags & MAF_NOREPORT) return skb; @@ -1460,25 +1535,13 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, truncate = type 
== MLD2_MODE_IS_EXCLUDE || type == MLD2_CHANGE_TO_EXCLUDE; + stotal = scount = 0; + psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; - if (!*psf_list) { - if (type == MLD2_ALLOW_NEW_SOURCES || - type == MLD2_BLOCK_OLD_SOURCES) - return skb; - if (pmc->mca_crcount || isquery) { - /* make sure we have room for group header and at - * least one source. - */ - if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)+ - sizeof(struct in6_addr)) { - mld_sendpack(skb); - skb = NULL; /* add_grhead will get a new one */ - } - skb = add_grhead(skb, pmc, type, &pgr); - } - return skb; - } + if (!*psf_list) + goto empty_source; + pmr = skb ? (struct mld2_report *)skb->h.raw : NULL; /* EX and TO_EX get a fresh packet, if needed */ @@ -1491,7 +1554,6 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } } first = 1; - scount = 0; psf_prev = NULL; for (psf=*psf_list; psf; psf=psf_next) { struct in6_addr *psrc; @@ -1525,7 +1587,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc)); *psrc = psf->sf_addr; - scount++; + scount++; stotal++; if ((type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { psf->sf_crcount--; @@ -1540,6 +1602,21 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } psf_prev = psf; } + +empty_source: + if (!stotal) { + if (type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) + return skb; + if (pmc->mca_crcount || isquery) { + /* make sure we have room for group header */ + if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { + mld_sendpack(skb); + skb = NULL; /* add_grhead will get a new one */ + } + skb = add_grhead(skb, pmc, type, &pgr); + } + } if (pgr) pgr->grec_nsrcs = htons(scount); @@ -1621,11 +1698,11 @@ static void mld_send_cr(struct inet6_dev *idev) skb = add_grec(skb, pmc, dtype, 1, 1); } if (pmc->mca_crcount) { - pmc->mca_crcount--; if (pmc->mca_sfmode == MCAST_EXCLUDE) { type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0); } + pmc->mca_crcount--; if (pmc->mca_crcount == 0) { mld_clear_zeros(&pmc->mca_tomb); mld_clear_zeros(&pmc->mca_sources); @@ -1659,12 +1736,12 @@ static void mld_send_cr(struct inet6_dev *idev) /* filter mode changes */ if (pmc->mca_crcount) { - pmc->mca_crcount--; if (pmc->mca_sfmode == MCAST_EXCLUDE) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); + pmc->mca_crcount--; } spin_unlock_bh(&pmc->mca_lock); } @@ -2023,6 +2100,9 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, { int err; + /* callers have the socket lock and a write lock on ipv6_sk_mc_lock, + * so no other readers or writers of iml or its sflist + */ if (iml->sflist == 0) { /* any-source empty exclude case */ return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); -- cgit v1.2.3
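The locking added in the MLDv2 patch above follows a simple nesting rule: updates to a group's source filter (ip6_mc_source, ip6_mc_msfilter) take the outer ipv6_sk_mc_lock for reading plus the new per-group sflock for writing, while lookups such as inet6_mc_check() take both for reading. As a rough user-space analogy only, using pthread rwlocks and invented names rather than the kernel's code, the same discipline looks like this:

/* Rough analogy for the nesting: an outer lock protecting the group
 * list and an inner per-group lock protecting its source filter.
 * Writers of the filter take the outer lock shared plus the inner lock
 * exclusive; readers take both shared.  Names are invented.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mc_list_lock = PTHREAD_RWLOCK_INITIALIZER; /* ~ipv6_sk_mc_lock */
static pthread_rwlock_t sf_lock      = PTHREAD_RWLOCK_INITIALIZER; /* ~pmc->sflock */
static int source_count;                                           /* ~psl->sl_count */

static void add_source(void)
{
        pthread_rwlock_rdlock(&mc_list_lock);  /* group may not disappear */
        pthread_rwlock_wrlock(&sf_lock);       /* filter is being rewritten */
        source_count++;
        pthread_rwlock_unlock(&sf_lock);
        pthread_rwlock_unlock(&mc_list_lock);
}

static int check_source(void)
{
        int n;

        pthread_rwlock_rdlock(&mc_list_lock);
        pthread_rwlock_rdlock(&sf_lock);       /* stable view of the filter */
        n = source_count;
        pthread_rwlock_unlock(&sf_lock);
        pthread_rwlock_unlock(&mc_list_lock);
        return n;
}

int main(void)
{
        add_source();
        add_source();
        printf("sources: %d\n", check_source());
        return 0;
}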