From 8d3a564da34e5844aca4f991b73f8ca512246b23 Mon Sep 17 00:00:00 2001 From: Doug Leith Date: Tue, 9 Dec 2008 00:13:04 -0800 Subject: tcp: tcp_vegas cong avoid fix This patch addresses a book-keeping issue in tcp_vegas.c. At present tcp_vegas does separate book-keeping of cwnd based on packet sequence numbers. A mismatch can develop between this book-keeping and tp->snd_cwnd due, for example, to delayed acks acking multiple packets. When vegas transitions to reno operation (e.g. following loss), then this mismatch leads to incorrect behaviour (akin to a cwnd backoff). This seems mostly to affect operation at low cwnds where delayed acking can lead to a significant fraction of cwnd being covered by a single ack, leading to the book-keeping mismatch. This patch modifies the congestion avoidance update to avoid the need for separate book-keeping while leaving vegas congestion avoidance functionally unchanged. A secondary advantage of this modification is that the use of fixed-point (via V_PARAM_SHIFT) and 64 bit arithmetic is no longer necessary, simplifying the code. Some example test measurements with the patched code (confirming no functional change in the congestion avoidance algorithm) can be seen at: http://www.hamilton.ie/doug/vegaspatch/ Signed-off-by: Doug Leith Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 80 +++++++--------------------------------------------- 1 file changed, 10 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 7cd22262de3..a453aac91bd 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -40,18 +40,14 @@ #include "tcp_vegas.h" -/* Default values of the Vegas variables, in fixed-point representation - * with V_PARAM_SHIFT bits to the right of the binary point. - */ -#define V_PARAM_SHIFT 1 -static int alpha = 2<beg_snd_nxt)) { /* Do the Vegas once-per-RTT cwnd adjustment. */ - u32 old_wnd, old_snd_cwnd; - - - /* Here old_wnd is essentially the window of data that was - * sent during the previous RTT, and has all - * been acknowledged in the course of the RTT that ended - * with the ACK we just received. Likewise, old_snd_cwnd - * is the cwnd during the previous RTT. - */ - old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / - tp->mss_cache; - old_snd_cwnd = vegas->beg_snd_cwnd; /* Save the extent of the current window so we can use this * at the end of the next RTT. */ - vegas->beg_snd_una = vegas->beg_snd_nxt; vegas->beg_snd_nxt = tp->snd_nxt; - vegas->beg_snd_cwnd = tp->snd_cwnd; /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got @@ -252,22 +212,14 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * * This is: * (actual rate in segments) * baseRTT - * We keep it as a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary point. */ - target_cwnd = ((u64)old_wnd * vegas->baseRTT); - target_cwnd <<= V_PARAM_SHIFT; - do_div(target_cwnd, rtt); + target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt; /* Calculate the difference between the window we had, * and the window we would like to have. This quantity * is the "Diff" from the Arizona Vegas papers. - * - * Again, this is a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary - * point. */ - diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; if (diff > gamma && tp->snd_ssthresh > 2 ) { /* Going too fast. Time to slow down @@ -282,16 +234,13 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * truncation robs us of full link * utilization. */ - tp->snd_cwnd = min(tp->snd_cwnd, - ((u32)target_cwnd >> - V_PARAM_SHIFT)+1); + tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); } else if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ tcp_slow_start(tp); } else { /* Congestion avoidance. */ - u32 next_snd_cwnd; /* Figure out where we would like cwnd * to be. @@ -300,26 +249,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) /* The old window was too fast, so * we slow down. */ - next_snd_cwnd = old_snd_cwnd - 1; + tp->snd_cwnd--; } else if (diff < alpha) { /* We don't have enough extra packets * in the network, so speed up. */ - next_snd_cwnd = old_snd_cwnd + 1; + tp->snd_cwnd++; } else { /* Sending just as fast as we * should be. */ - next_snd_cwnd = old_snd_cwnd; } - - /* Adjust cwnd upward or downward, toward the - * desired value. - */ - if (next_snd_cwnd > tp->snd_cwnd) - tp->snd_cwnd++; - else if (next_snd_cwnd < tp->snd_cwnd) - tp->snd_cwnd--; } if (tp->snd_cwnd < 2) -- cgit v1.2.3 From 24fc7b86dc0470616803be2f921c8cd5c459175d Mon Sep 17 00:00:00 2001 From: Jan Sembera Date: Tue, 9 Dec 2008 15:48:32 -0800 Subject: ipv6: silence log messages for locally generated multicast This patch fixes minor annoyance during transmission of unsolicited neighbor advertisements from userspace to multicast addresses (as far as I can see in RFC, this is allowed and the similar functionality for IPv4 has been in arping for a long time). Outgoing multicast packets get reinserted into local processing as if they are received from the network. The machine thus sees its own NA and fills the logs with error messages. This patch removes the message if NA has been generated locally. Signed-off-by: Jan Sembera Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 172438320ee..d0f54d18e19 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -912,8 +912,13 @@ static void ndisc_recv_na(struct sk_buff *skb) is invalid, but ndisc specs say nothing about it. It could be misconfiguration, or an smart proxy agent tries to help us :-) + + We should not print the error if NA has been + received from loopback - it is just our own + unsolicited advertisement. */ - ND_PRINTK1(KERN_WARNING + if (skb->pkt_type != PACKET_LOOPBACK) + ND_PRINTK1(KERN_WARNING "ICMPv6 NA: someone advertises our address on %s!\n", ifp->idev->dev->name); in6_ifa_put(ifp); -- cgit v1.2.3 From 7b363e440021a1cf9ed76944b2685f48dacefb3e Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 9 Dec 2008 23:22:26 -0800 Subject: netpoll: fix race on poll_list resulting in garbage entry A few months back a race was discused between the netpoll napi service path, and the fast path through net_rx_action: http://kerneltrap.org/mailarchive/linux-netdev/2007/10/16/345470 A patch was submitted for that bug, but I think we missed a case. Consider the following scenario: INITIAL STATE CPU0 has one napi_struct A on its poll_list CPU1 is calling netpoll_send_skb and needs to call poll_napi on the same napi_struct A that CPU0 has on its list CPU0 CPU1 net_rx_action poll_napi !list_empty (returns true) locks poll_lock for A poll_one_napi napi->poll netif_rx_complete __napi_complete (removes A from poll_list) list_entry(list->next) In the above scenario, net_rx_action assumes that the per-cpu poll_list is exclusive to that cpu. netpoll of course violates that, and because the netpoll path can dequeue from the poll list, its possible for CPU0 to detect a non-empty list at the top of the while loop in net_rx_action, but have it become empty by the time it calls list_entry. Since the poll_list isn't surrounded by any other structure, the returned data from that list_entry call in this situation is garbage, and any number of crashes can result based on what exactly that garbage is. Given that its not fasible for performance reasons to place exclusive locks arround each cpus poll list to provide that mutal exclusion, I think the best solution is modify the netpoll path in such a way that we continue to guarantee that the poll_list for a cpu is in fact exclusive to that cpu. To do this I've implemented the patch below. It adds an additional bit to the state field in the napi_struct. When executing napi->poll from the netpoll_path, this bit will be set. When a driver calls netif_rx_complete, if that bit is set, it will not remove the napi_struct from the poll_list. That work will be saved for the next iteration of net_rx_action. I've tested this and it seems to work well. About the biggest drawback I can see to it is the fact that it might result in an extra loop through net_rx_action in the event that the device is actually contended for (i.e. the netpoll path actually preforms all the needed work no the device, and the call to net_rx_action winds up doing nothing, except removing the napi_struct from the poll_list. However I think this is probably a small price to pay, given that the alternative is a crash. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netpoll.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 6c7af390be0..dadac6281f2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -133,9 +133,11 @@ static int poll_one_napi(struct netpoll_info *npinfo, npinfo->rx_flags |= NETPOLL_RX_DROP; atomic_inc(&trapped); + set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); npinfo->rx_flags &= ~NETPOLL_RX_DROP; -- cgit v1.2.3 From ec8f2375d7584969501918651241f91eca2a6ad3 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 11 Dec 2008 21:31:50 -0800 Subject: netlabel: Compiler warning and NULL pointer dereference fix Fix the two compiler warnings show below. Thanks to Geert Uytterhoeven for finding and reporting the problem. net/netlabel/netlabel_unlabeled.c:567: warning: 'entry' may be used uninitialized in this function net/netlabel/netlabel_unlabeled.c:629: warning: 'entry' may be used uninitialized in this function Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- net/netlabel/netlabel_unlabeled.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 90c8506a0aa..8c030803217 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -562,7 +562,6 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, const struct in_addr *mask, struct netlbl_audit *audit_info) { - int ret_val = 0; struct netlbl_af4list *list_entry; struct netlbl_unlhsh_addr4 *entry; struct audit_buffer *audit_buf; @@ -577,7 +576,7 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, if (list_entry != NULL) entry = netlbl_unlhsh_addr4_entry(list_entry); else - ret_val = -ENOENT; + entry = NULL; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); @@ -588,19 +587,21 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, addr->s_addr, mask->s_addr); if (dev != NULL) dev_put(dev); - if (entry && security_secid_to_secctx(entry->secid, - &secctx, - &secctx_len) == 0) { + if (entry != NULL && + security_secid_to_secctx(entry->secid, + &secctx, &secctx_len) == 0) { audit_log_format(audit_buf, " sec_obj=%s", secctx); security_release_secctx(secctx, secctx_len); } - audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); + audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); } - if (ret_val == 0) - call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4); - return ret_val; + if (entry == NULL) + return -ENOENT; + + call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4); + return 0; } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) @@ -624,7 +625,6 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, const struct in6_addr *mask, struct netlbl_audit *audit_info) { - int ret_val = 0; struct netlbl_af6list *list_entry; struct netlbl_unlhsh_addr6 *entry; struct audit_buffer *audit_buf; @@ -638,7 +638,7 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, if (list_entry != NULL) entry = netlbl_unlhsh_addr6_entry(list_entry); else - ret_val = -ENOENT; + entry = NULL; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); @@ -649,19 +649,21 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, addr, mask); if (dev != NULL) dev_put(dev); - if (entry && security_secid_to_secctx(entry->secid, - &secctx, - &secctx_len) == 0) { + if (entry != NULL && + security_secid_to_secctx(entry->secid, + &secctx, &secctx_len) == 0) { audit_log_format(audit_buf, " sec_obj=%s", secctx); security_release_secctx(secctx, secctx_len); } - audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); + audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); } - if (ret_val == 0) - call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6); - return ret_val; + if (entry == NULL) + return -ENOENT; + + call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6); + return 0; } #endif /* IPv6 */ -- cgit v1.2.3 From be70ed189bc0d16e1609a1c6c04ec9418b4dd11a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 15 Dec 2008 00:19:14 -0800 Subject: netfilter: update rwlock initialization for nat_table The commit e099a173573ce1ba171092aee7bb3c72ea686e59 (netfilter: netns nat: per-netns NAT table) renamed the nat_table from __nat_table to nat_table without updating the __RW_LOCK_UNLOCKED(__nat_table.lock). Signed-off-by: Steven Rostedt Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_nat_rule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index bea54a68510..8d489e746b2 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -61,7 +61,7 @@ static struct static struct xt_table nat_table = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(__nat_table.lock), + .lock = __RW_LOCK_UNLOCKED(nat_table.lock), .me = THIS_MODULE, .af = AF_INET, }; -- cgit v1.2.3 From eb9b851b980e20ba5f6bdfd6ec24f4bc77623ce6 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 15 Dec 2008 00:39:17 -0800 Subject: SCHED: netem: Correct documentation comment in code. The netem simulator is no longer limited by Linux timer resolution HZ. Not since Patrick McHardy changed the QoS system to use hrtimer. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index a11959908d9..98402f0efa4 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -46,9 +46,6 @@ layering other disciplines. It does not need to do bandwidth control either since that can be handled by using token bucket or other rate control. - - The simulator is limited by the Linux timer resolution - and will create packet bursts on the HZ boundary (1ms). */ struct netem_sched_data { -- cgit v1.2.3 From 4798a2b84ea5a98e4f36a815a646cb48ff521684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Mon, 15 Dec 2008 00:53:57 -0800 Subject: Phonet: keep TX queue disabled when the device is off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: RĂ©mi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/pep-gprs.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c index 9978afbd9f2..803eeef0aa8 100644 --- a/net/phonet/pep-gprs.c +++ b/net/phonet/pep-gprs.c @@ -155,12 +155,13 @@ static void gprs_data_ready(struct sock *sk, int len) static void gprs_write_space(struct sock *sk) { struct gprs_dev *dev = sk->sk_user_data; + struct net_device *net = dev->net; unsigned credits = pep_writeable(sk); spin_lock_bh(&dev->tx_lock); dev->tx_max = credits; - if (credits > skb_queue_len(&dev->tx_queue)) - netif_wake_queue(dev->net); + if (credits > skb_queue_len(&dev->tx_queue) && netif_running(net)) + netif_wake_queue(net); spin_unlock_bh(&dev->tx_lock); } @@ -168,6 +169,23 @@ static void gprs_write_space(struct sock *sk) * Network device callbacks */ +static int gprs_open(struct net_device *dev) +{ + struct gprs_dev *gp = netdev_priv(dev); + + gprs_write_space(gp->sk); + return 0; +} + +static int gprs_close(struct net_device *dev) +{ + struct gprs_dev *gp = netdev_priv(dev); + + netif_stop_queue(dev); + flush_work(&gp->tx_work); + return 0; +} + static int gprs_xmit(struct sk_buff *skb, struct net_device *net) { struct gprs_dev *dev = netdev_priv(net); @@ -254,6 +272,8 @@ static void gprs_setup(struct net_device *net) net->tx_queue_len = 10; net->destructor = free_netdev; + net->open = gprs_open; + net->stop = gprs_close; net->hard_start_xmit = gprs_xmit; /* mandatory */ net->change_mtu = gprs_set_mtu; net->get_stats = gprs_get_stats; @@ -318,7 +338,6 @@ int gprs_attach(struct sock *sk) dev->sk = sk; printk(KERN_DEBUG"%s: attached\n", net->name); - gprs_write_space(sk); /* kick off TX */ return net->ifindex; out_rel: @@ -341,7 +360,5 @@ void gprs_detach(struct sock *sk) printk(KERN_DEBUG"%s: detached\n", net->name); unregister_netdev(net); - flush_scheduled_work(); sock_put(sk); - skb_queue_purge(&dev->tx_queue); } -- cgit v1.2.3