From 0c6ce78abf6e228d44c3840edb8a4ae0c1299825 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 28 Oct 2008 16:09:23 -0700 Subject: net: replace uses of NIP6_FMT with %p6 Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d77c0d29e23..191c06bb0f6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2346,9 +2346,9 @@ static void DBGUNDO(struct sock *sk, const char *msg) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - printk(KERN_DEBUG "Undo %s " NIP6_FMT "/%u c%u l%u ss%u/%u p%u\n", + printk(KERN_DEBUG "Undo %s %p6/%u c%u l%u ss%u/%u p%u\n", msg, - NIP6(np->daddr), ntohs(inet->dport), + &np->daddr, ntohs(inet->dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); -- cgit v1.2.3 From 5b095d98928fdb9e3b75be20a54b7a6cbf6ca9ad Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 29 Oct 2008 12:52:50 -0700 Subject: net: replace %p6 with %pI6 Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 191c06bb0f6..04909e4b3c4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2346,7 +2346,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - printk(KERN_DEBUG "Undo %s %p6/%u c%u l%u ss%u/%u p%u\n", + printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", msg, &np->daddr, ntohs(inet->dport), tp->snd_cwnd, tcp_left_out(tp), -- cgit v1.2.3 From 673d57e72398edfedc93fb50ff58048077c9d587 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 31 Oct 2008 00:53:57 -0700 Subject: net: replace NIPQUAD() in net/ipv4/ net/ipv6/ Using NIPQUAD() with NIPQUAD_FMT, %d.%d.%d.%d or %u.%u.%u.%u can be replaced with %pI4 Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 04909e4b3c4..097294b7da3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2336,9 +2336,9 @@ static void DBGUNDO(struct sock *sk, const char *msg) struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { - printk(KERN_DEBUG "Undo %s " NIPQUAD_FMT "/%u c%u l%u ss%u/%u p%u\n", + printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", msg, - NIPQUAD(inet->daddr), ntohs(inet->dport), + &inet->daddr, ntohs(inet->dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); -- cgit v1.2.3 From e1aa680fa40e7492260a09cb57d94002245cc8fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:11:55 -0800 Subject: tcp: move tcp_simple_retransmit to tcp_input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 097294b7da3..8085704863f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1002,7 +1002,8 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) } } -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, + struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); @@ -2559,6 +2560,56 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } +/* Do a simple retransmit without using the backoff mechanisms in + * tcp_timer. This is used for path mtu discovery. + * The socket is already locked here. + */ +void tcp_simple_retransmit(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + unsigned int mss = tcp_current_mss(sk, 0); + u32 prior_lost = tp->lost_out; + + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + if (skb->len > mss && + !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + } + tcp_skb_mark_lost_uncond_verify(tp, skb); + } + } + + tcp_clear_retrans_hints_partial(tp); + + if (prior_lost == tp->lost_out) + return; + + if (tcp_is_reno(tp)) + tcp_limit_reno_sacked(tp); + + tcp_verify_left_out(tp); + + /* Don't muck with the congestion window here. + * Reason is that we do not increase amount of _data_ + * in network, but units changed and effective + * cwnd/ssthresh really reduced now. + */ + if (icsk->icsk_ca_state != TCP_CA_Loss) { + tp->high_seq = tp->snd_nxt; + tp->snd_ssthresh = tcp_current_ssthresh(sk); + tp->prior_ssthresh = 0; + tp->undo_marker = 0; + tcp_set_ca_state(sk, TCP_CA_Loss); + } + tcp_xmit_retransmit_queue(sk); +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and -- cgit v1.2.3 From e8bae275d9354104f7ae24a48a90d1a6286e7bd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:12:28 -0800 Subject: tcp: more aggressive skipping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I knew already when rewriting the sacktag that this condition was too conservative, change it now since it prevent lot of useless work (especially in the sack shifter decision code that is being added by a later patch). This shouldn't change anything really, just save some processing regardless of the shifter. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8085704863f..3f26599ddc8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1402,7 +1402,7 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, if (skb == tcp_send_head(sk)) break; - if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) + if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) break; *fack_count += tcp_skb_pcount(skb); -- cgit v1.2.3 From adb92db857ee2a0a2b925ccfbd560203c3f88aae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:13:50 -0800 Subject: tcp: Make SACK code to split only at mss boundaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sadly enough, this adds possible divide though we try to avoid it by checking one mss as common case. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3f26599ddc8..ca46eb9151f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1248,20 +1248,39 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, { int in_sack, err; unsigned int pkt_len; + unsigned int mss; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); if (tcp_skb_pcount(skb) > 1 && !in_sack && after(TCP_SKB_CB(skb)->end_seq, start_seq)) { - + mss = tcp_skb_mss(skb); in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); - if (!in_sack) + if (!in_sack) { pkt_len = start_seq - TCP_SKB_CB(skb)->seq; - else + if (pkt_len < mss) + pkt_len = mss; + } else { pkt_len = end_seq - TCP_SKB_CB(skb)->seq; - err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size); + if (pkt_len < mss) + return -EINVAL; + } + + /* Round if necessary so that SACKs cover only full MSSes + * and/or the remaining small portion (if present) + */ + if (pkt_len > mss) { + unsigned int new_len = (pkt_len / mss) * mss; + if (!in_sack && new_len < pkt_len) { + new_len += mss; + if (new_len > skb->len) + return 0; + } + pkt_len = new_len; + } + err = tcp_fragment(sk, skb, pkt_len, mss); if (err < 0) return err; } -- cgit v1.2.3 From f58b22fd3c16444edc393a217a74208f1894b601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:14:43 -0800 Subject: tcp: make tcp_sacktag_one able to handle partial skb too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is preparatory work for SACK combiner patch which may have to count TCP state changes for only a part of the skb because it will intentionally avoids splitting skb to SACKed and not sacked parts. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ca46eb9151f..3c8e297e2c3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1289,7 +1289,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, } static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, - int *reord, int dup_sack, int fack_count) + int *reord, int dup_sack, int fack_count, + u8 *sackedto, int pcount) { struct tcp_sock *tp = tcp_sk(sk); u8 sacked = TCP_SKB_CB(skb)->sacked; @@ -1314,10 +1315,9 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, * that retransmission is still in flight. */ if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= - ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); - tp->lost_out -= tcp_skb_pcount(skb); - tp->retrans_out -= tcp_skb_pcount(skb); + *sackedto &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out -= pcount; + tp->retrans_out -= pcount; } } else { if (!(sacked & TCPCB_RETRANS)) { @@ -1334,22 +1334,22 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, } if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - tp->lost_out -= tcp_skb_pcount(skb); + *sackedto &= ~TCPCB_LOST; + tp->lost_out -= pcount; } } - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + *sackedto |= TCPCB_SACKED_ACKED; flag |= FLAG_DATA_SACKED; - tp->sacked_out += tcp_skb_pcount(skb); + tp->sacked_out += pcount; - fack_count += tcp_skb_pcount(skb); + fack_count += pcount; /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) - tp->lost_cnt_hint += tcp_skb_pcount(skb); + tp->lost_cnt_hint += pcount; if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; @@ -1362,9 +1362,9 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, * frames and clear it. undo_retrans is decreased above, L|R frames * are accounted above as well. */ - if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); + if (dup_sack && (*sackedto & TCPCB_SACKED_RETRANS)) { + *sackedto &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= pcount; } return flag; @@ -1404,7 +1404,9 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, if (in_sack) *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, - *fack_count); + *fack_count, + &(TCP_SKB_CB(skb)->sacked), + tcp_skb_pcount(skb)); *fack_count += tcp_skb_pcount(skb); } -- cgit v1.2.3 From 832d11c5cd076abc0aa1eaf7be96c81d1a59ce41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:20:15 -0800 Subject: tcp: Try to restore large SKBs while SACK processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During SACK processing, most of the benefits of TSO are eaten by the SACK blocks that one-by-one fragment SKBs to MSS sized chunks. Then we're in problems when cleanup work for them has to be done when a large cumulative ACK comes. Try to return back to pre-split state already while more and more SACK info gets discovered by combining newly discovered SACK areas with the previous skb if that's SACKed as well. 
This approach has a number of benefits: 1) The processing overhead is spread more equally over the RTT 2) Write queue has less skbs to process (affect everything which has to walk in the queue past the sacked areas) 3) Write queue is consistent whole the time, so no other parts of TCP has to be aware of this (this was not the case with some other approach that was, well, quite intrusive all around). 4) Clean_rtx_queue can release most of the pages using single put_page instead of previous PAGE_SIZE/mss+1 calls In case a hole is fully filled by the new SACK block, we attempt to combine the next skb too which allows construction of skbs that are even larger than what tso split them to and it handles hole per on every nth patterns that often occur during slow start overshoot pretty nicely. Though this to be really useful also a retransmission would have to get lost since cumulative ACKs advance one hole at a time in the most typical case. TODO: handle upwards only merging. That should be rather easy when segment is fully sacked but I'm leaving that as future work item (it won't make very large difference anyway since this current approach already covers quite a lot of normal cases). I was earlier thinking of some sophisticated way of tracking timestamps of the first and the last segment but later on realized that it won't be that necessary at all to store the timestamp of the last segment. The cases that can occur are basically either: 1) ambiguous => no sensible measurement can be taken anyway 2) non-ambiguous is due to reordering => having the timestamp of the last segment there is just skewing things more off than does some good since the ack got triggered by one of the holes (besides some substle issues that would make determining right hole/skb even harder problem). Anyway, it has nothing to do with this change then. I choose to route some abnormal looking cases with goto noop, some could be handled differently (eg., by stopping the walking at that skb but again). In general, they either shouldn't happen at all or are rare enough to make no difference in practice. In theory this change (as whole) could cause some macroscale regression (global) because of cache misses that are taken over the round-trip time but it gets very likely better because of much less (local) cache misses per other write queue walkers and the big recovery clearing cumulative ack. Worth to note that these benefits would be very easy to get also without TSO/GSO being on as long as the data is in pages so that we can merge them. Currently I won't let that happen because DSACK splitting at fragment that would mess up pcounts due to sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets avoided, we have some conditions that can be made less strict. TODO: I will probably have to convert the excessive pointer passing to struct sacktag_state... :-) My testing revealed that considerable amount of skbs couldn't be shifted because they were cloned (most likely still awaiting tx reclaim)... 
[The rest is considering future work instead since I got repeatably EFAULT to tcpdump's recvfrom when I added pskb_expand_head to deal with clones, so I separated that into another, later patch] ...To counter that, I gave up on the fifth advantage: 5) When growing previous SACK block, less allocs for new skbs are done, basically a new alloc is needed only when new hole is detected and when the previous skb runs out of frags space ...which now only happens of if reclaim is fast enough to dispose the clone before the SACK block comes in (the window is RTT long), otherwise we'll have to alloc some. With clones being handled I got these numbers (will be somewhat worse without that), taken with fine-grained mibs: TCPSackShifted 398 TCPSackMerged 877 TCPSackShiftFallback 320 TCPSACKCOLLAPSEFALLBACKGSO 0 TCPSACKCOLLAPSEFALLBACKSKBBITS 0 TCPSACKCOLLAPSEFALLBACKSKBDATA 0 TCPSACKCOLLAPSEFALLBACKBELOW 0 TCPSACKCOLLAPSEFALLBACKFIRST 1 TCPSACKCOLLAPSEFALLBACKPREVBITS 318 TCPSACKCOLLAPSEFALLBACKMSS 1 TCPSACKCOLLAPSEFALLBACKNOHEAD 0 TCPSACKCOLLAPSEFALLBACKSHIFT 0 TCPSACKCOLLAPSENOOPSEQ 0 TCPSACKCOLLAPSENOOPSMALLPCOUNT 0 TCPSACKCOLLAPSENOOPSMALLLEN 0 TCPSACKCOLLAPSEHOLE 12 Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 256 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 249 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3c8e297e2c3..97d57676b8e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, * aligned portion of it that matches. Therefore we might need to fragment * which may fail and creates some hassle (caller must handle error case * returns). + * + * FIXME: this could be merged to shift decision code */ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, u32 start_seq, u32 end_seq) @@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; - - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - tcp_advance_highest_sack(sk, skb); } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, return flag; } +static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + struct sk_buff *skb, unsigned int pcount, + int shifted, int fack_count, int *reord, + int *flag, int mss) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 dummy_sacked = TCP_SKB_CB(skb)->sacked; /* We discard results */ + + BUG_ON(!pcount); + + TCP_SKB_CB(prev)->end_seq += shifted; + TCP_SKB_CB(skb)->seq += shifted; + + skb_shinfo(prev)->gso_segs += pcount; + BUG_ON(skb_shinfo(skb)->gso_segs < pcount); + skb_shinfo(skb)->gso_segs -= pcount; + + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep + * setting gso_size to something. + */ + if (!skb_shinfo(prev)->gso_size) { + skb_shinfo(prev)->gso_size = mss; + skb_shinfo(prev)->gso_type = sk->sk_gso_type; + } + + /* CHECKME: To clear or not to clear? 
Mimics normal skb currently */ + if (skb_shinfo(skb)->gso_segs <= 1) { + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; + } + + *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked, + pcount); + + /* Difference in this won't matter, both ACKed by the same cumul. ACK */ + TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); + + tcp_clear_all_retrans_hints(tp); + + if (skb->len > 0) { + BUG_ON(!tcp_skb_pcount(skb)); + return 0; + } + + /* Whole SKB was eaten :-) */ + + TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; + if (skb == tcp_highest_sack(sk)) + tcp_advance_highest_sack(sk, skb); + + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); + + return 1; +} + +/* I wish gso_size would have a bit more sane initialization than + * something-or-zero which complicates things + */ +static int tcp_shift_mss(struct sk_buff *skb) +{ + int mss = tcp_skb_mss(skb); + + if (!mss) + mss = skb->len; + + return mss; +} + +/* Shifting pages past head area doesn't work */ +static int skb_can_shift(struct sk_buff *skb) +{ + return !skb_headlen(skb) && skb_is_nonlinear(skb); +} + +/* Try collapsing SACK blocks spanning across multiple skbs to a single + * skb. + */ +static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, + u32 start_seq, u32 end_seq, + int dup_sack, int *fack_count, + int *reord, int *flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *prev; + int mss; + int pcount = 0; + int len; + int in_sack; + + if (!sk_can_gso(sk)) + goto fallback; + + /* Normally R but no L won't result in plain S */ + if (!dup_sack && + (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS) + goto fallback; + if (!skb_can_shift(skb)) + goto fallback; + /* This frame is about to be dropped (was ACKed). */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + goto fallback; + + /* Can only happen with delayed DSACK + discard craziness */ + if (unlikely(skb == tcp_write_queue_head(sk))) + goto fallback; + prev = tcp_write_queue_prev(sk, skb); + + if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) + goto fallback; + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq); + + if (in_sack) { + len = skb->len; + pcount = tcp_skb_pcount(skb); + mss = tcp_shift_mss(skb); + + /* TODO: Fix DSACKs to not fragment already SACKed and we can + * drop this restriction as unnecessary + */ + if (mss != tcp_shift_mss(prev)) + goto fallback; + } else { + if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) + goto noop; + /* CHECKME: This is non-MSS split case only?, this will + * cause skipped skbs due to advancing loop btw, original + * has that feature too + */ + if (tcp_skb_pcount(skb) <= 1) + goto noop; + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); + if (!in_sack) { + /* TODO: head merge to next could be attempted here + * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)), + * though it might not be worth of the additional hassle + * + * ...we can probably just fallback to what was done + * previously. We could try merging non-SACKed ones + * as well but it probably isn't going to buy off + * because later SACKs might again split them, and + * it would make skb timestamp tracking considerably + * harder problem. + */ + goto fallback; + } + + len = end_seq - TCP_SKB_CB(skb)->seq; + BUG_ON(len < 0); + BUG_ON(len > skb->len); + + /* MSS boundaries should be honoured or else pcount will + * severely break even though it makes things bit trickier. 
+ * Optimize common case to avoid most of the divides + */ + mss = tcp_skb_mss(skb); + + /* TODO: Fix DSACKs to not fragment already SACKed and we can + * drop this restriction as unnecessary + */ + if (mss != tcp_shift_mss(prev)) + goto fallback; + + if (len == mss) { + pcount = 1; + } else if (len < mss) { + goto noop; + } else { + pcount = len / mss; + len = pcount * mss; + } + } + + if (!skb_shift(prev, skb, len)) + goto fallback; + if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord, + flag, mss)) + goto out; + + /* Hole filled allows collapsing with the next as well, this is very + * useful when hole on every nth skb pattern happens + */ + if (prev == tcp_write_queue_tail(sk)) + goto out; + skb = tcp_write_queue_next(sk, prev); + + if (!skb_can_shift(skb)) + goto out; + if (skb == tcp_send_head(sk)) + goto out; + if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) + goto out; + + len = skb->len; + if (skb_shift(prev, skb, len)) { + pcount += tcp_skb_pcount(skb); + tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len, + *fack_count, reord, flag, mss); + } + +out: + *fack_count += pcount; + return prev; + +noop: + return skb; + +fallback: + return NULL; +} + static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, u32 start_seq, u32 end_seq, int dup_sack_in, int *fack_count, int *reord, int *flag) { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *tmp; + tcp_for_write_queue_from(skb, sk) { int in_sack = 0; int dup_sack = dup_sack_in; @@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, dup_sack = 1; } - if (in_sack <= 0) - in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, - end_seq); + /* skb reference here is a bit tricky to get right, since + * shifting can eat and free both this skb and the next, + * so not even _safe variant of the loop is enough. + */ + if (in_sack <= 0) { + tmp = tcp_shift_skb_data(sk, skb, start_seq, + end_seq, dup_sack, + fack_count, reord, flag); + if (tmp != NULL) { + if (tmp != skb) { + skb = tmp; + continue; + } + + in_sack = 0; + } else { + in_sack = tcp_match_skb_to_sack(sk, skb, + start_seq, + end_seq); + } + } + if (unlikely(in_sack < 0)) break; - if (in_sack) + if (in_sack) { *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, *fack_count, &(TCP_SKB_CB(skb)->sacked), tcp_skb_pcount(skb)); + if (!before(TCP_SKB_CB(skb)->seq, + tcp_highest_sack_seq(tp))) + tcp_advance_highest_sack(sk, skb); + } + *fack_count += tcp_skb_pcount(skb); } return skb; -- cgit v1.2.3 From 92ee76b6d99bfcdab6162816c9025541ef7248eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:26:56 -0800 Subject: tcp: Make shifting not clear the hints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The earlier version was just very basic one which is "playing safe" by always clearing the hints. However, clearing of a hint is extremely costly operation with large windows, so it must be avoided at all cost whenever possible, there is a way with shifting too achieve not-clearing. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 97d57676b8e..e6291dde334 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1379,6 +1379,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, BUG_ON(!pcount); + /* Tweak before seqno plays */ + if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint && + !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) + tp->lost_cnt_hint += pcount; + TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; @@ -1408,8 +1413,6 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); - tcp_clear_all_retrans_hints(tp); - if (skb->len > 0) { BUG_ON(!tcp_skb_pcount(skb)); return 0; @@ -1417,6 +1420,15 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, /* Whole SKB was eaten :-) */ + if (skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = prev; + if (skb == tp->scoreboard_skb_hint) + tp->scoreboard_skb_hint = prev; + if (skb == tp->lost_skb_hint) { + tp->lost_skb_hint = prev; + tp->lost_cnt_hint -= tcp_skb_pcount(prev); + } + TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); -- cgit v1.2.3 From 111cc8b913b42ef07793648b1699288332f273e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2008 21:27:22 -0800 Subject: tcp: add some mibs to track collapsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e6291dde334..9f8a80ba17b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1415,6 +1415,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, if (skb->len > 0) { BUG_ON(!tcp_skb_pcount(skb)); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); return 0; } @@ -1436,6 +1437,8 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); + return 1; } @@ -1594,6 +1597,7 @@ noop: return skb; fallback: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); return NULL; } -- cgit v1.2.3 From 8eecaba900e89643029fd2c253ad8ebb60761165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 25 Nov 2008 13:45:29 -0800 Subject: tcp: tcp_limit_reno_sacked can become static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9f8a80ba17b..d67b6e9cc54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1940,7 +1940,7 @@ out: /* Limits sacked_out so that sum with lost_out isn't ever larger than * packets_out. Returns zero if sacked_out adjustement wasn't necessary. 
*/ -int tcp_limit_reno_sacked(struct tcp_sock *tp) +static int tcp_limit_reno_sacked(struct tcp_sock *tp) { u32 holes; -- cgit v1.2.3 From f0bc52f38b09308fca85f3aa9300a341364fe9c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 5 Dec 2008 22:40:47 -0800 Subject: tcp: force mss equality with the next skb too. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also make if-goto forest nicer looking. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d67b6e9cc54..63c3ef6d4a1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1575,11 +1575,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto out; skb = tcp_write_queue_next(sk, prev); - if (!skb_can_shift(skb)) - goto out; - if (skb == tcp_send_head(sk)) - goto out; - if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) + if (!skb_can_shift(skb) || + (skb == tcp_send_head(sk)) || + ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || + (mss != tcp_shift_mss(skb))) goto out; len = skb->len; -- cgit v1.2.3 From 9969ca5f205988fb96461075cb4914c55cf166b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 5 Dec 2008 22:41:06 -0800 Subject: tcp: Fix thinko making the not-shiftable to cover S|R as well MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit S|R won't result in S if just SACK is received. DSACK is another story (but it is covered correctly already). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 63c3ef6d4a1..33902f6799c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1481,7 +1481,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, /* Normally R but no L won't result in plain S */ if (!dup_sack && - (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS) + (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) goto fallback; if (!skb_can_shift(skb)) goto fallback; -- cgit v1.2.3 From 775ffabf77a648d78fe1d20cb3a620e771abb921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 5 Dec 2008 22:41:26 -0800 Subject: tcp: make mtu probe failure to not break gso'ed skbs unnecessarily MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I noticed that since skb->len has nothing to do with actual segment length with gso, we need to figure it out separately, reuse a function from the recent shifting stuff (generalize it). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 33902f6799c..21c67019078 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1445,14 +1445,9 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, /* I wish gso_size would have a bit more sane initialization than * something-or-zero which complicates things */ -static int tcp_shift_mss(struct sk_buff *skb) +static int tcp_skb_seglen(struct sk_buff *skb) { - int mss = tcp_skb_mss(skb); - - if (!mss) - mss = skb->len; - - return mss; + return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); } /* Shifting pages past head area doesn't work */ @@ -1503,12 +1498,12 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (in_sack) { len = skb->len; pcount = tcp_skb_pcount(skb); - mss = tcp_shift_mss(skb); + mss = tcp_skb_seglen(skb); /* TODO: Fix DSACKs to not fragment already SACKed and we can * drop this restriction as unnecessary */ - if (mss != tcp_shift_mss(prev)) + if (mss != tcp_skb_seglen(prev)) goto fallback; } else { if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) @@ -1549,7 +1544,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, /* TODO: Fix DSACKs to not fragment already SACKed and we can * drop this restriction as unnecessary */ - if (mss != tcp_shift_mss(prev)) + if (mss != tcp_skb_seglen(prev)) goto fallback; if (len == mss) { @@ -1578,7 +1573,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!skb_can_shift(skb) || (skb == tcp_send_head(sk)) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || - (mss != tcp_shift_mss(skb))) + (mss != tcp_skb_seglen(skb))) goto out; len = skb->len; @@ -2853,7 +2848,7 @@ void tcp_simple_retransmit(struct sock *sk) tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; - if (skb->len > mss && + if (tcp_skb_seglen(skb) > mss && !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; -- cgit v1.2.3 From a1197f5a6faa23e5d0c1f8ed97b011deb2a75457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 5 Dec 2008 22:42:22 -0800 Subject: tcp: introduce struct tcp_sacktag_state to reduce arg pressure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are just too many args to some sacktag functions. This idea was first proposed by David S. Miller around a year ago, and the current situation is much worse that what it was back then. tcp_sacktag_one can be made a bit simpler by returning the new sacked (it can be achieved with a single variable though the previous code "caching" sacked into a local variable and therefore it is not exactly equal but the results will be the same). codiff on x86_64 tcp_sacktag_one | -15 tcp_shifted_skb | -50 tcp_match_skb_to_sack | -1 tcp_sacktag_walk | -64 tcp_sacktag_write_queue | -59 tcp_urg | +1 tcp_event_data_recv | -1 7 functions changed, 1 bytes added, 190 bytes removed, diff: -189 Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 145 ++++++++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 71 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 21c67019078..e25827719e7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1237,6 +1237,12 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, return dup_sack; } +struct tcp_sacktag_state { + int reord; + int fack_count; + int flag; +}; + /* Check if skb is fully within the SACK block. In presence of GSO skbs, * the incoming SACK may not exactly match but we can find smaller MSS * aligned portion of it that matches. Therefore we might need to fragment @@ -1290,25 +1296,25 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } -static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, - int *reord, int dup_sack, int fack_count, - u8 *sackedto, int pcount) +static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, + struct tcp_sacktag_state *state, + int dup_sack, int pcount) { struct tcp_sock *tp = tcp_sk(sk); u8 sacked = TCP_SKB_CB(skb)->sacked; - int flag = 0; + int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) - *reord = min(fack_count, *reord); + state->reord = min(fack_count, state->reord); } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) - return flag; + return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { if (sacked & TCPCB_SACKED_RETRANS) { @@ -1317,7 +1323,7 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, * that retransmission is still in flight. */ if (sacked & TCPCB_LOST) { - *sackedto &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= pcount; tp->retrans_out -= pcount; } @@ -1328,21 +1334,22 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, */ if (before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - *reord = min(fack_count, *reord); + state->reord = min(fack_count, + state->reord); /* SACK enhanced F-RTO (RFC4138; Appendix B) */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) - flag |= FLAG_ONLY_ORIG_SACKED; + state->flag |= FLAG_ONLY_ORIG_SACKED; } if (sacked & TCPCB_LOST) { - *sackedto &= ~TCPCB_LOST; + sacked &= ~TCPCB_LOST; tp->lost_out -= pcount; } } - *sackedto |= TCPCB_SACKED_ACKED; - flag |= FLAG_DATA_SACKED; + sacked |= TCPCB_SACKED_ACKED; + state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; fack_count += pcount; @@ -1361,21 +1368,20 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, * frames and clear it. undo_retrans is decreased above, L|R frames * are accounted above as well. 
*/ - if (dup_sack && (*sackedto & TCPCB_SACKED_RETRANS)) { - *sackedto &= ~TCPCB_SACKED_RETRANS; + if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) { + sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= pcount; } - return flag; + return sacked; } static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, - struct sk_buff *skb, unsigned int pcount, - int shifted, int fack_count, int *reord, - int *flag, int mss) + struct sk_buff *skb, + struct tcp_sacktag_state *state, + unsigned int pcount, int shifted, int mss) { struct tcp_sock *tp = tcp_sk(sk); - u8 dummy_sacked = TCP_SKB_CB(skb)->sacked; /* We discard results */ BUG_ON(!pcount); @@ -1407,8 +1413,8 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, skb_shinfo(skb)->gso_type = 0; } - *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked, - pcount); + /* We discard results */ + tcp_sacktag_one(skb, sk, state, 0, pcount); /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1460,9 +1466,9 @@ static int skb_can_shift(struct sk_buff *skb) * skb. */ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, + struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, - int dup_sack, int *fack_count, - int *reord, int *flag) + int dup_sack) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev; @@ -1559,8 +1565,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!skb_shift(prev, skb, len)) goto fallback; - if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord, - flag, mss)) + if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss)) goto out; /* Hole filled allows collapsing with the next as well, this is very @@ -1579,12 +1584,12 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, len = skb->len; if (skb_shift(prev, skb, len)) { pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len, - *fack_count, reord, flag, mss); + tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb), len, + mss); } out: - *fack_count += pcount; + state->fack_count += pcount; return prev; noop: @@ -1597,9 +1602,9 @@ fallback: static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, + struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, - int dup_sack_in, int *fack_count, - int *reord, int *flag) + int dup_sack_in) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; @@ -1629,9 +1634,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, * so not even _safe variant of the loop is enough. 
*/ if (in_sack <= 0) { - tmp = tcp_shift_skb_data(sk, skb, start_seq, - end_seq, dup_sack, - fack_count, reord, flag); + tmp = tcp_shift_skb_data(sk, skb, state, + start_seq, end_seq, dup_sack); if (tmp != NULL) { if (tmp != skb) { skb = tmp; @@ -1650,17 +1654,17 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, break; if (in_sack) { - *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, - *fack_count, - &(TCP_SKB_CB(skb)->sacked), - tcp_skb_pcount(skb)); + TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, + state, + dup_sack, + tcp_skb_pcount(skb)); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } - *fack_count += tcp_skb_pcount(skb); + state->fack_count += tcp_skb_pcount(skb); } return skb; } @@ -1669,7 +1673,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, * a normal way */ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, - u32 skip_to_seq, int *fack_count) + struct tcp_sacktag_state *state, + u32 skip_to_seq) { tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1678,7 +1683,7 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) break; - *fack_count += tcp_skb_pcount(skb); + state->fack_count += tcp_skb_pcount(skb); } return skb; } @@ -1686,18 +1691,17 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, - u32 skip_to_seq, - int *fack_count, int *reord, - int *flag) + struct tcp_sacktag_state *state, + u32 skip_to_seq) { if (next_dup == NULL) return skb; if (before(next_dup->start_seq, skip_to_seq)) { - skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count); - skb = tcp_sacktag_walk(skb, sk, NULL, - next_dup->start_seq, next_dup->end_seq, - 1, fack_count, reord, flag); + skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq); + skb = tcp_sacktag_walk(skb, sk, NULL, state, + next_dup->start_seq, next_dup->end_seq, + 1); } return skb; @@ -1719,16 +1723,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; + struct tcp_sacktag_state state; struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; - int reord = tp->packets_out; - int flag = 0; int found_dup_sack = 0; - int fack_count; int i, j; int first_sack_index; + state.flag = 0; + state.reord = tp->packets_out; + if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) tp->fackets_out = 0; @@ -1738,7 +1743,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) - flag |= FLAG_DSACKING_ACK; + state.flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1807,7 +1812,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, } skb = tcp_write_queue_head(sk); - fack_count = 0; + state.fack_count = 0; i = 0; if (!tp->sacked_out) { @@ -1832,7 +1837,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, /* Event "B" in the comment above. 
*/ if (after(end_seq, tp->high_seq)) - flag |= FLAG_DATA_LOST; + state.flag |= FLAG_DATA_LOST; /* Skip too early cached blocks */ while (tcp_sack_cache_ok(tp, cache) && @@ -1845,13 +1850,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, /* Head todo? */ if (before(start_seq, cache->start_seq)) { - skb = tcp_sacktag_skip(skb, sk, start_seq, - &fack_count); + skb = tcp_sacktag_skip(skb, sk, &state, + start_seq); skb = tcp_sacktag_walk(skb, sk, next_dup, + &state, start_seq, cache->start_seq, - dup_sack, &fack_count, - &reord, &flag); + dup_sack); } /* Rest of the block already fully processed? */ @@ -1859,9 +1864,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, goto advance_sp; skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, - cache->end_seq, - &fack_count, &reord, - &flag); + &state, + cache->end_seq); /* ...tail remains todo... */ if (tcp_highest_sack_seq(tp) == cache->end_seq) { @@ -1869,13 +1873,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (skb == NULL) break; - fack_count = tp->fackets_out; + state.fack_count = tp->fackets_out; cache++; goto walk; } - skb = tcp_sacktag_skip(skb, sk, cache->end_seq, - &fack_count); + skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq); /* Check overlap against next cached too (past this one already) */ cache++; continue; @@ -1885,20 +1888,20 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (skb == NULL) break; - fack_count = tp->fackets_out; + state.fack_count = tp->fackets_out; } - skb = tcp_sacktag_skip(skb, sk, start_seq, &fack_count); + skb = tcp_sacktag_skip(skb, sk, &state, start_seq); walk: - skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq, - dup_sack, &fack_count, &reord, &flag); + skb = tcp_sacktag_walk(skb, sk, next_dup, &state, + start_seq, end_seq, dup_sack); advance_sp: /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct * due to in-order walk */ if (after(end_seq, tp->frto_highmark)) - flag &= ~FLAG_ONLY_ORIG_SACKED; + state.flag &= ~FLAG_ONLY_ORIG_SACKED; i++; } @@ -1915,10 +1918,10 @@ advance_sp: tcp_verify_left_out(tp); - if ((reord < tp->fackets_out) && + if ((state.reord < tp->fackets_out) && ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) - tcp_update_reordering(sk, tp->fackets_out - reord, 0); + tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); out: @@ -1928,7 +1931,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif - return flag; + return state.flag; } /* Limits sacked_out so that sum with lost_out isn't ever larger than -- cgit v1.2.3 From 50133161a83c9e5974d430cabd77d6430ca7d579 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 5 Dec 2008 22:42:41 -0800 Subject: tcp: no need to pass prev skb around, reduces arg pressure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e25827719e7..2d9151c9436 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1376,12 +1376,12 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, return sacked; } -static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, - struct sk_buff *skb, +static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss) { struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *prev = tcp_write_queue_prev(sk, skb); BUG_ON(!pcount); @@ -1565,7 +1565,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!skb_shift(prev, skb, len)) goto fallback; - if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss)) + if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss)) goto out; /* Hole filled allows collapsing with the next as well, this is very @@ -1584,8 +1584,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, len = skb->len; if (skb_shift(prev, skb, len)) { pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb), len, - mss); + tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss); } out: -- cgit v1.2.3 From ee6aac59505bcae5de1422c76956de62ac22170d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 5 Dec 2008 22:43:08 -0800 Subject: tcp: drop tcp_bound_rto, merge content of it tcp_set_rto MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both are called by the same sites. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2d9151c9436..215e38c1a23 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -701,13 +701,10 @@ static inline void tcp_set_rto(struct sock *sk) * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ -} -/* NOTE: clamping at TCP_RTO_MIN is not required, current algo - * guarantees that rto is higher. - */ -static inline void tcp_bound_rto(struct sock *sk) -{ + /* NOTE: clamping at TCP_RTO_MIN is not required, current algo + * guarantees that rto is higher. 
+ */ if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) inet_csk(sk)->icsk_rto = TCP_RTO_MAX; } @@ -928,7 +925,6 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); } tcp_set_rto(sk); - tcp_bound_rto(sk); if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) goto reset; tp->snd_cwnd = tcp_init_cwnd(tp, dst); @@ -3081,7 +3077,6 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag) tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; - tcp_bound_rto(sk); } static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) @@ -3101,7 +3096,6 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; - tcp_bound_rto(sk); } static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, -- cgit v1.2.3 From 41834b7332a1ad3f7b6e8bbd83e6ce63586f0b07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 5 Dec 2008 22:43:26 -0800 Subject: tcp: share code through function, not through copy-paste. :-) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 215e38c1a23..99b7ecbe889 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3052,6 +3052,13 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) tcp_xmit_retransmit_queue(sk); } +static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) +{ + tcp_rtt_estimator(sk, seq_rtt); + tcp_set_rto(sk); + inet_csk(sk)->icsk_backoff = 0; +} + /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Supersedes RFC1323) */ @@ -3073,10 +3080,8 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag) * in window is lost... Voila. --ANK (010210) */ struct tcp_sock *tp = tcp_sk(sk); - const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(sk, seq_rtt); - tcp_set_rto(sk); - inet_csk(sk)->icsk_backoff = 0; + + tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr); } static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) @@ -3093,9 +3098,7 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(sk, seq_rtt); - tcp_set_rto(sk); - inet_csk(sk)->icsk_backoff = 0; + tcp_valid_rtt_meas(sk, seq_rtt); } static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, -- cgit v1.2.3