Diffstat (limited to 'net/ipv4')
32 files changed, 333 insertions, 263 deletions
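A note on the recurring locking change below: the hunks in inet_diag.c, inet_hashtables.c, inet_timewait_sock.c, tcp.c and tcp_ipv4.c all stop taking the rwlock embedded in each established-hash bucket (&head->lock) and instead look the lock up through inet_ehash_lockp(), backed by a separate array sized at boot via inet_ehash_locks_alloc() (see the tcp.c hunk). A minimal sketch of what such a helper looks like, assuming a power-of-two lock array and an ehash_locks_mask field; the authoritative definition lives in include/net/inet_hashtables.h and may differ in detail:

	/* Map an established-hash bucket to its (shared) lock.
	 * Assumption: ehash_locks has a power-of-two number of
	 * entries and ehash_locks_mask == nentries - 1, so many
	 * buckets intentionally share one rwlock. */
	static inline rwlock_t *inet_ehash_lockp(struct inet_hashinfo *hashinfo,
						 unsigned int hash)
	{
		return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask];
	}

Sharing one lock among many buckets keeps the lock table small on machines with large established-hash tables, at the price of occasional contention between unrelated buckets.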
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 60123905dbb..732d8f088b1 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -59,6 +59,13 @@ struct fib_table *ip_fib_main_table; #define FIB_TABLE_HASHSZ 1 static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; +static void __init fib4_rules_init(void) +{ + ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); + hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]); + ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); + hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]); +} #else #define FIB_TABLE_HASHSZ 256 @@ -905,14 +912,8 @@ void __init ip_fib_init(void) for (i = 0; i < FIB_TABLE_HASHSZ; i++) INIT_HLIST_HEAD(&fib_table_hash[i]); -#ifndef CONFIG_IP_MULTIPLE_TABLES - ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); - hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]); - ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); - hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]); -#else + fib4_rules_init(); -#endif register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f16839c6a72..a0ada3a8d8d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -49,33 +49,6 @@ struct fib4_rule #endif }; -static struct fib4_rule default_rule = { - .common = { - .refcnt = ATOMIC_INIT(2), - .pref = 0x7FFF, - .table = RT_TABLE_DEFAULT, - .action = FR_ACT_TO_TBL, - }, -}; - -static struct fib4_rule main_rule = { - .common = { - .refcnt = ATOMIC_INIT(2), - .pref = 0x7FFE, - .table = RT_TABLE_MAIN, - .action = FR_ACT_TO_TBL, - }, -}; - -static struct fib4_rule local_rule = { - .common = { - .refcnt = ATOMIC_INIT(2), - .table = RT_TABLE_LOCAL, - .action = FR_ACT_TO_TBL, - .flags = FIB_RULE_PERMANENT, - }, -}; - #ifdef CONFIG_NET_CLS_ROUTE u32 fib_rules_tclass(struct fib_result *res) { @@ -319,11 +292,27 @@ static struct fib_rules_ops fib4_rules_ops = { .owner = THIS_MODULE, }; -void __init fib4_rules_init(void) +static int __init fib_default_rules_init(void) { - list_add_tail(&local_rule.common.list, &fib4_rules_ops.rules_list); - list_add_tail(&main_rule.common.list, &fib4_rules_ops.rules_list); - list_add_tail(&default_rule.common.list, &fib4_rules_ops.rules_list); + int err; + + err = fib_default_rule_add(&fib4_rules_ops, 0, + RT_TABLE_LOCAL, FIB_RULE_PERMANENT); + if (err < 0) + return err; + err = fib_default_rule_add(&fib4_rules_ops, 0x7FFE, + RT_TABLE_MAIN, 0); + if (err < 0) + return err; + err = fib_default_rule_add(&fib4_rules_ops, 0x7FFF, + RT_TABLE_DEFAULT, 0); + if (err < 0) + return err; + return 0; +} +void __init fib4_rules_init(void) +{ + BUG_ON(fib_default_rules_init()); fib_rules_register(&fib4_rules_ops); } diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index dc429b6b0ba..b0170732b5e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -747,13 +747,14 @@ skip_listen_ht: for (i = s_i; i < hashinfo->ehash_size; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + rwlock_t *lock = inet_ehash_lockp(hashinfo, i); struct sock *sk; struct hlist_node *node; if (i > s_i) s_num = 0; - read_lock_bh(&head->lock); + read_lock_bh(lock); num = 0; sk_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); @@ -769,7 +770,7 @@ skip_listen_ht: r->id.idiag_dport) goto next_normal; if (inet_csk_diag_dump(sk, skb, cb) < 0) { - read_unlock_bh(&head->lock); + read_unlock_bh(lock); goto done; } 
next_normal: @@ -791,14 +792,14 @@ next_normal: r->id.idiag_dport) goto next_dying; if (inet_twsk_diag_dump(tw, skb, cb) < 0) { - read_unlock_bh(&head->lock); + read_unlock_bh(lock); goto done; } next_dying: ++num; } } - read_unlock_bh(&head->lock); + read_unlock_bh(lock); } done: diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 16eecc7046a..67704da04fc 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_node *node; struct inet_timewait_sock *tw; prefetch(head->chain.first); - write_lock(&head->lock); + write_lock(lock); /* Check TIME-WAIT sockets first. */ sk_for_each(sk2, node, &head->twchain) { @@ -239,7 +240,7 @@ unique: BUG_TRAP(sk_unhashed(sk)); __sk_add_node(sk, &head->chain); sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); + write_unlock(lock); if (twp) { *twp = tw; @@ -255,7 +256,7 @@ unique: return 0; not_unique: - write_unlock(&head->lock); + write_unlock(lock); return -EADDRNOTAVAIL; } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 4e189e28f30..a60b99e0ebd 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_bind_hashbucket *bhead; struct inet_bind_bucket *tb; /* Unlink from established hashes. */ - struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); - write_lock(&ehead->lock); + write_lock(lock); if (hlist_unhashed(&tw->tw_node)) { - write_unlock(&ehead->lock); + write_unlock(lock); return; } __hlist_del(&tw->tw_node); sk_node_init(&tw->tw_node); - write_unlock(&ehead->lock); + write_unlock(lock); /* Disassociate with bind bucket. */ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; @@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in @@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); - write_lock(&ehead->lock); + write_lock(lock); /* Step 2: Remove SK from established hash. */ if (__sk_del_node_init(sk)) @@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, inet_twsk_add_node(tw, &ehead->twchain); atomic_inc(&tw->tw_refcnt); - write_unlock(&ehead->lock); + write_unlock(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 771031dfbd0..af995198f64 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -61,7 +61,7 @@ * 4. Global variable peer_total is modified under the pool lock. * 5. 
struct inet_peer fields modification: * avl_left, avl_right, avl_parent, avl_height: pool lock - * unused_next, unused_prevp: unused node list lock + * unused: unused node list lock * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * dtime: unused node list lock @@ -94,8 +94,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min int inet_peer_gc_mintime __read_mostly = 10 * HZ; int inet_peer_gc_maxtime __read_mostly = 120 * HZ; -static struct inet_peer *inet_peer_unused_head; -static struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head; +static LIST_HEAD(unused_peers); static DEFINE_SPINLOCK(inet_peer_unused_lock); static void peer_check_expire(unsigned long dummy); @@ -138,15 +137,7 @@ void __init inet_initpeers(void) static void unlink_from_unused(struct inet_peer *p) { spin_lock_bh(&inet_peer_unused_lock); - if (p->unused_prevp != NULL) { - /* On unused list. */ - *p->unused_prevp = p->unused_next; - if (p->unused_next != NULL) - p->unused_next->unused_prevp = p->unused_prevp; - else - inet_peer_unused_tailp = p->unused_prevp; - p->unused_prevp = NULL; /* mark it as removed */ - } + list_del_init(&p->unused); spin_unlock_bh(&inet_peer_unused_lock); } @@ -337,24 +328,24 @@ static void unlink_from_pool(struct inet_peer *p) /* May be called with local BH enabled. */ static int cleanup_once(unsigned long ttl) { - struct inet_peer *p; + struct inet_peer *p = NULL; /* Remove the first entry from the list of unused nodes. */ spin_lock_bh(&inet_peer_unused_lock); - p = inet_peer_unused_head; - if (p != NULL) { - __u32 delta = (__u32)jiffies - p->dtime; + if (!list_empty(&unused_peers)) { + __u32 delta; + + p = list_first_entry(&unused_peers, struct inet_peer, unused); + delta = (__u32)jiffies - p->dtime; + if (delta < ttl) { /* Do not prune fresh entries. */ spin_unlock_bh(&inet_peer_unused_lock); return -1; } - inet_peer_unused_head = p->unused_next; - if (p->unused_next != NULL) - p->unused_next->unused_prevp = p->unused_prevp; - else - inet_peer_unused_tailp = p->unused_prevp; - p->unused_prevp = NULL; /* mark as not on the list */ + + list_del_init(&p->unused); + /* Grab an extra reference to prevent node disappearing * before unlink_from_pool() call. */ atomic_inc(&p->refcnt); @@ -412,7 +403,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) /* Link the node. */ link_to_pool(n); - n->unused_prevp = NULL; /* not on the list */ + INIT_LIST_HEAD(&n->unused); peer_total++; write_unlock_bh(&peer_pool_lock); @@ -467,10 +458,7 @@ void inet_putpeer(struct inet_peer *p) { spin_lock_bh(&inet_peer_unused_lock); if (atomic_dec_and_test(&p->refcnt)) { - p->unused_prevp = inet_peer_unused_tailp; - p->unused_next = NULL; - *inet_peer_unused_tailp = p; - inet_peer_unused_tailp = &p->unused_next; + list_add_tail(&p->unused, &unused_peers); p->dtime = (__u32)jiffies; } spin_unlock_bh(&inet_peer_unused_lock); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e5f7dc2de30..fd99fbd685e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1183,6 +1183,17 @@ error: return err; } +static void ip_cork_release(struct inet_sock *inet) +{ + inet->cork.flags &= ~IPCORK_OPT; + kfree(inet->cork.opt); + inet->cork.opt = NULL; + if (inet->cork.rt) { + ip_rt_put(inet->cork.rt); + inet->cork.rt = NULL; + } +} + /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. 
@@ -1276,13 +1287,7 @@ int ip_push_pending_frames(struct sock *sk) } out: - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + ip_cork_release(inet); return err; error: @@ -1295,19 +1300,12 @@ error: */ void ip_flush_pending_frames(struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + ip_cork_release(inet_sk(sk)); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f51f20e487c..82817e55436 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -437,10 +437,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* If optlen==0, it is equivalent to val == 0 */ -#ifdef CONFIG_IP_MROUTE - if (optname >= MRT_BASE && optname <= (MRT_BASE + 10)) + if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk,optname,optval,optlen); -#endif err = 0; lock_sock(sk); @@ -909,11 +907,9 @@ int ip_setsockopt(struct sock *sk, int level, #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && - optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > (MRT_BASE + 10)) -#endif - ) { + optname != IP_IPSEC_POLICY && + optname != IP_XFRM_POLICY && + !ip_mroute_opt(optname)) { lock_sock(sk); err = nf_setsockopt(sk, PF_INET, optname, optval, optlen); release_sock(sk); @@ -935,11 +931,9 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && - optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > (MRT_BASE + 10)) -#endif - ) { + optname != IP_IPSEC_POLICY && + optname != IP_XFRM_POLICY && + !ip_mroute_opt(optname)) { lock_sock(sk); err = compat_nf_setsockopt(sk, PF_INET, optname, optval, optlen); @@ -967,11 +961,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (level != SOL_IP) return -EOPNOTSUPP; -#ifdef CONFIG_IP_MROUTE - if (optname >= MRT_BASE && optname <= MRT_BASE+10) { + if (ip_mroute_opt(optname)) return ip_mroute_getsockopt(sk,optname,optval,optlen); - } -#endif if (get_user(len,optlen)) return -EFAULT; @@ -1171,11 +1162,8 @@ int ip_getsockopt(struct sock *sk, int level, err = do_ip_getsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > MRT_BASE+10) -#endif - ) { + if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && + !ip_mroute_opt(optname)) { int len; if (get_user(len,optlen)) @@ -1200,11 +1188,8 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, int err = do_ip_getsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > MRT_BASE+10) -#endif - ) { + if (err == -ENOPROTOOPT && 
optname != IP_PKTOPTIONS && + !ip_mroute_opt(optname)) { int len; if (get_user(len, optlen)) diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index ca1b5fdb8d3..2c44a94c213 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <asm/semaphore.h> #include <linux/crypto.h> +#include <linux/err.h> #include <linux/pfkeyv2.h> #include <linux/percpu.h> #include <linux/smp.h> @@ -344,7 +345,7 @@ static struct crypto_comp **ipcomp_alloc_tfms(const char *alg_name) for_each_possible_cpu(cpu) { struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0, CRYPTO_ALG_ASYNC); - if (!tfm) + if (IS_ERR(tfm)) goto error; *per_cpu_ptr(tfms, cpu) = tfm; } diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 4b702f708d3..0a9f3c37e18 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -426,6 +426,24 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) /* + * Check if there is a destination for the connection, if so + * bind the connection to the destination. + */ +struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest; + + if ((cp) && (!cp->dest)) { + dest = ip_vs_find_dest(cp->daddr, cp->dport, + cp->vaddr, cp->vport, cp->protocol); + ip_vs_bind_dest(cp, dest); + return dest; + } else + return NULL; +} + + +/* + * Unbind a connection entry with its VS destination * Called by the ip_vs_conn_expire function. */ diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index c6ed7654e83..20c884a5772 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -979,15 +979,23 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, ret = NF_ACCEPT; } - /* increase its packet counter and check if it is needed - to be synchronized */ + /* Increase its packet counter and check if it is needed + * to be synchronized + * + * Sync connection if it is about to close to + encourage the standby servers to update the connections' timeout + */ atomic_inc(&cp->in_pkts); if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && - (cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) + (((cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_TCP_S_ESTABLISHED) && + (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] + == sysctl_ip_vs_sync_threshold[0])) || + ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && + ((cp->state == IP_VS_TCP_S_FIN_WAIT) || + (cp->state == IP_VS_TCP_S_CLOSE))))) ip_vs_sync_conn(cp); + cp->old_state = cp->state; ip_vs_conn_put(cp); return ret; diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7345fc252a2..b64cf45a9ea 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -579,6 +579,31 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) return NULL; } +/* + * Find destination by {daddr,dport,vaddr,protocol} + * Created to be used in ip_vs_process_message() in + * the backup synchronization daemon. It finds the + * destination to be bound to the received connection + * on the backup. + * + * ip_vs_lookup_real_service() looked promising, but + seems not to work as expected.
+ */ +struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, + __be32 vaddr, __be16 vport, __u16 protocol) +{ + struct ip_vs_dest *dest; + struct ip_vs_service *svc; + + svc = ip_vs_service_get(0, protocol, vaddr, vport); + if (!svc) + return NULL; + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest) + atomic_inc(&dest->refcnt); + ip_vs_service_put(svc); + return dest; +} /* * Lookup dest by {svc,addr,port} in the destination trash. diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 0d4d9721cbd..bd930efc18d 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -284,6 +284,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) struct ip_vs_sync_conn_options *opt; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; + struct ip_vs_dest *dest; char *p; int i; @@ -317,20 +318,34 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) s->caddr, s->cport, s->vaddr, s->vport); if (!cp) { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + dest = ip_vs_find_dest(s->daddr, s->dport, + s->vaddr, s->vport, + s->protocol); cp = ip_vs_conn_new(s->protocol, s->caddr, s->cport, s->vaddr, s->vport, s->daddr, s->dport, - flags, NULL); + flags, dest); + if (dest) + atomic_dec(&dest->refcnt); if (!cp) { IP_VS_ERR("ip_vs_conn_new failed\n"); return; } cp->state = ntohs(s->state); } else if (!cp->dest) { - /* it is an entry created by the synchronization */ - cp->state = ntohs(s->state); - cp->flags = flags | IP_VS_CONN_F_HASHED; + dest = ip_vs_try_bind_dest(cp); + if (!dest) { + /* it is an unbound entry created by + * synchronization */ + cp->flags = flags | IP_VS_CONN_F_HASHED; + } else + atomic_dec(&dest->refcnt); } /* Note that we don't touch its state and flags if it is a normal entry. 
*/ @@ -342,6 +357,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) p += SIMPLE_CONN_SIZE; atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->state = ntohs(s->state); pp = ip_vs_proto_get(s->protocol); cp->timeout = pp->timeout_table[cp->state]; ip_vs_conn_put(cp); diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 409d273f6f8..7456833d6ad 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -41,27 +41,27 @@ obj-$(CONFIG_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o # matches +obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o +obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o +obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o -obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o -obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o -obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o +obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o -obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o # targets -obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o -obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o +obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o +obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o -obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o +obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o +obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o -obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o -obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o -obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o +obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o +obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o # generic ARP tables obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 10a2ce09fd8..14d64a383db 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -22,6 +22,7 @@ #include <linux/spinlock.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/security.h> #include <linux/mutex.h> #include <net/net_namespace.h> @@ -607,15 +608,11 @@ static ctl_table ipq_root_table[] = { { .ctl_name = 0 } }; -#ifdef CONFIG_PROC_FS -static int -ipq_get_info(char *buffer, char **start, off_t offset, int length) +static int ip_queue_show(struct seq_file *m, void *v) { - int len; - read_lock_bh(&queue_lock); - len = sprintf(buffer, + seq_printf(m, "Peer PID : %d\n" "Copy mode : %hu\n" "Copy range : %u\n" @@ -632,16 +629,21 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) queue_user_dropped); read_unlock_bh(&queue_lock); + return 0; +} - *start = buffer + offset; - len -= offset; - if (len > length) - len = length; - else if (len < 0) - len = 0; - return len; +static int ip_queue_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip_queue_show, NULL); } -#endif /* CONFIG_PROC_FS */ + +static const struct file_operations ip_queue_proc_fops = { + .open = ip_queue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; static struct nf_queue_handler nfqh = { .name = "ip_queue", @@ -661,10 +663,11 @@ static 
int __init ip_queue_init(void) goto cleanup_netlink_notifier; } - proc = proc_net_create(&init_net, IPQ_PROC_FS_NAME, 0, ipq_get_info); - if (proc) + proc = create_proc_entry(IPQ_PROC_FS_NAME, 0, init_net.proc_net); + if (proc) { proc->owner = THIS_MODULE; - else { + proc->proc_fops = &ip_queue_proc_fops; + } else { printk(KERN_ERR "ip_queue: failed to create proc entry\n"); goto cleanup_ipqnl; } diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 35a5aa69cd9..c31b8766825 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -69,7 +69,7 @@ static void __exit nf_nat_amanda_fini(void) static int __init nf_nat_amanda_init(void) { - BUG_ON(rcu_dereference(nf_nat_amanda_hook)); + BUG_ON(nf_nat_amanda_hook != NULL); rcu_assign_pointer(nf_nat_amanda_hook, help); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 56e93f692e8..70e7997ea28 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -681,7 +681,7 @@ static int clean_nat(struct nf_conn *i, void *data) if (!nat) return 0; - memset(nat, 0, sizeof(nat)); + memset(nat, 0, sizeof(*nat)); i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c index e1a16d3ea4c..a1d5d58a58b 100644 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ b/net/ipv4/netfilter/nf_nat_ftp.c @@ -147,7 +147,7 @@ static void __exit nf_nat_ftp_fini(void) static int __init nf_nat_ftp_init(void) { - BUG_ON(rcu_dereference(nf_nat_ftp_hook)); + BUG_ON(nf_nat_ftp_hook != NULL); rcu_assign_pointer(nf_nat_ftp_hook, nf_nat_ftp); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index a868c8c4132..93e18ef114f 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -544,15 +544,15 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int __init init(void) { - BUG_ON(rcu_dereference(set_h245_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_h225_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_sig_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_ras_addr_hook) != NULL); - BUG_ON(rcu_dereference(nat_rtp_rtcp_hook) != NULL); - BUG_ON(rcu_dereference(nat_t120_hook) != NULL); - BUG_ON(rcu_dereference(nat_h245_hook) != NULL); - BUG_ON(rcu_dereference(nat_callforwarding_hook) != NULL); - BUG_ON(rcu_dereference(nat_q931_hook) != NULL); + BUG_ON(set_h245_addr_hook != NULL); + BUG_ON(set_h225_addr_hook != NULL); + BUG_ON(set_sig_addr_hook != NULL); + BUG_ON(set_ras_addr_hook != NULL); + BUG_ON(nat_rtp_rtcp_hook != NULL); + BUG_ON(nat_t120_hook != NULL); + BUG_ON(nat_h245_hook != NULL); + BUG_ON(nat_callforwarding_hook != NULL); + BUG_ON(nat_q931_hook != NULL); rcu_assign_pointer(set_h245_addr_hook, set_h245_addr); rcu_assign_pointer(set_h225_addr_hook, set_h225_addr); diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c index 766e2c16c6b..fe6f9cef6c8 100644 --- a/net/ipv4/netfilter/nf_nat_irc.c +++ b/net/ipv4/netfilter/nf_nat_irc.c @@ -74,7 +74,7 @@ static void __exit nf_nat_irc_fini(void) static int __init nf_nat_irc_init(void) { - BUG_ON(rcu_dereference(nf_nat_irc_hook)); + BUG_ON(nf_nat_irc_hook != NULL); rcu_assign_pointer(nf_nat_irc_hook, help); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c 
index e1385a09907..6817e7995f3 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -281,16 +281,16 @@ static int __init nf_nat_helper_pptp_init(void) { nf_nat_need_gre(); - BUG_ON(rcu_dereference(nf_nat_pptp_hook_outbound)); + BUG_ON(nf_nat_pptp_hook_outbound != NULL); rcu_assign_pointer(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); - BUG_ON(rcu_dereference(nf_nat_pptp_hook_inbound)); + BUG_ON(nf_nat_pptp_hook_inbound != NULL); rcu_assign_pointer(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); - BUG_ON(rcu_dereference(nf_nat_pptp_hook_exp_gre)); + BUG_ON(nf_nat_pptp_hook_exp_gre != NULL); rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); - BUG_ON(rcu_dereference(nf_nat_pptp_hook_expectfn)); + BUG_ON(nf_nat_pptp_hook_expectfn != NULL); rcu_assign_pointer(nf_nat_pptp_hook_expectfn, pptp_nat_expected); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index ce9edbcc01e..3ca98971a1e 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -293,8 +293,8 @@ static void __exit nf_nat_sip_fini(void) static int __init nf_nat_sip_init(void) { - BUG_ON(rcu_dereference(nf_nat_sip_hook)); - BUG_ON(rcu_dereference(nf_nat_sdp_hook)); + BUG_ON(nf_nat_sip_hook != NULL); + BUG_ON(nf_nat_sdp_hook != NULL); rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip); rcu_assign_pointer(nf_nat_sdp_hook, ip_nat_sdp); return 0; diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c index 0ecec701cb4..1360a94766d 100644 --- a/net/ipv4/netfilter/nf_nat_tftp.c +++ b/net/ipv4/netfilter/nf_nat_tftp.c @@ -43,7 +43,7 @@ static void __exit nf_nat_tftp_fini(void) static int __init nf_nat_tftp_init(void) { - BUG_ON(rcu_dereference(nf_nat_tftp_hook)); + BUG_ON(nf_nat_tftp_hook != NULL); rcu_assign_pointer(nf_nat_tftp_hook, help); return 0; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ffdccc0972e..ce34b281803 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -46,17 +46,6 @@ #include <net/sock.h> #include <net/raw.h> -static int fold_prot_inuse(struct proto *proto) -{ - int res = 0; - int cpu; - - for_each_possible_cpu(cpu) - res += proto->stats[cpu].inuse; - - return res; -} - /* * Report socket allocation statistics [mea@utu.fi] */ @@ -64,12 +53,12 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", - fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), + sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), atomic_read(&tcp_memory_allocated)); - seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); - seq_printf(seq, "UDPLITE: inuse %d\n", fold_prot_inuse(&udplite_prot)); - seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); + seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot)); + seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot)); + seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot)); seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues(), ip_frag_mem()); return 0; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 3916faca3af..66b42f547bf 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -760,6 +760,8 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) } } +DEFINE_PROTO_INUSE(raw) + struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, @@ -781,6 +783,7 @@ struct proto raw_prot = { .compat_setsockopt = 
compat_raw_setsockopt, .compat_getsockopt = compat_raw_getsockopt, #endif + REF_PROTO_INUSE(raw) }; #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 21b12de9e65..1bff9ed349f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -578,6 +578,9 @@ static void rt_check_expire(struct work_struct *work) i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; + if (need_resched()) + cond_resched(); + if (*rthp == NULL) continue; spin_lock_bh(rt_hash_lock_addr(i)); @@ -851,9 +854,7 @@ restart: */ rcu_assign_pointer(rt_hash_table[hash].chain, rth); - rth->u.dst.__use++; - dst_hold(&rth->u.dst); - rth->u.dst.lastuse = now; + dst_use(&rth->u.dst, now); spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); @@ -1813,11 +1814,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_destination; err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); - if (err == -ENOBUFS) - goto e_nobufs; - if (err == -EINVAL) - goto e_inval; - done: in_dev_put(in_dev); if (free_res) @@ -1935,9 +1931,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.oif == 0 && rth->fl.mark == skb->mark && rth->fl.fl4_tos == tos) { - rth->u.dst.lastuse = jiffies; - dst_hold(&rth->u.dst); - rth->u.dst.__use++; + dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); skb->dst = (struct dst_entry*)rth; @@ -2331,9 +2325,7 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) rth->fl.mark == flp->mark && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))) { - rth->u.dst.lastuse = jiffies; - dst_hold(&rth->u.dst); - rth->u.dst.__use++; + dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); *rp = rth; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c64072bb504..8e65182f7af 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2456,11 +2456,11 @@ void __init tcp_init(void) thash_entries ? 0 : 512 * 1024); tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; for (i = 0; i < tcp_hashinfo.ehash_size; i++) { - rwlock_init(&tcp_hashinfo.ehash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); } - + if (inet_ehash_locks_alloc(&tcp_hashinfo)) + panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ca9590f4f52..0f0c1c9829a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1269,6 +1269,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window)) return 0; + if (!tp->packets_out) + goto out; + /* SACK fastpath: * if the only SACK change is the increase of the end_seq of * the first block then only apply that SACK block @@ -1400,11 +1403,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* DSACK info lost if out-of-mem, try SACK still */ if (in_sack <= 0) in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); - if (in_sack < 0) + if (unlikely(in_sack < 0)) break; - fack_count += tcp_skb_pcount(skb); - sacked = TCP_SKB_CB(skb)->sacked; /* Account D-SACK for retransmitted packet. 
*/ @@ -1419,19 +1420,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if ((dup_sack && in_sack) && (sacked&TCPCB_SACKED_ACKED)) reord = min(fack_count, reord); - } else { - /* If it was in a hole, we detected reordering. */ - if (fack_count < prior_fackets && - !(sacked&TCPCB_SACKED_ACKED)) - reord = min(fack_count, reord); } /* Nothing to do; acked frame is about to be dropped. */ + fack_count += tcp_skb_pcount(skb); continue; } - if (!in_sack) + if (!in_sack) { + fack_count += tcp_skb_pcount(skb); continue; + } if (!(sacked&TCPCB_SACKED_ACKED)) { if (sacked & TCPCB_SACKED_RETRANS) { @@ -1448,12 +1447,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->retransmit_skb_hint = NULL; } } else { - /* New sack for not retransmitted frame, - * which was in hole. It is reordering. - */ - if (!(sacked & TCPCB_RETRANS) && - fack_count < prior_fackets) - reord = min(fack_count, reord); + if (!(sacked & TCPCB_RETRANS)) { + /* New sack for not retransmitted frame, + * which was in hole. It is reordering. + */ + if (fack_count < prior_fackets) + reord = min(fack_count, reord); + + /* SACK enhanced F-RTO (RFC4138; Appendix B) */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) + flag |= FLAG_ONLY_ORIG_SACKED; + } if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; @@ -1462,24 +1466,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* clear lost hint */ tp->retransmit_skb_hint = NULL; } - /* SACK enhanced F-RTO detection. - * Set flag if and only if non-rexmitted - * segments below frto_highmark are - * SACKed (RFC4138; Appendix B). - * Clearing correct due to in-order walk - */ - if (after(end_seq, tp->frto_highmark)) { - flag &= ~FLAG_ONLY_ORIG_SACKED; - } else { - if (!(sacked & TCPCB_RETRANS)) - flag |= FLAG_ONLY_ORIG_SACKED; - } } TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; flag |= FLAG_DATA_SACKED; tp->sacked_out += tcp_skb_pcount(skb); + fack_count += tcp_skb_pcount(skb); if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; @@ -1490,6 +1483,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } else { if (dup_sack && (sacked&TCPCB_RETRANS)) reord = min(fack_count, reord); + + fack_count += tcp_skb_pcount(skb); } /* D-SACK. We can detect redundant retransmission @@ -1504,6 +1499,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->retransmit_skb_hint = NULL; } } + + /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct + * due to in-order walk + */ + if (after(end_seq, tp->frto_highmark)) + flag &= ~FLAG_ONLY_ORIG_SACKED; } if (tp->retrans_out && @@ -1515,7 +1516,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) - tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); + tcp_update_reordering(sk, tp->fackets_out - reord, 0); + +out: #if FASTRETRANS_DEBUG > 0 BUG_TRAP((int)tp->sacked_out >= 0); @@ -1671,6 +1674,9 @@ void tcp_enter_frto(struct sock *sk) } tcp_verify_left_out(tp); + /* Too bad if TCP was application limited */ + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); + /* Earlier loss recovery underway (see RFC4138; Appendix B). * The last condition is necessary at least in tp->frto_counter case. 
*/ @@ -1703,6 +1709,8 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; + + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; /* * Count the retransmission made on RTO correctly (only when * waiting for the first ACK and did not get it)... @@ -1716,7 +1724,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) } else { if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) tp->undo_marker = 0; - TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; } /* Don't lost mark skbs that were fwd transmitted after RTO */ @@ -2630,7 +2638,8 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) +static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, + int prior_fackets) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2639,6 +2648,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) int fully_acked = 1; int flag = 0; int prior_packets = tp->packets_out; + u32 cnt = 0; + u32 reord = tp->packets_out; s32 seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); @@ -2679,10 +2690,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) if ((flag & FLAG_DATA_ACKED) || (packets_acked > 1)) flag |= FLAG_NONHEAD_RETRANS_ACKED; - } else if (seq_rtt < 0) { - seq_rtt = now - scb->when; - if (fully_acked) - last_ackt = skb->tstamp; + } else { + if (seq_rtt < 0) { + seq_rtt = now - scb->when; + if (fully_acked) + last_ackt = skb->tstamp; + } + if (!(sacked & TCPCB_SACKED_ACKED)) + reord = min(cnt, reord); } if (sacked & TCPCB_SACKED_ACKED) @@ -2693,12 +2708,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) if ((sacked & TCPCB_URG) && tp->urg_mode && !before(end_seq, tp->snd_up)) tp->urg_mode = 0; - } else if (seq_rtt < 0) { - seq_rtt = now - scb->when; - if (fully_acked) - last_ackt = skb->tstamp; + } else { + if (seq_rtt < 0) { + seq_rtt = now - scb->when; + if (fully_acked) + last_ackt = skb->tstamp; + } + reord = min(cnt, reord); } tp->packets_out -= packets_acked; + cnt += packets_acked; /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not @@ -2730,13 +2749,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) tcp_ack_update_rtt(sk, flag, seq_rtt); tcp_rearm_rto(sk); + if (tcp_is_reno(tp)) { + tcp_remove_reno_sacks(sk, pkts_acked); + } else { + /* Non-retransmitted hole got filled? 
That's reordering */ + if (reord < prior_fackets) + tcp_update_reordering(sk, tp->fackets_out - reord, 0); + } + tp->fackets_out -= min(pkts_acked, tp->fackets_out); /* hint's skb might be NULL but we don't need to care */ tp->fastpath_cnt_hint -= min_t(u32, pkts_acked, tp->fastpath_cnt_hint); - if (tcp_is_reno(tp)) - tcp_remove_reno_sacks(sk, pkts_acked); - if (ca_ops->pkts_acked) { s32 rtt_us = -1; @@ -3019,6 +3043,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; + u32 prior_fackets; s32 seq_rtt; int prior_packets; int frto_cwnd = 0; @@ -3043,6 +3068,8 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache); } + prior_fackets = tp->fackets_out; + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. @@ -3084,13 +3111,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt, prior_fackets); + if (tp->frto_counter) + frto_cwnd = tcp_process_frto(sk, flag); /* Guarantee sacktag reordering detection against wrap-arounds */ if (before(tp->frto_highmark, tp->snd_una)) tp->frto_highmark = 0; - if (tp->frto_counter) - frto_cwnd = tcp_process_frto(sk, flag); if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d438dfb0c8f..e566f3c6767 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2049,8 +2049,9 @@ static void *established_get_first(struct seq_file *seq) struct sock *sk; struct hlist_node *node; struct inet_timewait_sock *tw; + rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); - read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_lock_bh(lock); sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family) { continue; @@ -2067,7 +2068,7 @@ static void *established_get_first(struct seq_file *seq) rc = tw; goto out; } - read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_unlock_bh(lock); st->state = TCP_SEQ_STATE_ESTABLISHED; } out: @@ -2094,11 +2095,11 @@ get_tw: cur = tw; goto out; } - read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); st->state = TCP_SEQ_STATE_ESTABLISHED; if (++st->bucket < tcp_hashinfo.ehash_size) { - read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); } else { cur = NULL; @@ -2206,7 +2207,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: if (v) - read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); break; } } @@ -2417,6 +2418,8 @@ void tcp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ +DEFINE_PROTO_INUSE(tcp) + struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, @@ -2451,6 +2454,7 @@ struct proto tcp_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif + REF_PROTO_INUSE(tcp) }; void __init tcp_v4_init(struct net_proto_family *ops) diff --git a/net/ipv4/tunnel4.c 
b/net/ipv4/tunnel4.c index a794a8ca8b4..978b3fd61e6 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -17,6 +17,11 @@ static struct xfrm_tunnel *tunnel4_handlers; static struct xfrm_tunnel *tunnel64_handlers; static DEFINE_MUTEX(tunnel4_mutex); +static inline struct xfrm_tunnel **fam_handlers(unsigned short family) +{ + return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; +} + int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel **pprev; @@ -25,8 +30,7 @@ int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) mutex_lock(&tunnel4_mutex); - for (pprev = (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; - *pprev; pprev = &(*pprev)->next) { + for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { if ((*pprev)->priority > priority) break; if ((*pprev)->priority == priority) @@ -53,8 +57,7 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) mutex_lock(&tunnel4_mutex); - for (pprev = (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; - *pprev; pprev = &(*pprev)->next) { + for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { if (*pprev == handler) { *pprev = handler->next; ret = 0; @@ -118,6 +121,17 @@ static void tunnel4_err(struct sk_buff *skb, u32 info) break; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static void tunnel64_err(struct sk_buff *skb, u32 info) +{ + struct xfrm_tunnel *handler; + + for (handler = tunnel64_handlers; handler; handler = handler->next) + if (!handler->err_handler(skb, info)) + break; +} +#endif + static struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, @@ -127,7 +141,7 @@ static struct net_protocol tunnel4_protocol = { #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, - .err_handler = tunnel4_err, + .err_handler = tunnel64_err, .no_policy = 1, }; #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4bc25b46f33..03c400ca14c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1430,6 +1430,8 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) } +DEFINE_PROTO_INUSE(udp) + struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, @@ -1452,6 +1454,7 @@ struct proto udp_prot = { .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif + REF_PROTO_INUSE(udp) }; /* ------------------------------------------------------------------------ */ diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 94977205abb..f5baeb3e8b8 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -44,6 +44,8 @@ static struct net_protocol udplite_protocol = { .no_policy = 1, }; +DEFINE_PROTO_INUSE(udplite) + struct proto udplite_prot = { .name = "UDP-Lite", .owner = THIS_MODULE, @@ -67,6 +69,7 @@ struct proto udplite_prot = { .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif + REF_PROTO_INUSE(udplite) }; static struct inet_protosw udplite4_protosw = {
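Two small helpers called repeatedly above are defined outside net/ipv4 and so never appear in this listing (it is limited to net/ipv4). Sketches reconstructed from the open-coded sequences they replace; the authoritative definitions would live in include/net/dst.h and include/linux/mroute.h:

	/* dst_use(): the dst_hold()/__use++/lastuse pattern removed
	 * three times from route.c, folded into one call. */
	static inline void dst_use(struct dst_entry *dst, unsigned long time)
	{
		dst_hold(dst);
		dst->__use++;
		dst->lastuse = time;
	}

	/* ip_mroute_opt(): the MRT_BASE range test that ip_sockglue.c
	 * previously open-coded behind #ifdef CONFIG_IP_MROUTE. */
	#ifdef CONFIG_IP_MROUTE
	static inline int ip_mroute_opt(int opt)
	{
		return (opt >= MRT_BASE) && (opt <= MRT_BASE + 10);
	}
	#else
	static inline int ip_mroute_opt(int opt)
	{
		return 0; /* lets the compiler drop the mroute branches */
	}
	#endif

With the !CONFIG_IP_MROUTE stub returning a constant 0, the multicast-routing branches are compiled away, which is why the call sites above could lose their #ifdef blocks.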