aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/skbuff.h38
-rw-r--r--include/net/inet_hashtables.h2
-rw-r--r--net/core/stream.c12
-rw-r--r--net/dccp/ipv4.c32
-rw-r--r--net/ipv4/inet_connection_sock.c14
-rw-r--r--net/ipv4/tcp.c1
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv6/tcp_ipv6.c15
-rw-r--r--net/sched/sch_netem.c122
9 files changed, 155 insertions, 83 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4286d832166..fdfb8fe8c38 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -603,23 +603,23 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
*/
/**
- * __skb_queue_head - queue a buffer at the list head
+ * __skb_queue_after - queue a buffer after a given buffer in the list
* @list: list to use
+ * @prev: place after this buffer
* @newsk: buffer to queue
*
- * Queue a buffer at the start of a list. This function takes no locks
+ * Queue a buffer in the middle of a list. This function takes no locks
* and you must therefore hold required locks before calling it.
*
* A buffer cannot be placed on two lists at the same time.
*/
-extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);
-static inline void __skb_queue_head(struct sk_buff_head *list,
- struct sk_buff *newsk)
+static inline void __skb_queue_after(struct sk_buff_head *list,
+ struct sk_buff *prev,
+ struct sk_buff *newsk)
{
- struct sk_buff *prev, *next;
-
+ struct sk_buff *next;
list->qlen++;
- prev = (struct sk_buff *)list;
+
next = prev->next;
newsk->next = next;
newsk->prev = prev;
@@ -627,6 +627,23 @@ static inline void __skb_queue_head(struct sk_buff_head *list,
}
/**
+ * __skb_queue_head - queue a buffer at the list head
+ * @list: list to use
+ * @newsk: buffer to queue
+ *
+ * Queue a buffer at the start of a list. This function takes no locks
+ * and you must therefore hold required locks before calling it.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);
+/* Head insertion is expressed as "insert right after the list head itself". */
+static inline void __skb_queue_head(struct sk_buff_head *list,
+				    struct sk_buff *newsk)
+{
+	/* The queue head is handed to __skb_queue_after() as the "previous" buffer. */
+	struct sk_buff *head = (struct sk_buff *)list;
+
+	__skb_queue_after(list, head, newsk);
+}
+
+/**
* __skb_queue_tail - queue a buffer at the list tail
* @list: list to use
* @newsk: buffer to queue
@@ -1203,6 +1220,11 @@ static inline void kunmap_skb_frag(void *vaddr)
prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \
skb = skb->next)
+/*
+ * skb_queue_reverse_walk - iterate over a queue from its last buffer backwards
+ * @queue: queue head to walk
+ * @skb: cursor variable; assigned each buffer in turn
+ *
+ * Starts at (queue)->prev and follows ->prev links, prefetching the next
+ * buffer to be visited. The loop stops when the cursor is the queue head
+ * itself, so after a walk that was not left via break, @skb equals
+ * (struct sk_buff *)(queue). The macro itself takes no locks.
+ */
+#define skb_queue_reverse_walk(queue, skb) \
+ for (skb = (queue)->prev; \
+ prefetch(skb->prev), (skb != (struct sk_buff *)(queue)); \
+ skb = skb->prev)
+
extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
int noblock, int *err);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index f50f9596834..07840baa934 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -125,9 +125,7 @@ struct inet_hashinfo {
rwlock_t lhash_lock ____cacheline_aligned;
atomic_t lhash_users;
wait_queue_head_t lhash_wait;
- spinlock_t portalloc_lock;
kmem_cache_t *bind_bucket_cachep;
- int port_rover;
};
static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
diff --git a/net/core/stream.c b/net/core/stream.c
index ac9edfdf874..15bfd03e802 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
struct task_struct *tsk = current;
DEFINE_WAIT(wait);
+ int done;
- while (1) {
+ do {
if (sk->sk_err)
return sock_error(sk);
if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
@@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
sk->sk_write_pending++;
- if (sk_wait_event(sk, timeo_p,
- !((1 << sk->sk_state) &
- ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))))
- break;
+ done = sk_wait_event(sk, timeo_p,
+ !((1 << sk->sk_state) &
+ ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
finish_wait(sk->sk_sleep, &wait);
sk->sk_write_pending--;
- }
+ } while (!done);
return 0;
}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 6298cf58ff9..4b9bc81ae1a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
.lhash_lock = RW_LOCK_UNLOCKED,
.lhash_users = ATOMIC_INIT(0),
.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
- .portalloc_lock = SPIN_LOCK_UNLOCKED,
- .port_rover = 1024 - 1,
};
EXPORT_SYMBOL_GPL(dccp_hashinfo);
@@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk)
int ret;
if (snum == 0) {
- int rover;
int low = sysctl_local_port_range[0];
int high = sysctl_local_port_range[1];
int remaining = (high - low) + 1;
+ int rover = net_random() % (high - low) + low;
struct hlist_node *node;
struct inet_timewait_sock *tw = NULL;
local_bh_disable();
-
- /* TODO. Actually it is not so bad idea to remove
- * dccp_hashinfo.portalloc_lock before next submission to
- * Linus.
- * As soon as we touch this place at all it is time to think.
- *
- * Now it protects single _advisory_ variable
- * dccp_hashinfo.port_rover, hence it is mostly useless.
- * Code will work nicely if we just delete it, but
- * I am afraid in contented case it will work not better or
- * even worse: another cpu just will hit the same bucket
- * and spin there.
- * So some cpu salt could remove both contention and
- * memory pingpong. Any ideas how to do this in a nice way?
- */
- spin_lock(&dccp_hashinfo.portalloc_lock);
- rover = dccp_hashinfo.port_rover;
-
do {
- rover++;
- if ((rover < low) || (rover > high))
- rover = low;
head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
dccp_hashinfo.bhash_size)];
spin_lock(&head->lock);
@@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk)
next_port:
spin_unlock(&head->lock);
+ if (++rover > high)
+ rover = low;
} while (--remaining > 0);
- dccp_hashinfo.port_rover = rover;
- spin_unlock(&dccp_hashinfo.portalloc_lock);
local_bh_enable();
@@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk)
ok:
/* All locks still held and bhs disabled */
- dccp_hashinfo.port_rover = rover;
- spin_unlock(&dccp_hashinfo.portalloc_lock);
-
inet_bind_hash(sk, tb, rover);
if (sk_unhashed(sk)) {
inet_sk(sk)->sport = htons(rover);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 94468a76c5b..3fe021f1a56 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
int low = sysctl_local_port_range[0];
int high = sysctl_local_port_range[1];
int remaining = (high - low) + 1;
- int rover;
+ int rover = net_random() % (high - low) + low;
- spin_lock(&hashinfo->portalloc_lock);
- if (hashinfo->port_rover < low)
- rover = low;
- else
- rover = hashinfo->port_rover;
do {
- rover++;
- if (rover > high)
- rover = low;
head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
@@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
break;
next:
spin_unlock(&head->lock);
+ if (++rover > high)
+ rover = low;
} while (--remaining > 0);
- hashinfo->port_rover = rover;
- spin_unlock(&hashinfo->portalloc_lock);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f3f0013a958..72b7c22e1ea 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2112,7 +2112,6 @@ void __init tcp_init(void)
sysctl_tcp_max_orphans >>= (3 - order);
sysctl_max_syn_backlog = 128;
}
- tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
sysctl_tcp_mem[0] = 768 << order;
sysctl_tcp_mem[1] = 1024 << order;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c85819d8474..49d67cd75ed 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
.lhash_lock = RW_LOCK_UNLOCKED,
.lhash_users = ATOMIC_INIT(0),
.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
- .portalloc_lock = SPIN_LOCK_UNLOCKED,
- .port_rover = 1024 - 1,
};
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d693cb988b7..d746d3b27ef 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
int low = sysctl_local_port_range[0];
int high = sysctl_local_port_range[1];
int remaining = (high - low) + 1;
- int rover;
+ int rover = net_random() % (high - low) + low;
- spin_lock(&tcp_hashinfo.portalloc_lock);
- if (tcp_hashinfo.port_rover < low)
- rover = low;
- else
- rover = tcp_hashinfo.port_rover;
- do { rover++;
- if (rover > high)
- rover = low;
+ do {
head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
@@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
break;
next:
spin_unlock(&head->lock);
+ if (++rover > high)
+ rover = low;
} while (--remaining > 0);
- tcp_hashinfo.port_rover = rover;
- spin_unlock(&tcp_hashinfo.portalloc_lock);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bb9bf8d5003..cdc8d283791 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -25,6 +25,8 @@
#include <net/pkt_sched.h>
+#define VERSION "1.1"
+
/* Network Emulation Queuing algorithm.
====================================
@@ -185,10 +187,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|| q->counter < q->gap /* inside last reordering gap */
|| q->reorder < get_crandom(&q->reorder_cor)) {
psched_time_t now;
+ psched_tdiff_t delay;
+
+ delay = tabledist(q->latency, q->jitter,
+ &q->delay_cor, q->delay_dist);
+
PSCHED_GET_TIME(now);
- PSCHED_TADD2(now, tabledist(q->latency, q->jitter,
- &q->delay_cor, q->delay_dist),
- cb->time_to_send);
+ PSCHED_TADD2(now, delay, cb->time_to_send);
++q->counter;
ret = q->qdisc->enqueue(skb, q->qdisc);
} else {
@@ -248,24 +253,31 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
const struct netem_skb_cb *cb
= (const struct netem_skb_cb *)skb->cb;
psched_time_t now;
- long delay;
/* if more time remaining? */
PSCHED_GET_TIME(now);
- delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
- pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
- if (delay <= 0) {
+
+ if (PSCHED_TLESS(cb->time_to_send, now)) {
pr_debug("netem_dequeue: return skb=%p\n", skb);
sch->q.qlen--;
sch->flags &= ~TCQ_F_THROTTLED;
return skb;
- }
+ } else {
+ psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now);
+
+ if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
+ sch->qstats.drops++;
- mod_timer(&q->timer, jiffies + delay);
- sch->flags |= TCQ_F_THROTTLED;
+ /* After this qlen is confused */
+ printk(KERN_ERR "netem: queue discpline %s could not requeue\n",
+ q->qdisc->ops->id);
- if (q->qdisc->ops->requeue(skb, q->qdisc) != 0)
- sch->qstats.drops++;
+ sch->q.qlen--;
+ }
+
+ mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay));
+ sch->flags |= TCQ_F_THROTTLED;
+ }
}
return NULL;
@@ -290,11 +302,16 @@ static void netem_reset(struct Qdisc *sch)
del_timer_sync(&q->timer);
}
+/* Pass size change message down to embedded FIFO */
static int set_fifo_limit(struct Qdisc *q, int limit)
{
struct rtattr *rta;
int ret = -ENOMEM;
+ /* Hack to avoid sending change message to non-FIFO */
+ if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
+ return 0;
+
rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
if (rta) {
rta->rta_type = RTM_NEWQDISC;
@@ -426,6 +443,84 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
return 0;
}
+/*
+ * Special case version of FIFO queue for use by netem.
+ * It queues in order based on timestamps in skb's
+ */
+/* Private per-qdisc state of the tfifo queue. */
+struct fifo_sched_data {
+ u32 limit; /* tfifo_enqueue drops new packets once the queue length reaches this */
+};
+
+/*
+ * tfifo_enqueue - insert a packet into the queue ordered by time_to_send
+ * @nskb: packet to queue; its cb area holds a struct netem_skb_cb
+ * @sch: the tfifo qdisc
+ *
+ * Walks the queue from the tail towards the head and inserts @nskb after the
+ * last packet whose time_to_send is not later than its own. Packets carrying
+ * the same timestamp therefore keep their arrival order; breaking only on a
+ * strictly earlier timestamp would place the new packet in front of its
+ * equal-time predecessors and reorder them.
+ *
+ * Returns NET_XMIT_SUCCESS when queued, otherwise the result of qdisc_drop()
+ * when the queue already holds q->limit packets.
+ */
+static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+	struct fifo_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *list = &sch->q;
+	const struct netem_skb_cb *ncb
+		= (const struct netem_skb_cb *)nskb->cb;
+	struct sk_buff *skb;
+
+	if (unlikely(skb_queue_len(list) >= q->limit))
+		return qdisc_drop(nskb, sch);
+
+	skb_queue_reverse_walk(list, skb) {
+		const struct netem_skb_cb *cb
+			= (const struct netem_skb_cb *)skb->cb;
+
+		/* Stop at the first packet due at or before the new one. */
+		if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send))
+			break;
+	}
+
+	/*
+	 * If the walk ran off the front, skb is the list head itself and
+	 * the new packet becomes the first entry.
+	 */
+	__skb_queue_after(list, skb, nskb);
+
+	sch->qstats.backlog += nskb->len;
+	sch->bstats.bytes += nskb->len;
+	sch->bstats.packets++;
+
+	return NET_XMIT_SUCCESS;
+}
+
+/*
+ * tfifo_init - set the queue limit of a tfifo qdisc
+ * @sch: the tfifo qdisc
+ * @opt: optional attribute carrying a struct tc_fifo_qopt, or NULL
+ *
+ * Without @opt the limit falls back to the device's tx_queue_len, but never
+ * below 1. With @opt the limit is taken from the attribute payload.
+ * Also installed as the ->change handler in tfifo_qdisc_ops.
+ *
+ * Returns 0 on success, -EINVAL if the payload is shorter than
+ * struct tc_fifo_qopt.
+ */
+static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct fifo_sched_data *priv = qdisc_priv(sch);
+	struct tc_fifo_qopt *ctl;
+
+	if (!opt) {
+		priv->limit = max_t(u32, sch->dev->tx_queue_len, 1);
+		return 0;
+	}
+
+	if (RTA_PAYLOAD(opt) < sizeof(*ctl))
+		return -EINVAL;
+
+	ctl = RTA_DATA(opt);
+	priv->limit = ctl->limit;
+
+	return 0;
+}
+
+/*
+ * tfifo_dump - report the current queue limit
+ * @sch: the tfifo qdisc
+ * @skb: message buffer the TCA_OPTIONS attribute is appended to
+ *
+ * Returns skb->len after the attribute has been added, or -1 through the
+ * rtattr_failure label.
+ */
+static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fifo_sched_data *q = qdisc_priv(sch);
+ struct tc_fifo_qopt opt = { .limit = q->limit };
+
+ /* NOTE(review): RTA_PUT presumably jumps to rtattr_failure when the attribute does not fit - confirm */
+ RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ return skb->len;
+
+rtattr_failure:
+ return -1;
+}
+
+/*
+ * Operations table for the "tfifo" qdisc that netem_init creates as its
+ * inner queue. Only enqueue, init/change and dump are tfifo-specific; the
+ * remaining operations point at the shared qdisc_* helpers.
+ */
+static struct Qdisc_ops tfifo_qdisc_ops = {
+	.id		= "tfifo",
+	.priv_size	= sizeof(struct fifo_sched_data),
+
+	/* configuration: init and change share one handler */
+	.init		= tfifo_init,
+	.change		= tfifo_init,
+	.dump		= tfifo_dump,
+	.reset		= qdisc_reset_queue,
+
+	/* data path */
+	.enqueue	= tfifo_enqueue,
+	.dequeue	= qdisc_dequeue_head,
+	.requeue	= qdisc_requeue,
+	.drop		= qdisc_queue_drop,
+};
+
static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
struct netem_sched_data *q = qdisc_priv(sch);
@@ -438,7 +533,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
q->timer.function = netem_watchdog;
q->timer.data = (unsigned long) sch;
- q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+ q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops);
if (!q->qdisc) {
pr_debug("netem: qdisc create failed\n");
return -ENOMEM;
@@ -601,6 +696,7 @@ static struct Qdisc_ops netem_qdisc_ops = {
static int __init netem_module_init(void)
{
+ pr_info("netem: version " VERSION "\n");
return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)