From 713b15b3781240653d2b38414da3f4567dcbcf91 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:26:58 -0600 Subject: lguest: be paranoid about guest playing with device descriptors. We can't trust the values in the device descriptor table once the guest has booted, so keep local copies. They could set them to strange values then cause us to segv (they're 8 bit values, so they can't make our pointers go too wild). This becomes more important with the following patches which read them. Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index d36fcc0f271..e65d6cbf241 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -126,9 +126,13 @@ struct device /* The linked-list pointer. */ struct device *next; - /* The this device's descriptor, as mapped into the Guest. */ + /* The device's descriptor, as mapped into the Guest. */ struct lguest_device_desc *desc; + /* We can't trust desc values once Guest has booted: we use these. */ + unsigned int feature_len; + unsigned int num_vq; + /* The name of this device, for --verbose. */ const char *name; @@ -245,7 +249,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) static u8 *get_feature_bits(struct device *dev) { return (u8 *)(dev->desc + 1) - + dev->desc->num_vq * sizeof(struct lguest_vqconfig); + + dev->num_vq * sizeof(struct lguest_vqconfig); } /*L:100 The Launcher code itself takes us out into userspace, that scary place @@ -979,8 +983,8 @@ static void update_device_status(struct device *dev) verbose("Resetting device %s\n", dev->name); /* Clear any features they've acked. */ - memset(get_feature_bits(dev) + dev->desc->feature_len, 0, - dev->desc->feature_len); + memset(get_feature_bits(dev) + dev->feature_len, 0, + dev->feature_len); /* Zero out the virtqueues. */ for (vq = dev->vq; vq; vq = vq->next) { @@ -994,12 +998,12 @@ static void update_device_status(struct device *dev) unsigned int i; verbose("Device %s OK: offered", dev->name); - for (i = 0; i < dev->desc->feature_len; i++) + for (i = 0; i < dev->feature_len; i++) verbose(" %02x", get_feature_bits(dev)[i]); verbose(", accepted"); - for (i = 0; i < dev->desc->feature_len; i++) + for (i = 0; i < dev->feature_len; i++) verbose(" %02x", get_feature_bits(dev) - [dev->desc->feature_len+i]); + [dev->feature_len+i]); if (dev->ready) dev->ready(dev); @@ -1129,8 +1133,8 @@ static void handle_input(int fd) static u8 *device_config(const struct device *dev) { return (void *)(dev->desc + 1) - + dev->desc->num_vq * sizeof(struct lguest_vqconfig) - + dev->desc->feature_len * 2; + + dev->num_vq * sizeof(struct lguest_vqconfig) + + dev->feature_len * 2; } /* This routine allocates a new "struct lguest_device_desc" from descriptor @@ -1191,6 +1195,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, * yet, otherwise we'd be overwriting them. */ assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); memcpy(device_config(dev), &vq->config, sizeof(vq->config)); + dev->num_vq++; dev->desc->num_vq++; verbose("Virtqueue page %#lx\n", to_guest_phys(p)); @@ -1219,7 +1224,7 @@ static void add_feature(struct device *dev, unsigned bit) /* We can't extend the feature bits once we've added config bytes */ if (dev->desc->feature_len <= bit / CHAR_BIT) { assert(dev->desc->config_len == 0); - dev->desc->feature_len = (bit / CHAR_BIT) + 1; + dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; } features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); @@ -1259,6 +1264,8 @@ static struct device *new_device(const char *name, u16 type, int fd, dev->name = name; dev->vq = NULL; dev->ready = NULL; + dev->feature_len = 0; + dev->num_vq = 0; /* Append to device list. Prepending to a single-linked list is * easier, but the user expects the devices to be arranged on the bus -- cgit v1.2.3 From 56739c802ca845435f60e909104637880e14c769 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:26:59 -0600 Subject: lguest: cleanup passing of /dev/lguest fd around example launcher. We hand the /dev/lguest fd everywhere; it's far neater to just make it a global (it already is, in fact, hidden in the waker_fds struct). Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 102 +++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 55 deletions(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index e65d6cbf241..1a2b906a3ae 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -79,7 +79,6 @@ static bool verbose; /* File descriptors for the Waker. */ struct { int pipe[2]; - int lguest_fd; } waker_fds; /* The pointer to the start of guest memory. */ @@ -89,6 +88,8 @@ static unsigned long guest_limit, guest_max; /* The pipe for signal hander to write to. */ static int timeoutpipe[2]; static unsigned int timeout_usec = 500; +/* The /dev/lguest file descriptor. */ +static int lguest_fd; /* a per-cpu variable indicating whose vcpu is currently running */ static unsigned int __thread cpu_id; @@ -139,7 +140,7 @@ struct device /* If handle_input is set, it wants to be called when this file * descriptor is ready. */ int fd; - bool (*handle_input)(int fd, struct device *me); + bool (*handle_input)(struct device *me); /* Any queues attached to this device */ struct virtqueue *vq; @@ -169,7 +170,7 @@ struct virtqueue u16 last_avail_idx; /* The routine to call when the Guest pings us, or timeout. */ - void (*handle_output)(int fd, struct virtqueue *me, bool timeout); + void (*handle_output)(struct virtqueue *me, bool timeout); /* Outstanding buffers */ unsigned int inflight; @@ -509,21 +510,16 @@ static void concat(char *dst, char *args[]) * saw the arguments it expects when we looked at initialize() in lguest_user.c: * the base of Guest "physical" memory, the top physical page to allow and the * entry point for the Guest. */ -static int tell_kernel(unsigned long start) +static void tell_kernel(unsigned long start) { unsigned long args[] = { LHREQ_INITIALIZE, (unsigned long)guest_base, guest_limit / getpagesize(), start }; - int fd; - verbose("Guest: %p - %p (%#lx)\n", guest_base, guest_base + guest_limit, guest_limit); - fd = open_or_die("/dev/lguest", O_RDWR); - if (write(fd, args, sizeof(args)) < 0) + lguest_fd = open_or_die("/dev/lguest", O_RDWR); + if (write(lguest_fd, args, sizeof(args)) < 0) err(1, "Writing to /dev/lguest"); - - /* We return the /dev/lguest file descriptor to control this Guest */ - return fd; } /*:*/ @@ -583,21 +579,18 @@ static int waker(void *unused) } /* Send LHREQ_BREAK command to snap the Launcher out of it. */ - pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id); + pwrite(lguest_fd, args, sizeof(args), cpu_id); } return 0; } /* This routine just sets up a pipe to the Waker process. */ -static void setup_waker(int lguest_fd) +static void setup_waker(void) { /* This pipe is closed when Launcher dies, telling Waker. */ if (pipe(waker_fds.pipe) != 0) err(1, "Creating pipe for Waker"); - /* Waker also needs to know the lguest fd */ - waker_fds.lguest_fd = lguest_fd; - if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1) err(1, "Creating Waker"); } @@ -727,7 +720,7 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len) } /* This actually sends the interrupt for this virtqueue */ -static void trigger_irq(int fd, struct virtqueue *vq) +static void trigger_irq(struct virtqueue *vq) { unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; @@ -737,16 +730,15 @@ static void trigger_irq(int fd, struct virtqueue *vq) return; /* Send the Guest an interrupt tell them we used something up. */ - if (write(fd, buf, sizeof(buf)) != 0) + if (write(lguest_fd, buf, sizeof(buf)) != 0) err(1, "Triggering irq %i", vq->config.irq); } /* And here's the combo meal deal. Supersize me! */ -static void add_used_and_trigger(int fd, struct virtqueue *vq, - unsigned int head, int len) +static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) { add_used(vq, head, len); - trigger_irq(fd, vq); + trigger_irq(vq); } /* @@ -770,7 +762,7 @@ struct console_abort }; /* This is the routine which handles console input (ie. stdin). */ -static bool handle_console_input(int fd, struct device *dev) +static bool handle_console_input(struct device *dev) { int len; unsigned int head, in_num, out_num; @@ -804,7 +796,7 @@ static bool handle_console_input(int fd, struct device *dev) } /* Tell the Guest about the new input. */ - add_used_and_trigger(fd, dev->vq, head, len); + add_used_and_trigger(dev->vq, head, len); /* Three ^C within one second? Exit. * @@ -824,7 +816,7 @@ static bool handle_console_input(int fd, struct device *dev) close(waker_fds.pipe[1]); /* Just in case Waker is blocked in BREAK, send * unbreak now. */ - write(fd, args, sizeof(args)); + write(lguest_fd, args, sizeof(args)); exit(2); } abort->count = 0; @@ -839,7 +831,7 @@ static bool handle_console_input(int fd, struct device *dev) /* Handling output for console is simple: we just get all the output buffers * and write them to stdout. */ -static void handle_console_output(int fd, struct virtqueue *vq, bool timeout) +static void handle_console_output(struct virtqueue *vq, bool timeout) { unsigned int head, out, in; int len; @@ -850,7 +842,7 @@ static void handle_console_output(int fd, struct virtqueue *vq, bool timeout) if (in) errx(1, "Input buffers in output queue?"); len = writev(STDOUT_FILENO, iov, out); - add_used_and_trigger(fd, vq, head, len); + add_used_and_trigger(vq, head, len); } } @@ -879,7 +871,7 @@ static void block_vq(struct virtqueue *vq) * and write them (ignoring the first element) to this device's file descriptor * (/dev/net/tun). */ -static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) +static void handle_net_output(struct virtqueue *vq, bool timeout) { unsigned int head, out, in, num = 0; int len; @@ -893,7 +885,7 @@ static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) len = writev(vq->dev->fd, iov, out); if (len < 0) err(1, "Writing network packet to tun"); - add_used_and_trigger(fd, vq, head, len); + add_used_and_trigger(vq, head, len); num++; } @@ -917,7 +909,7 @@ static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) /* This is where we handle a packet coming in from the tun device to our * Guest. */ -static bool handle_tun_input(int fd, struct device *dev) +static bool handle_tun_input(struct device *dev) { unsigned int head, in_num, out_num; int len; @@ -946,7 +938,7 @@ static bool handle_tun_input(int fd, struct device *dev) err(1, "reading network"); /* Tell the Guest about the new packet. */ - add_used_and_trigger(fd, dev->vq, head, len); + add_used_and_trigger(dev->vq, head, len); verbose("tun input packet len %i [%02x %02x] (%s)\n", len, ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], @@ -959,18 +951,18 @@ static bool handle_tun_input(int fd, struct device *dev) /*L:215 This is the callback attached to the network and console input * virtqueues: it ensures we try again, in case we stopped console or net * delivery because Guest didn't have any buffers. */ -static void enable_fd(int fd, struct virtqueue *vq, bool timeout) +static void enable_fd(struct virtqueue *vq, bool timeout) { add_device_fd(vq->dev->fd); /* Snap the Waker out of its select loop. */ write(waker_fds.pipe[1], "", 1); } -static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) +static void net_enable_fd(struct virtqueue *vq, bool timeout) { /* We don't need to know again when Guest refills receive buffer. */ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - enable_fd(fd, vq, timeout); + enable_fd(vq, timeout); } /* When the Guest tells us they updated the status field, we handle it. */ @@ -1011,7 +1003,7 @@ static void update_device_status(struct device *dev) } /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ -static void handle_output(int fd, unsigned long addr) +static void handle_output(unsigned long addr) { struct device *i; struct virtqueue *vq; @@ -1039,7 +1031,7 @@ static void handle_output(int fd, unsigned long addr) if (strcmp(vq->dev->name, "console") != 0) verbose("Output to %s\n", vq->dev->name); if (vq->handle_output) - vq->handle_output(fd, vq, false); + vq->handle_output(vq, false); return; } } @@ -1053,7 +1045,7 @@ static void handle_output(int fd, unsigned long addr) strnlen(from_guest_phys(addr), guest_limit - addr)); } -static void handle_timeout(int fd) +static void handle_timeout(void) { char buf[32]; struct device *i; @@ -1071,14 +1063,14 @@ static void handle_timeout(int fd) vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; vq->blocked = false; if (vq->handle_output) - vq->handle_output(fd, vq, true); + vq->handle_output(vq, true); } } } /* This is called when the Waker wakes us up: check for incoming file * descriptors. */ -static void handle_input(int fd) +static void handle_input(void) { /* select() wants a zeroed timeval to mean "don't wait". */ struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; @@ -1100,7 +1092,7 @@ static void handle_input(int fd) * descriptors and a method of handling them. */ for (i = devices.dev; i; i = i->next) { if (i->handle_input && FD_ISSET(i->fd, &fds)) { - if (i->handle_input(fd, i)) + if (i->handle_input(i)) continue; /* If handle_input() returns false, it means we @@ -1114,7 +1106,7 @@ static void handle_input(int fd) /* Is this the timeout fd? */ if (FD_ISSET(timeoutpipe[0], &fds)) - handle_timeout(fd); + handle_timeout(); } } @@ -1163,7 +1155,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) /* Each device descriptor is followed by the description of its virtqueues. We * specify how many descriptors the virtqueue is to have. */ static void add_virtqueue(struct device *dev, unsigned int num_descs, - void (*handle_output)(int, struct virtqueue *, bool)) + void (*handle_output)(struct virtqueue *, bool)) { unsigned int pages; struct virtqueue **i, *vq = malloc(sizeof(*vq)); @@ -1249,7 +1241,7 @@ static void set_config(struct device *dev, unsigned len, const void *conf) * * See what I mean about userspace being boring? */ static struct device *new_device(const char *name, u16 type, int fd, - bool (*handle_input)(int, struct device *)) + bool (*handle_input)(struct device *)) { struct device *dev = malloc(sizeof(*dev)); @@ -1678,7 +1670,7 @@ static int io_thread(void *_dev) /* Now we've seen the I/O thread, we return to the Launcher to see what happens * when that thread tells us it's completed some I/O. */ -static bool handle_io_finish(int fd, struct device *dev) +static bool handle_io_finish(struct device *dev) { char c; @@ -1688,12 +1680,12 @@ static bool handle_io_finish(int fd, struct device *dev) exit(1); /* It did some work, so trigger the irq. */ - trigger_irq(fd, dev->vq); + trigger_irq(dev->vq); return true; } /* When the Guest submits some I/O, we just need to wake the I/O thread. */ -static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout) +static void handle_virtblk_output(struct virtqueue *vq, bool timeout) { struct vblk_info *vblk = vq->dev->priv; char c = 0; @@ -1771,7 +1763,7 @@ static void setup_block_file(const char *filename) * console is the reverse. * * The same logic applies, however. */ -static bool handle_rng_input(int fd, struct device *dev) +static bool handle_rng_input(struct device *dev) { int len; unsigned int head, in_num, out_num, totlen = 0; @@ -1800,7 +1792,7 @@ static bool handle_rng_input(int fd, struct device *dev) } /* Tell the Guest about the new input. */ - add_used_and_trigger(fd, dev->vq, head, totlen); + add_used_and_trigger(dev->vq, head, totlen); /* Everything went OK! */ return true; @@ -1841,7 +1833,7 @@ static void __attribute__((noreturn)) restart_guest(void) /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves * its input and output, and finally, lays it to rest. */ -static void __attribute__((noreturn)) run_guest(int lguest_fd) +static void __attribute__((noreturn)) run_guest(void) { for (;;) { unsigned long args[] = { LHREQ_BREAK, 0 }; @@ -1855,7 +1847,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) /* One unsigned long means the Guest did HCALL_NOTIFY */ if (readval == sizeof(notify_addr)) { verbose("Notify on address %#lx\n", notify_addr); - handle_output(lguest_fd, notify_addr); + handle_output(notify_addr); continue; /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { @@ -1875,7 +1867,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) continue; /* Service input, then unset the BREAK to release the Waker. */ - handle_input(lguest_fd); + handle_input(); if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) err(1, "Resetting break"); } @@ -1911,8 +1903,8 @@ int main(int argc, char *argv[]) /* Memory, top-level pagetable, code startpoint and size of the * (optional) initrd. */ unsigned long mem = 0, start, initrd_size = 0; - /* Two temporaries and the /dev/lguest file descriptor. */ - int i, c, lguest_fd; + /* Two temporaries. */ + int i, c; /* The boot information for the Guest. */ struct boot_params *boot; /* If they specify an initrd file to load. */ @@ -2030,15 +2022,15 @@ int main(int argc, char *argv[]) /* We tell the kernel to initialize the Guest: this returns the open * /dev/lguest file descriptor. */ - lguest_fd = tell_kernel(start); + tell_kernel(start); /* We clone off a thread, which wakes the Launcher whenever one of the * input file descriptors needs attention. We call this the Waker, and * we'll cover it in a moment. */ - setup_waker(lguest_fd); + setup_waker(); /* Finally, run the Guest. This doesn't return. */ - run_guest(lguest_fd); + run_guest(); } /*:*/ -- cgit v1.2.3 From f7027c6387d0c3acf569845165ec7947e2083c82 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:00 -0600 Subject: lguest: get more serious about wmb() in example Launcher code Since the Launcher process runs the Guest, it doesn't have to be very serious about its barriers: the Guest isn't running while we are (Guest is UP). Before we change to use threads to service devices, we need to fix this. Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 1a2b906a3ae..1e31d1ec12a 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -182,9 +182,10 @@ struct virtqueue /* Remember the arguments to the program so we can "reboot" */ static char **main_args; -/* Since guest is UP and we don't run at the same time, we don't need barriers. - * But I include them in the code in case others copy it. */ -#define wmb() +/* We have to be careful with barriers: our devices are all run in separate + * threads and so we need to make sure that changes visible to the Guest happen + * in precise order. */ +#define wmb() __asm__ __volatile__("" : : : "memory") /* Convert an iovec element to the given type. * -- cgit v1.2.3 From ebf9a5a99c1a464afe0b4dfa64416fc8b273bc5c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:01 -0600 Subject: lguest: remove invalid interrupt forcing logic. 20887611523e749d99cc7d64ff6c97d27529fbae (lguest: notify on empty) introduced lguest support for the VIRTIO_F_NOTIFY_ON_EMPTY flag, but in fact it turned on interrupts all the time. Because we always process one buffer at a time, the inflight count is always 0 when call trigger_irq and so we always ignore VRING_AVAIL_F_NO_INTERRUPT from the Guest. It should be looking to see if there are more buffers in the Guest's queue: if it's empty, then we force an interrupt. This makes little difference, since we usually have an empty queue; but that's the subject of another patch. Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 1e31d1ec12a..9f3240c4571 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -172,9 +172,6 @@ struct virtqueue /* The routine to call when the Guest pings us, or timeout. */ void (*handle_output)(struct virtqueue *me, bool timeout); - /* Outstanding buffers */ - unsigned int inflight; - /* Is this blocked awaiting a timer? */ bool blocked; }; @@ -699,7 +696,6 @@ static unsigned get_vq_desc(struct virtqueue *vq, errx(1, "Looped descriptor"); } while ((i = next_desc(vq, i)) != vq->vring.num); - vq->inflight++; return head; } @@ -717,7 +713,6 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len) /* Make sure buffer is written before we update index. */ wmb(); vq->vring.used->idx++; - vq->inflight--; } /* This actually sends the interrupt for this virtqueue */ @@ -727,7 +722,7 @@ static void trigger_irq(struct virtqueue *vq) /* If they don't want an interrupt, don't send one, unless empty. */ if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && vq->inflight) + && lg_last_avail(vq) != vq->vring.avail->idx) return; /* Send the Guest an interrupt tell them we used something up. */ @@ -1171,7 +1166,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->next = NULL; vq->last_avail_idx = 0; vq->dev = dev; - vq->inflight = 0; vq->blocked = false; /* Initialize the configuration. */ -- cgit v1.2.3 From 2644f17d6c932929fd68cfec95691490947e0fd1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:03 -0600 Subject: lguest: clean up example launcher compile flags. 18 months ago 5bbf89fc260830f3f58b331d946a16b39ad1ca2d changed to loading bzImages directly, and no longer manually ungzipping them, so we no longer need libz. Also, -m32 is useful for those on 64-bit platforms (and harmless on 32-bit). Reported-by: Ron Minnich Signed-off-by: Rusty Russell --- Documentation/lguest/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile index 1f4f9e888bd..28c8cdfcafd 100644 --- a/Documentation/lguest/Makefile +++ b/Documentation/lguest/Makefile @@ -1,6 +1,5 @@ # This creates the demonstration utility "lguest" which runs a Linux guest. -CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE -LDLIBS:=-lz +CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE all: lguest -- cgit v1.2.3 From e606490c440900e50ccf73a54f6fc6150ff40815 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:04 -0600 Subject: lguest: clean up length-used value in example launcher The "len" field in the used ring for virtio indicates the number of bytes *written* to the buffer. This means the guest doesn't have to zero the buffers in advance as it always knows the used length. Erroneously, the console and network example code puts the length *read* into that field. The guest ignores it, but it's wrong. Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 9f3240c4571..8704600c5e4 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -830,15 +830,14 @@ static bool handle_console_input(struct device *dev) static void handle_console_output(struct virtqueue *vq, bool timeout) { unsigned int head, out, in; - int len; struct iovec iov[vq->vring.num]; /* Keep getting output buffers from the Guest until we run out. */ while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { if (in) errx(1, "Input buffers in output queue?"); - len = writev(STDOUT_FILENO, iov, out); - add_used_and_trigger(vq, head, len); + writev(STDOUT_FILENO, iov, out); + add_used_and_trigger(vq, head, 0); } } @@ -870,7 +869,6 @@ static void block_vq(struct virtqueue *vq) static void handle_net_output(struct virtqueue *vq, bool timeout) { unsigned int head, out, in, num = 0; - int len; struct iovec iov[vq->vring.num]; static int last_timeout_num; @@ -878,10 +876,9 @@ static void handle_net_output(struct virtqueue *vq, bool timeout) while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { if (in) errx(1, "Input buffers in output queue?"); - len = writev(vq->dev->fd, iov, out); - if (len < 0) + if (writev(vq->dev->fd, iov, out) < 0) err(1, "Writing network packet to tun"); - add_used_and_trigger(vq, head, len); + add_used_and_trigger(vq, head, 0); num++; } -- cgit v1.2.3 From 7b5c806c35f6ff76b2e36a8b5b1513c8a83fcff7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:05 -0600 Subject: lguest: fix writev returning short on console output I've never seen it here, but I can't find anywhere that says writev will write everything. Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 8704600c5e4..02fa524cf4a 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -836,7 +836,12 @@ static void handle_console_output(struct virtqueue *vq, bool timeout) while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { if (in) errx(1, "Input buffers in output queue?"); - writev(STDOUT_FILENO, iov, out); + while (!iov_empty(iov, out)) { + int len = writev(STDOUT_FILENO, iov, out); + if (len <= 0) + err(1, "Write to stdout gave %i", len); + iov_consume(iov, out, len); + } add_used_and_trigger(vq, head, 0); } } -- cgit v1.2.3 From acdd0b6292b282c4511897ac2691a47befbf1c6a Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Fri, 12 Jun 2009 22:27:07 -0600 Subject: lguest: PAE support This version requires that host and guest have the same PAE status. NX cap is not offered to the guest, yet. Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.txt | 1 - 1 file changed, 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 28c747362f9..efb3a6a045a 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt @@ -37,7 +37,6 @@ Running Lguest: "Paravirtualized guest support" = Y "Lguest guest support" = Y "High Memory Support" = off/4GB - "PAE (Physical Address Extension) Support" = N "Alignment value to which kernel should be aligned" = 0x100000 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and CONFIG_PHYSICAL_ALIGN=0x100000) -- cgit v1.2.3 From 659a0e6633567246edcb7bd400c7e2bece9237d9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:10 -0600 Subject: lguest: have example Launcher service all devices in separate threads Currently lguest has three threads: the main Launcher thread, a Waker thread, and a thread for the block device (because synchronous block was simply too painful to bear). The Waker selects() on all the input file descriptors (eg. stdin, net devices, pipe to the block thread) and when one becomes readable it calls into the kernel to kick the Launcher thread out into userspace, which repeats the poll, services the device(s), and then tells the kernel to release the Waker before re-entering the kernel to run the Guest. Also, to make a slightly-decent network transmit routine, the Launcher would suppress further network interrupts while it set a timer: that signal handler would write to a pipe, which would rouse the Waker which would prod the Launcher out of the kernel to check the network device again. Now we can convert all our virtqueues to separate threads: each one has a separate eventfd for when the Guest pokes the device, and can trigger interrupts in the Guest directly. The linecount shows how much this simplifies, but to really bring it home, here's an strace analysis of single Guest->Host ping before: * Guest sends packet, notifies xmit vq, return control to Launcher * Launcher clears notification flag on xmit ring * Launcher writes packet to TUN device writev(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"\366\r\224`\2058\272m\224vf\274\10\0E\0\0T\0\0@\0@\1\265"..., 98}], 2) = 108 * Launcher sets up interrupt for Guest (xmit ring is empty) write(10, "\2\0\0\0\3\0\0\0", 8) = 0 * Launcher sets up timer for interrupt mitigation setitimer(ITIMER_REAL, {it_interval={0, 0}, it_value={0, 505}}, NULL) = 0 * Launcher re-runs guest pread64(10, 0xbfa5f4d4, 4, 0) ... * Waker notices reply packet in tun device (it was in select) select(12, [0 3 4 6 11], NULL, NULL, NULL) = 1 (in [4]) * Waker kicks Launcher out of guest: pwrite64(10, "\3\0\0\0\1\0\0\0", 8, 0) = 0 * Launcher returns from running guest: ... = -1 EAGAIN (Resource temporarily unavailable) * Launcher looks at input fds: select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 1 (in [4], left {0, 0}) * Launcher reads pong from tun device: readv(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"\272m\224vf\274\366\r\224`\2058\10\0E\0\0T\364\26\0\0@"..., 1518}], 2) = 108 * Launcher injects guest notification: write(10, "\2\0\0\0\2\0\0\0", 8) = 0 * Launcher rechecks fds: select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 0 (Timeout) * Launcher clears Waker: pwrite64(10, "\3\0\0\0\0\0\0\0", 8, 0) = 0 * Launcher reruns Guest: pread64(10, 0xbfa5f4d4, 4, 0) = ? ERESTARTSYS (To be restarted) * Signal comes in, uses pipe to wake up Launcher: --- SIGALRM (Alarm clock) @ 0 (0) --- write(8, "\0", 1) = 1 sigreturn() = ? (mask now []) * Waker sees write on pipe: select(12, [0 3 4 6 11], NULL, NULL, NULL) = 1 (in [6]) * Waker kicks Launcher out of Guest: pwrite64(10, "\3\0\0\0\1\0\0\0", 8, 0) = 0 * Launcher exits from kernel: pread64(10, 0xbfa5f4d4, 4, 0) = -1 EAGAIN (Resource temporarily unavailable) * Launcher looks to see what fd woke it: select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 1 (in [6], left {0, 0}) * Launcher reads timeout fd, sets notification flag on xmit ring read(6, "\0", 32) = 1 * Launcher rechecks fds: select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 0 (Timeout) * Launcher clears Waker: pwrite64(10, "\3\0\0\0\0\0\0\0", 8, 0) = 0 * Launcher resumes Guest: pread64(10, "\0p\0\4", 4, 0) .... strace analysis of single Guest->Host ping after: * Guest sends packet, notifies xmit vq, creates event on eventfd. * Network xmit thread wakes from read on eventfd: read(7, "\1\0\0\0\0\0\0\0", 8) = 8 * Network xmit thread writes packet to TUN device writev(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"J\217\232FI\37j\27\375\276\0\304\10\0E\0\0T\0\0@\0@\1\265"..., 98}], 2) = 108 * Network recv thread wakes up from read on tunfd: readv(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"j\27\375\276\0\304J\217\232FI\37\10\0E\0\0TiO\0\0@\1\214"..., 1518}], 2) = 108 * Network recv thread sets up interrupt for the Guest write(6, "\2\0\0\0\2\0\0\0", 8) = 0 * Network recv thread goes back to reading tunfd 13:39:42.460285 readv(4, * Network xmit thread sets up interrupt for Guest (xmit ring is empty) write(6, "\2\0\0\0\3\0\0\0", 8) = 0 * Network xmit thread goes back to reading from eventfd read(7, Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 833 +++++++++++++----------------------------- 1 file changed, 259 insertions(+), 574 deletions(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 02fa524cf4a..5470b8ed214 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -59,7 +60,6 @@ typedef uint8_t u8; /*:*/ #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ -#define NET_PEERNUM 1 #define BRIDGE_PFX "bridge:" #ifndef SIOCBRADDIF #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ @@ -76,18 +76,10 @@ static bool verbose; do { if (verbose) printf(args); } while(0) /*:*/ -/* File descriptors for the Waker. */ -struct { - int pipe[2]; -} waker_fds; - /* The pointer to the start of guest memory. */ static void *guest_base; /* The maximum guest physical address allowed, and maximum possible. */ static unsigned long guest_limit, guest_max; -/* The pipe for signal hander to write to. */ -static int timeoutpipe[2]; -static unsigned int timeout_usec = 500; /* The /dev/lguest file descriptor. */ static int lguest_fd; @@ -97,11 +89,6 @@ static unsigned int __thread cpu_id; /* This is our list of devices. */ struct device_list { - /* Summary information about the devices in our list: ready to pass to - * select() to ask which need servicing.*/ - fd_set infds; - int max_infd; - /* Counter to assign interrupt numbers. */ unsigned int next_irq; @@ -137,16 +124,11 @@ struct device /* The name of this device, for --verbose. */ const char *name; - /* If handle_input is set, it wants to be called when this file - * descriptor is ready. */ - int fd; - bool (*handle_input)(struct device *me); - /* Any queues attached to this device */ struct virtqueue *vq; - /* Handle status being finalized (ie. feature bits stable). */ - void (*ready)(struct device *me); + /* Is it operational */ + bool running; /* Device-specific data. */ void *priv; @@ -169,16 +151,20 @@ struct virtqueue /* Last available index we saw. */ u16 last_avail_idx; - /* The routine to call when the Guest pings us, or timeout. */ - void (*handle_output)(struct virtqueue *me, bool timeout); + /* Eventfd where Guest notifications arrive. */ + int eventfd; - /* Is this blocked awaiting a timer? */ - bool blocked; + /* Function for the thread which is servicing this virtqueue. */ + void (*service)(struct virtqueue *vq); + pid_t thread; }; /* Remember the arguments to the program so we can "reboot" */ static char **main_args; +/* The original tty settings to restore on exit. */ +static struct termios orig_term; + /* We have to be careful with barriers: our devices are all run in separate * threads and so we need to make sure that changes visible to the Guest happen * in precise order. */ @@ -521,78 +507,6 @@ static void tell_kernel(unsigned long start) } /*:*/ -static void add_device_fd(int fd) -{ - FD_SET(fd, &devices.infds); - if (fd > devices.max_infd) - devices.max_infd = fd; -} - -/*L:200 - * The Waker. - * - * With console, block and network devices, we can have lots of input which we - * need to process. We could try to tell the kernel what file descriptors to - * watch, but handing a file descriptor mask through to the kernel is fairly - * icky. - * - * Instead, we clone off a thread which watches the file descriptors and writes - * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host - * stop running the Guest. This causes the Launcher to return from the - * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset - * the LHREQ_BREAK and wake us up again. - * - * This, of course, is merely a different *kind* of icky. - * - * Given my well-known antipathy to threads, I'd prefer to use processes. But - * it's easier to share Guest memory with threads, and trivial to share the - * devices.infds as the Launcher changes it. - */ -static int waker(void *unused) -{ - /* Close the write end of the pipe: only the Launcher has it open. */ - close(waker_fds.pipe[1]); - - for (;;) { - fd_set rfds = devices.infds; - unsigned long args[] = { LHREQ_BREAK, 1 }; - unsigned int maxfd = devices.max_infd; - - /* We also listen to the pipe from the Launcher. */ - FD_SET(waker_fds.pipe[0], &rfds); - if (waker_fds.pipe[0] > maxfd) - maxfd = waker_fds.pipe[0]; - - /* Wait until input is ready from one of the devices. */ - select(maxfd+1, &rfds, NULL, NULL, NULL); - - /* Message from Launcher? */ - if (FD_ISSET(waker_fds.pipe[0], &rfds)) { - char c; - /* If this fails, then assume Launcher has exited. - * Don't do anything on exit: we're just a thread! */ - if (read(waker_fds.pipe[0], &c, 1) != 1) - _exit(0); - continue; - } - - /* Send LHREQ_BREAK command to snap the Launcher out of it. */ - pwrite(lguest_fd, args, sizeof(args), cpu_id); - } - return 0; -} - -/* This routine just sets up a pipe to the Waker process. */ -static void setup_waker(void) -{ - /* This pipe is closed when Launcher dies, telling Waker. */ - if (pipe(waker_fds.pipe) != 0) - err(1, "Creating pipe for Waker"); - - if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1) - err(1, "Creating Waker"); -} - /* * Device Handling. * @@ -642,25 +556,27 @@ static unsigned next_desc(struct virtqueue *vq, unsigned int i) * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * - * This function returns the descriptor number found, or vq->vring.num (which - * is never a valid descriptor number) if none was found. */ -static unsigned get_vq_desc(struct virtqueue *vq, - struct iovec iov[], - unsigned int *out_num, unsigned int *in_num) + * This function returns the descriptor number found. */ +static unsigned wait_for_vq_desc(struct virtqueue *vq, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num) { unsigned int i, head; - u16 last_avail; + u16 last_avail = lg_last_avail(vq); + + while (last_avail == vq->vring.avail->idx) { + u64 event; + + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) + errx(1, "Event read failed?"); + } /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail = lg_last_avail(vq); if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) errx(1, "Guest moved used index from %u to %u", last_avail, vq->vring.avail->idx); - /* If there's nothing new since last we looked, return invalid. */ - if (vq->vring.avail->idx == last_avail) - return vq->vring.num; - /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ head = vq->vring.avail->ring[last_avail % vq->vring.num]; @@ -740,15 +656,7 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) /* * The Console * - * Here is the input terminal setting we save, and the routine to restore them - * on exit so the user gets their terminal back. */ -static struct termios orig_term; -static void restore_term(void) -{ - tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); -} - -/* We associate some data with the console for our exit hack. */ + * We associate some data with the console for our exit hack. */ struct console_abort { /* How many times have they hit ^C? */ @@ -758,245 +666,235 @@ struct console_abort }; /* This is the routine which handles console input (ie. stdin). */ -static bool handle_console_input(struct device *dev) +static void console_input(struct virtqueue *vq) { int len; unsigned int head, in_num, out_num; - struct iovec iov[dev->vq->vring.num]; - struct console_abort *abort = dev->priv; - - /* First we need a console buffer from the Guests's input virtqueue. */ - head = get_vq_desc(dev->vq, iov, &out_num, &in_num); - - /* If they're not ready for input, stop listening to this file - * descriptor. We'll start again once they add an input buffer. */ - if (head == dev->vq->vring.num) - return false; + struct console_abort *abort = vq->dev->priv; + struct iovec iov[vq->vring.num]; + /* Make sure there's a descriptor waiting. */ + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); if (out_num) errx(1, "Output buffers in console in queue?"); - /* This is why we convert to iovecs: the readv() call uses them, and so - * it reads straight into the Guest's buffer. */ - len = readv(dev->fd, iov, in_num); + /* Read it in. */ + len = readv(STDIN_FILENO, iov, in_num); if (len <= 0) { - /* This implies that the console is closed, is /dev/null, or - * something went terribly wrong. */ + /* Ran out of input? */ warnx("Failed to get console input, ignoring console."); - /* Put the input terminal back. */ - restore_term(); - /* Remove callback from input vq, so it doesn't restart us. */ - dev->vq->handle_output = NULL; - /* Stop listening to this fd: don't call us again. */ - return false; + /* For simplicity, dying threads kill the whole Launcher. So + * just nap here. */ + for (;;) + pause(); } - /* Tell the Guest about the new input. */ - add_used_and_trigger(dev->vq, head, len); + add_used_and_trigger(vq, head, len); /* Three ^C within one second? Exit. * - * This is such a hack, but works surprisingly well. Each ^C has to be - * in a buffer by itself, so they can't be too fast. But we check that - * we get three within about a second, so they can't be too slow. */ - if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { - if (!abort->count++) - gettimeofday(&abort->start, NULL); - else if (abort->count == 3) { - struct timeval now; - gettimeofday(&now, NULL); - if (now.tv_sec <= abort->start.tv_sec+1) { - unsigned long args[] = { LHREQ_BREAK, 0 }; - /* Close the fd so Waker will know it has to - * exit. */ - close(waker_fds.pipe[1]); - /* Just in case Waker is blocked in BREAK, send - * unbreak now. */ - write(lguest_fd, args, sizeof(args)); - exit(2); - } - abort->count = 0; - } - } else - /* Any other key resets the abort counter. */ + * This is such a hack, but works surprisingly well. Each ^C has to + * be in a buffer by itself, so they can't be too fast. But we check + * that we get three within about a second, so they can't be too + * slow. */ + if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { abort->count = 0; + return; + } - /* Everything went OK! */ - return true; + abort->count++; + if (abort->count == 1) + gettimeofday(&abort->start, NULL); + else if (abort->count == 3) { + struct timeval now; + gettimeofday(&now, NULL); + /* Kill all Launcher processes with SIGINT, like normal ^C */ + if (now.tv_sec <= abort->start.tv_sec+1) + kill(0, SIGINT); + abort->count = 0; + } } -/* Handling output for console is simple: we just get all the output buffers - * and write them to stdout. */ -static void handle_console_output(struct virtqueue *vq, bool timeout) +/* This is the routine which handles console output (ie. stdout). */ +static void console_output(struct virtqueue *vq) { unsigned int head, out, in; struct iovec iov[vq->vring.num]; - /* Keep getting output buffers from the Guest until we run out. */ - while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { - if (in) - errx(1, "Input buffers in output queue?"); - while (!iov_empty(iov, out)) { - int len = writev(STDOUT_FILENO, iov, out); - if (len <= 0) - err(1, "Write to stdout gave %i", len); - iov_consume(iov, out, len); - } - add_used_and_trigger(vq, head, 0); + head = wait_for_vq_desc(vq, iov, &out, &in); + if (in) + errx(1, "Input buffers in console output queue?"); + while (!iov_empty(iov, out)) { + int len = writev(STDOUT_FILENO, iov, out); + if (len <= 0) + err(1, "Write to stdout gave %i", len); + iov_consume(iov, out, len); } -} - -/* This is called when we no longer want to hear about Guest changes to a - * virtqueue. This is more efficient in high-traffic cases, but it means we - * have to set a timer to check if any more changes have occurred. */ -static void block_vq(struct virtqueue *vq) -{ - struct itimerval itm; - - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - vq->blocked = true; - - itm.it_interval.tv_sec = 0; - itm.it_interval.tv_usec = 0; - itm.it_value.tv_sec = 0; - itm.it_value.tv_usec = timeout_usec; - - setitimer(ITIMER_REAL, &itm, NULL); + add_used_and_trigger(vq, head, 0); } /* * The Network * * Handling output for network is also simple: we get all the output buffers - * and write them (ignoring the first element) to this device's file descriptor - * (/dev/net/tun). + * and write them to /dev/net/tun. */ -static void handle_net_output(struct virtqueue *vq, bool timeout) +struct net_info { + int tunfd; +}; + +static void net_output(struct virtqueue *vq) { - unsigned int head, out, in, num = 0; + struct net_info *net_info = vq->dev->priv; + unsigned int head, out, in; struct iovec iov[vq->vring.num]; - static int last_timeout_num; - - /* Keep getting output buffers from the Guest until we run out. */ - while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { - if (in) - errx(1, "Input buffers in output queue?"); - if (writev(vq->dev->fd, iov, out) < 0) - err(1, "Writing network packet to tun"); - add_used_and_trigger(vq, head, 0); - num++; - } - /* Block further kicks and set up a timer if we saw anything. */ - if (!timeout && num) - block_vq(vq); - - /* We never quite know how long should we wait before we check the - * queue again for more packets. We start at 500 microseconds, and if - * we get fewer packets than last time, we assume we made the timeout - * too small and increase it by 10 microseconds. Otherwise, we drop it - * by one microsecond every time. It seems to work well enough. */ - if (timeout) { - if (num < last_timeout_num) - timeout_usec += 10; - else if (timeout_usec > 1) - timeout_usec--; - last_timeout_num = num; - } + head = wait_for_vq_desc(vq, iov, &out, &in); + if (in) + errx(1, "Input buffers in net output queue?"); + if (writev(net_info->tunfd, iov, out) < 0) + errx(1, "Write to tun failed?"); + add_used_and_trigger(vq, head, 0); } -/* This is where we handle a packet coming in from the tun device to our +/* This is where we handle packets coming in from the tun device to our * Guest. */ -static bool handle_tun_input(struct device *dev) +static void net_input(struct virtqueue *vq) { - unsigned int head, in_num, out_num; int len; - struct iovec iov[dev->vq->vring.num]; - - /* First we need a network buffer from the Guests's recv virtqueue. */ - head = get_vq_desc(dev->vq, iov, &out_num, &in_num); - if (head == dev->vq->vring.num) { - /* Now, it's expected that if we try to send a packet too - * early, the Guest won't be ready yet. Wait until the device - * status says it's ready. */ - /* FIXME: Actually want DRIVER_ACTIVE here. */ - - /* Now tell it we want to know if new things appear. */ - dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - wmb(); - - /* We'll turn this back on if input buffers are registered. */ - return false; - } else if (out_num) - errx(1, "Output buffers in network recv queue?"); - - /* Read the packet from the device directly into the Guest's buffer. */ - len = readv(dev->fd, iov, in_num); + unsigned int head, out, in; + struct iovec iov[vq->vring.num]; + struct net_info *net_info = vq->dev->priv; + + head = wait_for_vq_desc(vq, iov, &out, &in); + if (out) + errx(1, "Output buffers in net input queue?"); + len = readv(net_info->tunfd, iov, in); if (len <= 0) - err(1, "reading network"); + err(1, "Failed to read from tun."); + add_used_and_trigger(vq, head, len); +} - /* Tell the Guest about the new packet. */ - add_used_and_trigger(dev->vq, head, len); +/* This is the helper to create threads. */ +static int do_thread(void *_vq) +{ + struct virtqueue *vq = _vq; - verbose("tun input packet len %i [%02x %02x] (%s)\n", len, - ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], - head != dev->vq->vring.num ? "sent" : "discarded"); + for (;;) + vq->service(vq); + return 0; +} - /* All good. */ - return true; +/* When a child dies, we kill our entire process group with SIGTERM. This + * also has the side effect that the shell restores the console for us! */ +static void kill_launcher(int signal) +{ + kill(0, SIGTERM); } -/*L:215 This is the callback attached to the network and console input - * virtqueues: it ensures we try again, in case we stopped console or net - * delivery because Guest didn't have any buffers. */ -static void enable_fd(struct virtqueue *vq, bool timeout) +static void reset_device(struct device *dev) { - add_device_fd(vq->dev->fd); - /* Snap the Waker out of its select loop. */ - write(waker_fds.pipe[1], "", 1); + struct virtqueue *vq; + + verbose("Resetting device %s\n", dev->name); + + /* Clear any features they've acked. */ + memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); + + /* We're going to be explicitly killing threads, so ignore them. */ + signal(SIGCHLD, SIG_IGN); + + /* Zero out the virtqueues, get rid of their threads */ + for (vq = dev->vq; vq; vq = vq->next) { + if (vq->thread != (pid_t)-1) { + kill(vq->thread, SIGTERM); + waitpid(vq->thread, NULL, 0); + vq->thread = (pid_t)-1; + } + memset(vq->vring.desc, 0, + vring_size(vq->config.num, LGUEST_VRING_ALIGN)); + lg_last_avail(vq) = 0; + } + dev->running = false; + + /* Now we care if threads die. */ + signal(SIGCHLD, (void *)kill_launcher); } -static void net_enable_fd(struct virtqueue *vq, bool timeout) +static void create_thread(struct virtqueue *vq) { - /* We don't need to know again when Guest refills receive buffer. */ - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - enable_fd(vq, timeout); + /* Create stack for thread and run it. Since stack grows + * upwards, we point the stack pointer to the end of this + * region. */ + char *stack = malloc(32768); + unsigned long args[] = { LHREQ_EVENTFD, + vq->config.pfn*getpagesize(), 0 }; + + /* Create a zero-initialized eventfd. */ + vq->eventfd = eventfd(0, 0); + if (vq->eventfd < 0) + err(1, "Creating eventfd"); + args[2] = vq->eventfd; + + /* Attach an eventfd to this virtqueue: it will go off + * when the Guest does an LHCALL_NOTIFY for this vq. */ + if (write(lguest_fd, &args, sizeof(args)) != 0) + err(1, "Attaching eventfd"); + + /* CLONE_VM: because it has to access the Guest memory, and + * SIGCHLD so we get a signal if it dies. */ + vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); + if (vq->thread == (pid_t)-1) + err(1, "Creating clone"); + /* We close our local copy, now the child has it. */ + close(vq->eventfd); } -/* When the Guest tells us they updated the status field, we handle it. */ -static void update_device_status(struct device *dev) +static void start_device(struct device *dev) { + unsigned int i; struct virtqueue *vq; - /* This is a reset. */ - if (dev->desc->status == 0) { - verbose("Resetting device %s\n", dev->name); + verbose("Device %s OK: offered", dev->name); + for (i = 0; i < dev->feature_len; i++) + verbose(" %02x", get_feature_bits(dev)[i]); + verbose(", accepted"); + for (i = 0; i < dev->feature_len; i++) + verbose(" %02x", get_feature_bits(dev) + [dev->feature_len+i]); + + for (vq = dev->vq; vq; vq = vq->next) { + if (vq->service) + create_thread(vq); + } + dev->running = true; +} + +static void cleanup_devices(void) +{ + struct device *dev; + + for (dev = devices.dev; dev; dev = dev->next) + reset_device(dev); - /* Clear any features they've acked. */ - memset(get_feature_bits(dev) + dev->feature_len, 0, - dev->feature_len); + /* If we saved off the original terminal settings, restore them now. */ + if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) + tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); +} - /* Zero out the virtqueues. */ - for (vq = dev->vq; vq; vq = vq->next) { - memset(vq->vring.desc, 0, - vring_size(vq->config.num, LGUEST_VRING_ALIGN)); - lg_last_avail(vq) = 0; - } - } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { +/* When the Guest tells us they updated the status field, we handle it. */ +static void update_device_status(struct device *dev) +{ + /* A zero status is a reset, otherwise it's a set of flags. */ + if (dev->desc->status == 0) + reset_device(dev); + else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { warnx("Device %s configuration FAILED", dev->name); + if (dev->running) + reset_device(dev); } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - unsigned int i; - - verbose("Device %s OK: offered", dev->name); - for (i = 0; i < dev->feature_len; i++) - verbose(" %02x", get_feature_bits(dev)[i]); - verbose(", accepted"); - for (i = 0; i < dev->feature_len; i++) - verbose(" %02x", get_feature_bits(dev) - [dev->feature_len+i]); - - if (dev->ready) - dev->ready(dev); + if (!dev->running) + start_device(dev); } } @@ -1004,32 +902,24 @@ static void update_device_status(struct device *dev) static void handle_output(unsigned long addr) { struct device *i; - struct virtqueue *vq; - /* Check each device and virtqueue. */ + /* Check each device. */ for (i = devices.dev; i; i = i->next) { + struct virtqueue *vq; + /* Notifications to device descriptors update device status. */ if (from_guest_phys(addr) == i->desc) { update_device_status(i); return; } - /* Notifications to virtqueues mean output has occurred. */ + /* Devices *can* be used before status is set to DRIVER_OK. */ for (vq = i->vq; vq; vq = vq->next) { - if (vq->config.pfn != addr/getpagesize()) + if (addr != vq->config.pfn*getpagesize()) continue; - - /* Guest should acknowledge (and set features!) before - * using the device. */ - if (i->desc->status == 0) { - warnx("%s gave early output", i->name); - return; - } - - if (strcmp(vq->dev->name, "console") != 0) - verbose("Output to %s\n", vq->dev->name); - if (vq->handle_output) - vq->handle_output(vq, false); + if (i->running) + errx(1, "Notification on running %s", i->name); + start_device(i); return; } } @@ -1043,71 +933,6 @@ static void handle_output(unsigned long addr) strnlen(from_guest_phys(addr), guest_limit - addr)); } -static void handle_timeout(void) -{ - char buf[32]; - struct device *i; - struct virtqueue *vq; - - /* Clear the pipe */ - read(timeoutpipe[0], buf, sizeof(buf)); - - /* Check each device and virtqueue: flush blocked ones. */ - for (i = devices.dev; i; i = i->next) { - for (vq = i->vq; vq; vq = vq->next) { - if (!vq->blocked) - continue; - - vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - vq->blocked = false; - if (vq->handle_output) - vq->handle_output(vq, true); - } - } -} - -/* This is called when the Waker wakes us up: check for incoming file - * descriptors. */ -static void handle_input(void) -{ - /* select() wants a zeroed timeval to mean "don't wait". */ - struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; - - for (;;) { - struct device *i; - fd_set fds = devices.infds; - int num; - - num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); - /* Could get interrupted */ - if (num < 0) - continue; - /* If nothing is ready, we're done. */ - if (num == 0) - break; - - /* Otherwise, call the device(s) which have readable file - * descriptors and a method of handling them. */ - for (i = devices.dev; i; i = i->next) { - if (i->handle_input && FD_ISSET(i->fd, &fds)) { - if (i->handle_input(i)) - continue; - - /* If handle_input() returns false, it means we - * should no longer service it. Networking and - * console do this when there's no input - * buffers to deliver into. Console also uses - * it when it discovers that stdin is closed. */ - FD_CLR(i->fd, &devices.infds); - } - } - - /* Is this the timeout fd? */ - if (FD_ISSET(timeoutpipe[0], &fds)) - handle_timeout(); - } -} - /*L:190 * Device Setup * @@ -1153,7 +978,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) /* Each device descriptor is followed by the description of its virtqueues. We * specify how many descriptors the virtqueue is to have. */ static void add_virtqueue(struct device *dev, unsigned int num_descs, - void (*handle_output)(struct virtqueue *, bool)) + void (*service)(struct virtqueue *)) { unsigned int pages; struct virtqueue **i, *vq = malloc(sizeof(*vq)); @@ -1168,7 +993,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->next = NULL; vq->last_avail_idx = 0; vq->dev = dev; - vq->blocked = false; + vq->service = service; + vq->thread = (pid_t)-1; /* Initialize the configuration. */ vq->config.num = num_descs; @@ -1193,15 +1019,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, * second. */ for (i = &dev->vq; *i; i = &(*i)->next); *i = vq; - - /* Set the routine to call when the Guest does something to this - * virtqueue. */ - vq->handle_output = handle_output; - - /* As an optimization, set the advisory "Don't Notify Me" flag if we - * don't have a handler */ - if (!handle_output) - vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; } /* The first half of the feature bitmask is for us to advertise features. The @@ -1237,24 +1054,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf) * calling new_dev_desc() to allocate the descriptor and device memory. * * See what I mean about userspace being boring? */ -static struct device *new_device(const char *name, u16 type, int fd, - bool (*handle_input)(struct device *)) +static struct device *new_device(const char *name, u16 type) { struct device *dev = malloc(sizeof(*dev)); /* Now we populate the fields one at a time. */ - dev->fd = fd; - /* If we have an input handler for this file descriptor, then we add it - * to the device_list's fdset and maxfd. */ - if (handle_input) - add_device_fd(dev->fd); dev->desc = new_dev_desc(type); - dev->handle_input = handle_input; dev->name = name; dev->vq = NULL; - dev->ready = NULL; dev->feature_len = 0; dev->num_vq = 0; + dev->running = false; /* Append to device list. Prepending to a single-linked list is * easier, but the user expects the devices to be arranged on the bus @@ -1282,13 +1092,10 @@ static void setup_console(void) * raw input stream to the Guest. */ term.c_lflag &= ~(ISIG|ICANON|ECHO); tcsetattr(STDIN_FILENO, TCSANOW, &term); - /* If we exit gracefully, the original settings will be - * restored so the user can see what they're typing. */ - atexit(restore_term); } - dev = new_device("console", VIRTIO_ID_CONSOLE, - STDIN_FILENO, handle_console_input); + dev = new_device("console", VIRTIO_ID_CONSOLE); + /* We store the console state in dev->priv, and initialize it. */ dev->priv = malloc(sizeof(struct console_abort)); ((struct console_abort *)dev->priv)->count = 0; @@ -1297,31 +1104,13 @@ static void setup_console(void) * they put something the input queue, we make sure we're listening to * stdin. When they put something in the output queue, we write it to * stdout. */ - add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); - add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); + add_virtqueue(dev, VIRTQUEUE_NUM, console_input); + add_virtqueue(dev, VIRTQUEUE_NUM, console_output); - verbose("device %u: console\n", devices.device_num++); + verbose("device %u: console\n", ++devices.device_num); } /*:*/ -static void timeout_alarm(int sig) -{ - write(timeoutpipe[1], "", 1); -} - -static void setup_timeout(void) -{ - if (pipe(timeoutpipe) != 0) - err(1, "Creating timeout pipe"); - - if (fcntl(timeoutpipe[1], F_SETFL, - fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) - err(1, "Making timeout pipe nonblocking"); - - add_device_fd(timeoutpipe[0]); - signal(SIGALRM, timeout_alarm); -} - /*M:010 Inter-guest networking is an interesting area. Simplest is to have a * --sharenet= option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. @@ -1443,21 +1232,23 @@ static int get_tun_device(char tapif[IFNAMSIZ]) static void setup_tun_net(char *arg) { struct device *dev; - int netfd, ipfd; + struct net_info *net_info = malloc(sizeof(*net_info)); + int ipfd; u32 ip = INADDR_ANY; bool bridging = false; char tapif[IFNAMSIZ], *p; struct virtio_net_config conf; - netfd = get_tun_device(tapif); + net_info->tunfd = get_tun_device(tapif); /* First we create a new network device. */ - dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); + dev = new_device("net", VIRTIO_ID_NET); + dev->priv = net_info; /* Network devices need a receive and a send queue, just like * console. */ - add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); - add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); + add_virtqueue(dev, VIRTQUEUE_NUM, net_input); + add_virtqueue(dev, VIRTQUEUE_NUM, net_output); /* We need a socket to perform the magic network ioctls to bring up the * tap interface, connect to the bridge etc. Any socket will do! */ @@ -1546,20 +1337,18 @@ struct vblk_info * Remember that the block device is handled by a separate I/O thread. We head * straight into the core of that thread here: */ -static bool service_io(struct device *dev) +static void blk_request(struct virtqueue *vq) { - struct vblk_info *vblk = dev->priv; + struct vblk_info *vblk = vq->dev->priv; unsigned int head, out_num, in_num, wlen; int ret; u8 *in; struct virtio_blk_outhdr *out; - struct iovec iov[dev->vq->vring.num]; + struct iovec iov[vq->vring.num]; off64_t off; - /* See if there's a request waiting. If not, nothing to do. */ - head = get_vq_desc(dev->vq, iov, &out_num, &in_num); - if (head == dev->vq->vring.num) - return false; + /* Get the next request. */ + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); /* Every block request should contain at least one output buffer * (detailing the location on disk and the type of request) and one @@ -1633,83 +1422,21 @@ static bool service_io(struct device *dev) if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); - /* We can't trigger an IRQ, because we're not the Launcher. It does - * that when we tell it we're done. */ - add_used(dev->vq, head, wlen); - return true; -} - -/* This is the thread which actually services the I/O. */ -static int io_thread(void *_dev) -{ - struct device *dev = _dev; - struct vblk_info *vblk = dev->priv; - char c; - - /* Close other side of workpipe so we get 0 read when main dies. */ - close(vblk->workpipe[1]); - /* Close the other side of the done_fd pipe. */ - close(dev->fd); - - /* When this read fails, it means Launcher died, so we follow. */ - while (read(vblk->workpipe[0], &c, 1) == 1) { - /* We acknowledge each request immediately to reduce latency, - * rather than waiting until we've done them all. I haven't - * measured to see if it makes any difference. - * - * That would be an interesting test, wouldn't it? You could - * also try having more than one I/O thread. */ - while (service_io(dev)) - write(vblk->done_fd, &c, 1); - } - return 0; -} - -/* Now we've seen the I/O thread, we return to the Launcher to see what happens - * when that thread tells us it's completed some I/O. */ -static bool handle_io_finish(struct device *dev) -{ - char c; - - /* If the I/O thread died, presumably it printed the error, so we - * simply exit. */ - if (read(dev->fd, &c, 1) != 1) - exit(1); - - /* It did some work, so trigger the irq. */ - trigger_irq(dev->vq); - return true; -} - -/* When the Guest submits some I/O, we just need to wake the I/O thread. */ -static void handle_virtblk_output(struct virtqueue *vq, bool timeout) -{ - struct vblk_info *vblk = vq->dev->priv; - char c = 0; - - /* Wake up I/O thread and tell it to go to work! */ - if (write(vblk->workpipe[1], &c, 1) != 1) - /* Presumably it indicated why it died. */ - exit(1); + add_used_and_trigger(vq, head, wlen); } /*L:198 This actually sets up a virtual block device. */ static void setup_block_file(const char *filename) { - int p[2]; struct device *dev; struct vblk_info *vblk; - void *stack; struct virtio_blk_config conf; - /* This is the pipe the I/O thread will use to tell us I/O is done. */ - pipe(p); - /* The device responds to return from I/O thread. */ - dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); + dev = new_device("block", VIRTIO_ID_BLOCK); /* The device has one virtqueue, where the Guest places requests. */ - add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); + add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); /* Allocate the room for our own bookkeeping */ vblk = dev->priv = malloc(sizeof(*vblk)); @@ -1731,49 +1458,29 @@ static void setup_block_file(const char *filename) set_config(dev, sizeof(conf), &conf); - /* The I/O thread writes to this end of the pipe when done. */ - vblk->done_fd = p[1]; - - /* This is the second pipe, which is how we tell the I/O thread about - * more work. */ - pipe(vblk->workpipe); - - /* Create stack for thread and run it. Since stack grows upwards, we - * point the stack pointer to the end of this region. */ - stack = malloc(32768); - /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from - * becoming a zombie. */ - if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) - err(1, "Creating clone"); - - /* We don't need to keep the I/O thread's end of the pipes open. */ - close(vblk->done_fd); - close(vblk->workpipe[0]); - verbose("device %u: virtblock %llu sectors\n", - devices.device_num, le64_to_cpu(conf.capacity)); + ++devices.device_num, le64_to_cpu(conf.capacity)); } +struct rng_info { + int rfd; +}; + /* Our random number generator device reads from /dev/random into the Guest's * input buffers. The usual case is that the Guest doesn't want random numbers * and so has no buffers although /dev/random is still readable, whereas * console is the reverse. * * The same logic applies, however. */ -static bool handle_rng_input(struct device *dev) +static void rng_input(struct virtqueue *vq) { int len; unsigned int head, in_num, out_num, totlen = 0; - struct iovec iov[dev->vq->vring.num]; + struct rng_info *rng_info = vq->dev->priv; + struct iovec iov[vq->vring.num]; /* First we need a buffer from the Guests's virtqueue. */ - head = get_vq_desc(dev->vq, iov, &out_num, &in_num); - - /* If they're not ready for input, stop listening to this file - * descriptor. We'll start again once they add an input buffer. */ - if (head == dev->vq->vring.num) - return false; - + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); if (out_num) errx(1, "Output buffers in rng?"); @@ -1781,7 +1488,7 @@ static bool handle_rng_input(struct device *dev) * it reads straight into the Guest's buffer. We loop to make sure we * fill it. */ while (!iov_empty(iov, in_num)) { - len = readv(dev->fd, iov, in_num); + len = readv(rng_info->rfd, iov, in_num); if (len <= 0) err(1, "Read from /dev/random gave %i", len); iov_consume(iov, in_num, len); @@ -1789,25 +1496,23 @@ static bool handle_rng_input(struct device *dev) } /* Tell the Guest about the new input. */ - add_used_and_trigger(dev->vq, head, totlen); - - /* Everything went OK! */ - return true; + add_used_and_trigger(vq, head, totlen); } /* And this creates a "hardware" random number device for the Guest. */ static void setup_rng(void) { struct device *dev; - int fd; + struct rng_info *rng_info = malloc(sizeof(*rng_info)); - fd = open_or_die("/dev/random", O_RDONLY); + rng_info->rfd = open_or_die("/dev/random", O_RDONLY); /* The device responds to return from I/O thread. */ - dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); + dev = new_device("rng", VIRTIO_ID_RNG); + dev->priv = rng_info; /* The device has one virtqueue, where the Guest places inbufs. */ - add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); + add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); verbose("device %u: rng\n", devices.device_num++); } @@ -1823,7 +1528,9 @@ static void __attribute__((noreturn)) restart_guest(void) for (i = 3; i < FD_SETSIZE; i++) close(i); - /* The exec automatically gets rid of the I/O and Waker threads. */ + /* Reset all the devices (kills all threads). */ + cleanup_devices(); + execv(main_args[0], main_args); err(1, "Could not exec %s", main_args[0]); } @@ -1833,7 +1540,6 @@ static void __attribute__((noreturn)) restart_guest(void) static void __attribute__((noreturn)) run_guest(void) { for (;;) { - unsigned long args[] = { LHREQ_BREAK, 0 }; unsigned long notify_addr; int readval; @@ -1845,7 +1551,6 @@ static void __attribute__((noreturn)) run_guest(void) if (readval == sizeof(notify_addr)) { verbose("Notify on address %#lx\n", notify_addr); handle_output(notify_addr); - continue; /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { char reason[1024] = { 0 }; @@ -1854,19 +1559,9 @@ static void __attribute__((noreturn)) run_guest(void) /* ERESTART means that we need to reboot the guest */ } else if (errno == ERESTART) { restart_guest(); - /* EAGAIN means a signal (timeout). - * Anything else means a bug or incompatible change. */ - } else if (errno != EAGAIN) + /* Anything else means a bug or incompatible change. */ + } else err(1, "Running guest failed"); - - /* Only service input on thread for CPU 0. */ - if (cpu_id != 0) - continue; - - /* Service input, then unset the BREAK to release the Waker. */ - handle_input(); - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) - err(1, "Resetting break"); } } /*L:240 @@ -1909,18 +1604,10 @@ int main(int argc, char *argv[]) /* Save the args: we "reboot" by execing ourselves again. */ main_args = argv; - /* We don't "wait" for the children, so prevent them from becoming - * zombies. */ - signal(SIGCHLD, SIG_IGN); - /* First we initialize the device list. Since console and network - * device receive input from a file descriptor, we keep an fdset - * (infds) and the maximum fd number (max_infd) with the head of the - * list. We also keep a pointer to the last device. Finally, we keep - * the next interrupt number to use for devices (1: remember that 0 is - * used by the timer). */ - FD_ZERO(&devices.infds); - devices.max_infd = -1; + /* First we initialize the device list. We keep a pointer to the last + * device, and the next interrupt number to use for devices (1: + * remember that 0 is used by the timer). */ devices.lastdev = NULL; devices.next_irq = 1; @@ -1978,9 +1665,6 @@ int main(int argc, char *argv[]) /* We always have a console device */ setup_console(); - /* We can timeout waiting for Guest network transmit. */ - setup_timeout(); - /* Now we load the kernel */ start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); @@ -2021,10 +1705,11 @@ int main(int argc, char *argv[]) * /dev/lguest file descriptor. */ tell_kernel(start); - /* We clone off a thread, which wakes the Launcher whenever one of the - * input file descriptors needs attention. We call this the Waker, and - * we'll cover it in a moment. */ - setup_waker(); + /* Ensure that we terminate if a child dies. */ + signal(SIGCHLD, kill_launcher); + + /* If we exit via err(), this kills all the threads, restores tty. */ + atexit(cleanup_devices); /* Finally, run the Guest. This doesn't return. */ run_guest(); -- cgit v1.2.3 From 38bc2b8c56a2e212bbd19de7cf9976dcc7bf9953 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:11 -0600 Subject: lguest: implement deferred interrupts in example Launcher Rather than sending an interrupt on every buffer, we only send an interrupt when we're about to wait for the Guest to send us a new one. The console input and network input still send interrupts manually, but the block device, network and console output queues can simply rely on this logic to send interrupts to the Guest at the right time. The patch is cluttered by moving trigger_irq() higher in the code. In practice, two factors make this optimization less interesting: (1) we often only get one input at a time, even for networking, (2) triggering an interrupt rapidly tends to get coalesced anyway. Before: Secs RxIRQS TxIRQs 1G TCP Guest->Host: 3.72 32784 32771 1M normal pings: 99 1000004 995541 100,000 1k pings (-l 120): 5 49510 49058 After: 1G TCP Guest->Host: 3.69 32809 32769 1M normal pings: 99 1000004 996196 100,000 1k pings (-l 120): 5 52435 52361 (Note the interrupt count on 100k pings goes *up*: see next patch). Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 5470b8ed214..84c471b07c2 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -551,6 +551,21 @@ static unsigned next_desc(struct virtqueue *vq, unsigned int i) return next; } +/* This actually sends the interrupt for this virtqueue */ +static void trigger_irq(struct virtqueue *vq) +{ + unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; + + /* If they don't want an interrupt, don't send one, unless empty. */ + if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && lg_last_avail(vq) != vq->vring.avail->idx) + return; + + /* Send the Guest an interrupt tell them we used something up. */ + if (write(lguest_fd, buf, sizeof(buf)) != 0) + err(1, "Triggering irq %i", vq->config.irq); +} + /* This looks in the virtqueue and for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two @@ -567,6 +582,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, while (last_avail == vq->vring.avail->idx) { u64 event; + /* OK, tell Guest about progress up to now. */ + trigger_irq(vq); + /* Nothing new? Wait for eventfd to tell us they refilled. */ if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) errx(1, "Event read failed?"); @@ -631,21 +649,6 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len) vq->vring.used->idx++; } -/* This actually sends the interrupt for this virtqueue */ -static void trigger_irq(struct virtqueue *vq) -{ - unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; - - /* If they don't want an interrupt, don't send one, unless empty. */ - if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && lg_last_avail(vq) != vq->vring.avail->idx) - return; - - /* Send the Guest an interrupt tell them we used something up. */ - if (write(lguest_fd, buf, sizeof(buf)) != 0) - err(1, "Triggering irq %i", vq->config.irq); -} - /* And here's the combo meal deal. Supersize me! */ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) { @@ -730,7 +733,7 @@ static void console_output(struct virtqueue *vq) err(1, "Write to stdout gave %i", len); iov_consume(iov, out, len); } - add_used_and_trigger(vq, head, 0); + add_used(vq, head, 0); } /* @@ -754,7 +757,7 @@ static void net_output(struct virtqueue *vq) errx(1, "Input buffers in net output queue?"); if (writev(net_info->tunfd, iov, out) < 0) errx(1, "Write to tun failed?"); - add_used_and_trigger(vq, head, 0); + add_used(vq, head, 0); } /* This is where we handle packets coming in from the tun device to our @@ -1422,7 +1425,7 @@ static void blk_request(struct virtqueue *vq) if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); - add_used_and_trigger(vq, head, wlen); + add_used(vq, head, wlen); } /*L:198 This actually sets up a virtual block device. */ @@ -1496,7 +1499,7 @@ static void rng_input(struct virtqueue *vq) } /* Tell the Guest about the new input. */ - add_used_and_trigger(vq, head, totlen); + add_used(vq, head, totlen); } /* And this creates a "hardware" random number device for the Guest. */ -- cgit v1.2.3 From 95c517c09bad31a03e22f2fdb5f0aa26a490a92d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:11 -0600 Subject: lguest: avoid sending interrupts to Guest when no activity occurs. If we track how many buffers we've used, we can tell whether we really need to interrupt the Guest. This happens as a side effect of spurious notifications. Spurious notifications happen because it can take a while before the Host thread wakes up and sets the VRING_USED_F_NO_NOTIFY flag, and meanwhile the Guest can more notifications. A real fix would be to use wake counts, rather than a suppression flag, but the practical difference is generally in the noise: the interrupt is usually coalesced into a pending one anyway so we just save a system call which isn't clearly measurable. Secs Spurious IRQS 1G TCP Guest->Host: 3.93 58 1M normal pings: 100 72 1M 1k pings (-l 120): 57 492904 Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 84c471b07c2..20f8253881b 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -151,6 +151,9 @@ struct virtqueue /* Last available index we saw. */ u16 last_avail_idx; + /* How many are used since we sent last irq? */ + unsigned int pending_used; + /* Eventfd where Guest notifications arrive. */ int eventfd; @@ -556,6 +559,11 @@ static void trigger_irq(struct virtqueue *vq) { unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; + /* Don't inform them if nothing used. */ + if (!vq->pending_used) + return; + vq->pending_used = 0; + /* If they don't want an interrupt, don't send one, unless empty. */ if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) && lg_last_avail(vq) != vq->vring.avail->idx) @@ -647,6 +655,7 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len) /* Make sure buffer is written before we update index. */ wmb(); vq->vring.used->idx++; + vq->pending_used++; } /* And here's the combo meal deal. Supersize me! */ -- cgit v1.2.3 From 4a8962e21bc505c714fc2508494d4c7dd3fe2d29 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:12 -0600 Subject: lguest: try to batch interrupts on network receive Rather than triggering an interrupt every time, we only trigger an interrupt when there are no more incoming packets (or the recv queue is full). However, the overhead of doing the select to figure this out is measurable: 1M pings goes from 98 to 104 seconds, and 1G Guest->Host TCP goes from 3.69 to 3.94 seconds. It's close to the noise though. I tested various timeouts, including reducing it as the number of pending packets increased, timing a 1 gigabyte TCP send from Guest -> Host and Host -> Guest (GSO disabled, to increase packet rate). // time tcpblast -o -s 65536 -c 16k 192.168.2.1:9999 > /dev/null Timeout Guest->Host Pkts/irq Host->Guest Pkts/irq Before 11.3s 1.0 6.3s 1.0 0 11.7s 1.0 6.6s 23.5 1 17.1s 8.8 8.6s 26.0 1/pending 13.4s 1.9 6.6s 23.8 2/pending 13.6s 2.8 6.6s 24.1 5/pending 14.1s 5.0 6.6s 24.4 Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 20f8253881b..64110797044 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -769,6 +769,16 @@ static void net_output(struct virtqueue *vq) add_used(vq, head, 0); } +/* Will reading from this file descriptor block? */ +static bool will_block(int fd) +{ + fd_set fdset; + struct timeval zero = { 0, 0 }; + FD_ZERO(&fdset); + FD_SET(fd, &fdset); + return select(fd+1, &fdset, NULL, NULL, &zero) != 1; +} + /* This is where we handle packets coming in from the tun device to our * Guest. */ static void net_input(struct virtqueue *vq) @@ -781,10 +791,15 @@ static void net_input(struct virtqueue *vq) head = wait_for_vq_desc(vq, iov, &out, &in); if (out) errx(1, "Output buffers in net input queue?"); + + /* Deliver interrupt now, since we're about to sleep. */ + if (vq->pending_used && will_block(net_info->tunfd)) + trigger_irq(vq); + len = readv(net_info->tunfd, iov, in); if (len <= 0) err(1, "Failed to read from tun."); - add_used_and_trigger(vq, head, len); + add_used(vq, head, len); } /* This is the helper to create threads. */ -- cgit v1.2.3 From b60da13fc7bbf99d3c68578bd3fbcf66e1cb5f41 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:12 -0600 Subject: lguest: suppress notifications in example Launcher The Guest only really needs to tell us about activity when we're going to listen to the eventfd: normally, we don't want to know. So if there are no available buffers, turn on notifications, re-check, then wait for the Guest to notify us via the eventfd, then turn notifications off again. There's enough else going on that the differences are in the noise. Before: Secs RxKicks TxKicks 1G TCP Guest->Host: 3.94 4686 32815 1M normal pings: 104 142862 1000010 1M 1k pings (-l 120): 57 142026 1000007 After: 1G TCP Guest->Host: 3.76 4691 32811 1M normal pings: 111 142859 997467 1M 1k pings (-l 120): 55 19648 501549 Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 64110797044..bb5e3c28d9d 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -172,6 +172,7 @@ static struct termios orig_term; * threads and so we need to make sure that changes visible to the Guest happen * in precise order. */ #define wmb() __asm__ __volatile__("" : : : "memory") +#define mb() __asm__ __volatile__("" : : : "memory") /* Convert an iovec element to the given type. * @@ -593,9 +594,23 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, /* OK, tell Guest about progress up to now. */ trigger_irq(vq); + /* OK, now we need to know about added descriptors. */ + vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; + + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. */ + mb(); + if (last_avail != vq->vring.avail->idx) { + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + break; + } + /* Nothing new? Wait for eventfd to tell us they refilled. */ if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) errx(1, "Event read failed?"); + + /* We don't need to be notified again. */ + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; } /* Check it isn't doing very strange things with descriptor numbers. */ -- cgit v1.2.3 From d1f0132e76a11b05167313c606a853953f416081 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 11 May 2009 18:11:46 +0100 Subject: lguest: add support for indirect ring entries Support the VIRTIO_RING_F_INDIRECT_DESC feature. This is a simple matter of changing the descriptor walking code to operate on a struct vring_desc* and supplying it with an indirect table if detected. Signed-off-by: Mark McLoughlin Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'Documentation') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index bb5e3c28d9d..9ebcd6ef361 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -536,20 +536,21 @@ static void *_check_pointer(unsigned long addr, unsigned int size, /* Each buffer in the virtqueues is actually a chain of descriptors. This * function returns the next descriptor in the chain, or vq->vring.num if we're * at the end. */ -static unsigned next_desc(struct virtqueue *vq, unsigned int i) +static unsigned next_desc(struct vring_desc *desc, + unsigned int i, unsigned int max) { unsigned int next; /* If this descriptor says it doesn't chain, we're done. */ - if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) - return vq->vring.num; + if (!(desc[i].flags & VRING_DESC_F_NEXT)) + return max; /* Check they're not leading us off end of descriptors. */ - next = vq->vring.desc[i].next; + next = desc[i].next; /* Make sure compiler knows to grab that: we don't want it changing! */ wmb(); - if (next >= vq->vring.num) + if (next >= max) errx(1, "Desc next is %u", next); return next; @@ -585,7 +586,8 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct iovec iov[], unsigned int *out_num, unsigned int *in_num) { - unsigned int i, head; + unsigned int i, head, max; + struct vring_desc *desc; u16 last_avail = lg_last_avail(vq); while (last_avail == vq->vring.avail->idx) { @@ -630,15 +632,28 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; + max = vq->vring.num; + desc = vq->vring.desc; i = head; + + /* If this is an indirect entry, then this buffer contains a descriptor + * table which we handle as if it's any normal descriptor chain. */ + if (desc[i].flags & VRING_DESC_F_INDIRECT) { + if (desc[i].len % sizeof(struct vring_desc)) + errx(1, "Invalid size for indirect buffer table"); + + max = desc[i].len / sizeof(struct vring_desc); + desc = check_pointer(desc[i].addr, desc[i].len); + i = 0; + } + do { /* Grab the first descriptor, and check it's OK. */ - iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; + iov[*out_num + *in_num].iov_len = desc[i].len; iov[*out_num + *in_num].iov_base - = check_pointer(vq->vring.desc[i].addr, - vq->vring.desc[i].len); + = check_pointer(desc[i].addr, desc[i].len); /* If this is an input descriptor, increment that count. */ - if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) + if (desc[i].flags & VRING_DESC_F_WRITE) (*in_num)++; else { /* If it's an output descriptor, they're all supposed @@ -649,9 +664,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, } /* If we've got too many, that implies a descriptor loop. */ - if (*out_num + *in_num > vq->vring.num) + if (*out_num + *in_num > max) errx(1, "Looped descriptor"); - } while ((i = next_desc(vq, i)) != vq->vring.num); + } while ((i = next_desc(desc, i, max)) != max); return head; } @@ -1331,6 +1346,8 @@ static void setup_tun_net(char *arg) add_feature(dev, VIRTIO_NET_F_HOST_TSO4); add_feature(dev, VIRTIO_NET_F_HOST_TSO6); add_feature(dev, VIRTIO_NET_F_HOST_ECN); + /* We handle indirect ring entries */ + add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); set_config(dev, sizeof(conf), &conf); /* We don't need the socket any more; setup is done. */ -- cgit v1.2.3