diff options
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/acpi/toshiba_acpi.c | 9 | ||||
-rw-r--r-- | drivers/ata/Kconfig | 2 | ||||
-rw-r--r-- | drivers/ata/pata_hpt37x.c | 6 | ||||
-rw-r--r-- | drivers/block/pktcdvd.c | 49 | ||||
-rw-r--r-- | drivers/char/ip2/i2ellis.h | 4 | ||||
-rw-r--r-- | drivers/connector/cn_proc.c | 11 | ||||
-rw-r--r-- | drivers/i2c/busses/Kconfig | 9 | ||||
-rw-r--r-- | drivers/i2c/busses/i2c-mv64xxx.c | 4 | ||||
-rw-r--r-- | drivers/i2c/busses/i2c-pnx.c | 7 | ||||
-rw-r--r-- | drivers/i2c/chips/m41t00.c | 1 | ||||
-rw-r--r-- | drivers/i2c/i2c-core.c | 28 | ||||
-rw-r--r-- | drivers/ide/pci/atiixp.c | 18 | ||||
-rw-r--r-- | drivers/ide/pci/via82cxxx.c | 138 | ||||
-rw-r--r-- | drivers/kvm/kvm.h | 106 | ||||
-rw-r--r-- | drivers/kvm/kvm_main.c | 155 | ||||
-rw-r--r-- | drivers/kvm/mmu.c | 1114 | ||||
-rw-r--r-- | drivers/kvm/paging_tmpl.h | 260 | ||||
-rw-r--r-- | drivers/kvm/svm.c | 113 | ||||
-rw-r--r-- | drivers/kvm/vmx.c | 175 | ||||
-rw-r--r-- | drivers/kvm/x86_emulate.c | 2 | ||||
-rw-r--r-- | drivers/leds/leds-s3c24xx.c | 2 | ||||
-rw-r--r-- | drivers/macintosh/via-pmu.c | 1 | ||||
-rw-r--r-- | drivers/pci/search.c | 24 | ||||
-rw-r--r-- | drivers/rtc/rtc-at91rm9200.c | 2 | ||||
-rw-r--r-- | drivers/rtc/rtc-rs5c372.c | 535 |
25 files changed, 2185 insertions, 590 deletions
diff --git a/drivers/acpi/toshiba_acpi.c b/drivers/acpi/toshiba_acpi.c index 88aeccbafaa..d9b651ffcdc 100644 --- a/drivers/acpi/toshiba_acpi.c +++ b/drivers/acpi/toshiba_acpi.c @@ -321,13 +321,16 @@ static int set_lcd_status(struct backlight_device *bd) static unsigned long write_lcd(const char *buffer, unsigned long count) { int value; - int ret = count; + int ret; if (sscanf(buffer, " brightness : %i", &value) == 1 && - value >= 0 && value < HCI_LCD_BRIGHTNESS_LEVELS) + value >= 0 && value < HCI_LCD_BRIGHTNESS_LEVELS) { ret = set_lcd(value); - else + if (ret == 0) + ret = count; + } else { ret = -EINVAL; + } return ret; } diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index b34e0a958d0..da21552d2b1 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -381,7 +381,7 @@ config PATA_OPTI If unsure, say N. config PATA_OPTIDMA - tristate "OPTI FireStar PATA support (Veyr Experimental)" + tristate "OPTI FireStar PATA support (Very Experimental)" depends on PCI && EXPERIMENTAL help This option enables DMA/PIO support for the later OPTi diff --git a/drivers/ata/pata_hpt37x.c b/drivers/ata/pata_hpt37x.c index 47082df7199..dfb306057cf 100644 --- a/drivers/ata/pata_hpt37x.c +++ b/drivers/ata/pata_hpt37x.c @@ -25,7 +25,7 @@ #include <linux/libata.h> #define DRV_NAME "pata_hpt37x" -#define DRV_VERSION "0.5.1" +#define DRV_VERSION "0.5.2" struct hpt_clock { u8 xfer_speed; @@ -416,7 +416,7 @@ static const char *bad_ata100_5[] = { static unsigned long hpt370_filter(const struct ata_port *ap, struct ata_device *adev, unsigned long mask) { - if (adev->class != ATA_DEV_ATA) { + if (adev->class == ATA_DEV_ATA) { if (hpt_dma_blacklisted(adev, "UDMA", bad_ata33)) mask &= ~ATA_MASK_UDMA; if (hpt_dma_blacklisted(adev, "UDMA100", bad_ata100_5)) @@ -749,7 +749,7 @@ static void hpt37x_bmdma_stop(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; struct pci_dev *pdev = to_pci_dev(ap->host->dev); - int mscreg = 0x50 + 2 * ap->port_no; + int mscreg = 0x50 + 4 * ap->port_no; u8 bwsr_stat, msc_stat; pci_read_config_byte(pdev, 0x6A, &bwsr_stat); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 7c95c762950..62462190e07 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -765,47 +765,34 @@ static inline struct bio *pkt_get_list_first(struct bio **list_head, struct bio */ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) { - char sense[SCSI_SENSE_BUFFERSIZE]; - request_queue_t *q; + request_queue_t *q = bdev_get_queue(pd->bdev); struct request *rq; - DECLARE_COMPLETION_ONSTACK(wait); - int err = 0; + int ret = 0; - q = bdev_get_queue(pd->bdev); + rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? + WRITE : READ, __GFP_WAIT); + + if (cgc->buflen) { + if (blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, __GFP_WAIT)) + goto out; + } + + rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); + memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE); + if (sizeof(rq->cmd) > CDROM_PACKET_SIZE) + memset(rq->cmd + CDROM_PACKET_SIZE, 0, sizeof(rq->cmd) - CDROM_PACKET_SIZE); - rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? WRITE : READ, - __GFP_WAIT); - rq->errors = 0; - rq->rq_disk = pd->bdev->bd_disk; - rq->bio = NULL; - rq->buffer = NULL; rq->timeout = 60*HZ; - rq->data = cgc->buffer; - rq->data_len = cgc->buflen; - rq->sense = sense; - memset(sense, 0, sizeof(sense)); - rq->sense_len = 0; rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->cmd_flags |= REQ_HARDBARRIER; if (cgc->quiet) rq->cmd_flags |= REQ_QUIET; - memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE); - if (sizeof(rq->cmd) > CDROM_PACKET_SIZE) - memset(rq->cmd + CDROM_PACKET_SIZE, 0, sizeof(rq->cmd) - CDROM_PACKET_SIZE); - rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); - - rq->ref_count++; - rq->end_io_data = &wait; - rq->end_io = blk_end_sync_rq; - elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); - generic_unplug_device(q); - wait_for_completion(&wait); - - if (rq->errors) - err = -EIO; + blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0); + ret = rq->errors; +out: blk_put_request(rq); - return err; + return ret; } /* diff --git a/drivers/char/ip2/i2ellis.h b/drivers/char/ip2/i2ellis.h index 5eabe47b0bc..433305062fb 100644 --- a/drivers/char/ip2/i2ellis.h +++ b/drivers/char/ip2/i2ellis.h @@ -606,9 +606,9 @@ static int iiDownloadAll(i2eBordStrPtr, loadHdrStrPtr, int, int); // code and returning. // #define COMPLETE(pB,code) \ - if(1){ \ + do { \ pB->i2eError = code; \ return (code == I2EE_GOOD);\ - } + } while (0) #endif // I2ELLIS_H diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 3ece6923134..5c9f67f98d1 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -28,6 +28,7 @@ #include <linux/init.h> #include <linux/connector.h> #include <asm/atomic.h> +#include <asm/unaligned.h> #include <linux/cn_proc.h> @@ -60,7 +61,7 @@ void proc_fork_connector(struct task_struct *task) ev = (struct proc_event*)msg->data; get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ - ev->timestamp_ns = timespec_to_ns(&ts); + put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); ev->what = PROC_EVENT_FORK; ev->event_data.fork.parent_pid = task->real_parent->pid; ev->event_data.fork.parent_tgid = task->real_parent->tgid; @@ -88,7 +89,7 @@ void proc_exec_connector(struct task_struct *task) ev = (struct proc_event*)msg->data; get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ - ev->timestamp_ns = timespec_to_ns(&ts); + put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); ev->what = PROC_EVENT_EXEC; ev->event_data.exec.process_pid = task->pid; ev->event_data.exec.process_tgid = task->tgid; @@ -124,7 +125,7 @@ void proc_id_connector(struct task_struct *task, int which_id) return; get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ - ev->timestamp_ns = timespec_to_ns(&ts); + put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ @@ -146,7 +147,7 @@ void proc_exit_connector(struct task_struct *task) ev = (struct proc_event*)msg->data; get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ - ev->timestamp_ns = timespec_to_ns(&ts); + put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); ev->what = PROC_EVENT_EXIT; ev->event_data.exit.process_pid = task->pid; ev->event_data.exit.process_tgid = task->tgid; @@ -181,7 +182,7 @@ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack) ev = (struct proc_event*)msg->data; msg->seq = rcvd_seq; ktime_get_ts(&ts); /* get high res monotonic timestamp */ - ev->timestamp_ns = timespec_to_ns(&ts); + put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); ev->cpu = -1; ev->what = PROC_EVENT_NONE; ev->event_data.ack.err = err; diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index e1989f3a268..9367c4cfe93 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -564,13 +564,4 @@ config I2C_PNX This driver can also be built as a module. If so, the module will be called i2c-pnx. -config I2C_PNX_EARLY - bool "Early initialization for I2C on PNXxxxx" - depends on I2C_PNX=y - help - Under certain circumstances one may need to make sure I2C on PNXxxxx - is initialized earlier than some other driver that depends on it - (for instance, that might be USB in case of PNX4008). With this - option turned on you can guarantee that. - endmenu diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c index bbc8e3a7ff5..490173611d6 100644 --- a/drivers/i2c/busses/i2c-mv64xxx.c +++ b/drivers/i2c/busses/i2c-mv64xxx.c @@ -529,6 +529,8 @@ mv64xxx_i2c_probe(struct platform_device *pd) platform_set_drvdata(pd, drv_data); i2c_set_adapdata(&drv_data->adapter, drv_data); + mv64xxx_i2c_hw_init(drv_data); + if (request_irq(drv_data->irq, mv64xxx_i2c_intr, 0, MV64XXX_I2C_CTLR_NAME, drv_data)) { dev_err(&drv_data->adapter.dev, @@ -542,8 +544,6 @@ mv64xxx_i2c_probe(struct platform_device *pd) goto exit_free_irq; } - mv64xxx_i2c_hw_init(drv_data); - return 0; exit_free_irq: diff --git a/drivers/i2c/busses/i2c-pnx.c b/drivers/i2c/busses/i2c-pnx.c index de0bca77e92..17376feb1ac 100644 --- a/drivers/i2c/busses/i2c-pnx.c +++ b/drivers/i2c/busses/i2c-pnx.c @@ -305,8 +305,7 @@ static int i2c_pnx_master_rcv(struct i2c_adapter *adap) return 0; } -static irqreturn_t -i2c_pnx_interrupt(int irq, void *dev_id, struct pt_regs *regs) +static irqreturn_t i2c_pnx_interrupt(int irq, void *dev_id) { u32 stat, ctl; struct i2c_adapter *adap = dev_id; @@ -699,10 +698,6 @@ MODULE_AUTHOR("Vitaly Wool, Dennis Kovalev <source@mvista.com>"); MODULE_DESCRIPTION("I2C driver for Philips IP3204-based I2C busses"); MODULE_LICENSE("GPL"); -#ifdef CONFIG_I2C_PNX_EARLY /* We need to make sure I2C is initialized before USB */ subsys_initcall(i2c_adap_pnx_init); -#else -mudule_init(i2c_adap_pnx_init); -#endif module_exit(i2c_adap_pnx_exit); diff --git a/drivers/i2c/chips/m41t00.c b/drivers/i2c/chips/m41t00.c index 420377c8642..3fcb646e207 100644 --- a/drivers/i2c/chips/m41t00.c +++ b/drivers/i2c/chips/m41t00.c @@ -209,6 +209,7 @@ m41t00_set(void *arg) buf[m41t00_chip->hour] = (buf[m41t00_chip->hour] & ~0x3f) | (hour& 0x3f); buf[m41t00_chip->day] = (buf[m41t00_chip->day] & ~0x3f) | (day & 0x3f); buf[m41t00_chip->mon] = (buf[m41t00_chip->mon] & ~0x1f) | (mon & 0x1f); + buf[m41t00_chip->year] = year; if (i2c_master_send(save_client, wbuf, 9) < 0) dev_err(&save_client->dev, "m41t00_set: Write error\n"); diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 3e31f1d265c..b05378a3d67 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -95,16 +95,32 @@ struct device_driver i2c_adapter_driver = { .bus = &i2c_bus_type, }; +/* ------------------------------------------------------------------------- */ + +/* I2C bus adapters -- one roots each I2C or SMBUS segment */ + static void i2c_adapter_class_dev_release(struct class_device *dev) { struct i2c_adapter *adap = class_dev_to_i2c_adapter(dev); complete(&adap->class_dev_released); } +static ssize_t i2c_adapter_show_name(struct class_device *cdev, char *buf) +{ + struct i2c_adapter *adap = class_dev_to_i2c_adapter(cdev); + return sprintf(buf, "%s\n", adap->name); +} + +static struct class_device_attribute i2c_adapter_attrs[] = { + __ATTR(name, S_IRUGO, i2c_adapter_show_name, NULL), + { }, +}; + struct class i2c_adapter_class = { - .owner = THIS_MODULE, - .name = "i2c-adapter", - .release = &i2c_adapter_class_dev_release, + .owner = THIS_MODULE, + .name = "i2c-adapter", + .class_dev_attrs = i2c_adapter_attrs, + .release = &i2c_adapter_class_dev_release, }; static ssize_t show_adapter_name(struct device *dev, struct device_attribute *attr, char *buf) @@ -175,8 +191,12 @@ int i2c_add_adapter(struct i2c_adapter *adap) * If the parent pointer is not set up, * we add this adapter to the host bus. */ - if (adap->dev.parent == NULL) + if (adap->dev.parent == NULL) { adap->dev.parent = &platform_bus; + printk(KERN_WARNING "**WARNING** I2C adapter driver [%s] " + "forgot to specify physical device; fix it!\n", + adap->name); + } sprintf(adap->dev.bus_id, "i2c-%d", adap->nr); adap->dev.driver = &i2c_adapter_driver; adap->dev.release = &i2c_adapter_dev_release; diff --git a/drivers/ide/pci/atiixp.c b/drivers/ide/pci/atiixp.c index ffdffb6379e..524e65de439 100644 --- a/drivers/ide/pci/atiixp.c +++ b/drivers/ide/pci/atiixp.c @@ -46,6 +46,8 @@ static atiixp_ide_timing mdma_timing[] = { static int save_mdma_mode[4]; +static DEFINE_SPINLOCK(atiixp_lock); + /** * atiixp_ratemask - compute rate mask for ATIIXP IDE * @drive: IDE drive to compute for @@ -105,7 +107,7 @@ static int atiixp_ide_dma_host_on(ide_drive_t *drive) unsigned long flags; u16 tmp16; - spin_lock_irqsave(&ide_lock, flags); + spin_lock_irqsave(&atiixp_lock, flags); pci_read_config_word(dev, ATIIXP_IDE_UDMA_CONTROL, &tmp16); if (save_mdma_mode[drive->dn]) @@ -114,7 +116,7 @@ static int atiixp_ide_dma_host_on(ide_drive_t *drive) tmp16 |= (1 << drive->dn); pci_write_config_word(dev, ATIIXP_IDE_UDMA_CONTROL, tmp16); - spin_unlock_irqrestore(&ide_lock, flags); + spin_unlock_irqrestore(&atiixp_lock, flags); return __ide_dma_host_on(drive); } @@ -125,13 +127,13 @@ static int atiixp_ide_dma_host_off(ide_drive_t *drive) unsigned long flags; u16 tmp16; - spin_lock_irqsave(&ide_lock, flags); + spin_lock_irqsave(&atiixp_lock, flags); pci_read_config_word(dev, ATIIXP_IDE_UDMA_CONTROL, &tmp16); tmp16 &= ~(1 << drive->dn); pci_write_config_word(dev, ATIIXP_IDE_UDMA_CONTROL, tmp16); - spin_unlock_irqrestore(&ide_lock, flags); + spin_unlock_irqrestore(&atiixp_lock, flags); return __ide_dma_host_off(drive); } @@ -152,7 +154,7 @@ static void atiixp_tuneproc(ide_drive_t *drive, u8 pio) u32 pio_timing_data; u16 pio_mode_data; - spin_lock_irqsave(&ide_lock, flags); + spin_lock_irqsave(&atiixp_lock, flags); pci_read_config_word(dev, ATIIXP_IDE_PIO_MODE, &pio_mode_data); pio_mode_data &= ~(0x07 << (drive->dn * 4)); @@ -165,7 +167,7 @@ static void atiixp_tuneproc(ide_drive_t *drive, u8 pio) (pio_timing[pio].command_width << (timing_shift + 4)); pci_write_config_dword(dev, ATIIXP_IDE_PIO_TIMING, pio_timing_data); - spin_unlock_irqrestore(&ide_lock, flags); + spin_unlock_irqrestore(&atiixp_lock, flags); } /** @@ -189,7 +191,7 @@ static int atiixp_speedproc(ide_drive_t *drive, u8 xferspeed) speed = ide_rate_filter(atiixp_ratemask(drive), xferspeed); - spin_lock_irqsave(&ide_lock, flags); + spin_lock_irqsave(&atiixp_lock, flags); save_mdma_mode[drive->dn] = 0; if (speed >= XFER_UDMA_0) { @@ -208,7 +210,7 @@ static int atiixp_speedproc(ide_drive_t *drive, u8 xferspeed) } } - spin_unlock_irqrestore(&ide_lock, flags); + spin_unlock_irqrestore(&atiixp_lock, flags); if (speed >= XFER_SW_DMA_0) pio = atiixp_dma_2_pio(speed); diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c index 61f1a9665a7..381cc6f101c 100644 --- a/drivers/ide/pci/via82cxxx.c +++ b/drivers/ide/pci/via82cxxx.c @@ -123,7 +123,7 @@ struct via82cxxx_dev static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing) { struct pci_dev *dev = hwif->pci_dev; - struct via82cxxx_dev *vdev = ide_get_hwifdata(hwif); + struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); u8 t; if (~vdev->via_config->flags & VIA_BAD_AST) { @@ -162,7 +162,7 @@ static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing) static int via_set_drive(ide_drive_t *drive, u8 speed) { ide_drive_t *peer = HWIF(drive)->drives + (~drive->dn & 1); - struct via82cxxx_dev *vdev = ide_get_hwifdata(drive->hwif); + struct via82cxxx_dev *vdev = pci_get_drvdata(drive->hwif->pci_dev); struct ide_timing t, p; unsigned int T, UT; @@ -225,7 +225,7 @@ static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio) static int via82cxxx_ide_dma_check (ide_drive_t *drive) { ide_hwif_t *hwif = HWIF(drive); - struct via82cxxx_dev *vdev = ide_get_hwifdata(hwif); + struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); u16 w80 = hwif->udma_four; u16 speed = ide_find_best_mode(drive, @@ -262,6 +262,53 @@ static struct via_isa_bridge *via_config_find(struct pci_dev **isa) return via_config; } +/* + * Check and handle 80-wire cable presence + */ +static void __devinit via_cable_detect(struct via82cxxx_dev *vdev, u32 u) +{ + int i; + + switch (vdev->via_config->flags & VIA_UDMA) { + case VIA_UDMA_66: + for (i = 24; i >= 0; i -= 8) + if (((u >> (i & 16)) & 8) && + ((u >> i) & 0x20) && + (((u >> i) & 7) < 2)) { + /* + * 2x PCI clock and + * UDMA w/ < 3T/cycle + */ + vdev->via_80w |= (1 << (1 - (i >> 4))); + } + break; + + case VIA_UDMA_100: + for (i = 24; i >= 0; i -= 8) + if (((u >> i) & 0x10) || + (((u >> i) & 0x20) && + (((u >> i) & 7) < 4))) { + /* BIOS 80-wire bit or + * UDMA w/ < 60ns/cycle + */ + vdev->via_80w |= (1 << (1 - (i >> 4))); + } + break; + + case VIA_UDMA_133: + for (i = 24; i >= 0; i -= 8) + if (((u >> i) & 0x10) || + (((u >> i) & 0x20) && + (((u >> i) & 7) < 6))) { + /* BIOS 80-wire bit or + * UDMA w/ < 60ns/cycle + */ + vdev->via_80w |= (1 << (1 - (i >> 4))); + } + break; + } +} + /** * init_chipset_via82cxxx - initialization handler * @dev: PCI device @@ -274,14 +321,22 @@ static struct via_isa_bridge *via_config_find(struct pci_dev **isa) static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const char *name) { struct pci_dev *isa = NULL; + struct via82cxxx_dev *vdev; struct via_isa_bridge *via_config; u8 t, v; - unsigned int u; + u32 u; + + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) { + printk(KERN_ERR "VP_IDE: out of memory :(\n"); + return -ENOMEM; + } + pci_set_drvdata(dev, vdev); /* * Find the ISA bridge to see how good the IDE is. */ - via_config = via_config_find(&isa); + vdev->via_config = via_config = via_config_find(&isa); /* We checked this earlier so if it fails here deeep badness is involved */ @@ -289,16 +344,17 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const BUG_ON(!via_config->id); /* - * Setup or disable Clk66 if appropriate + * Detect cable and configure Clk66 */ + pci_read_config_dword(dev, VIA_UDMA_TIMING, &u); + + via_cable_detect(vdev, u); if ((via_config->flags & VIA_UDMA) == VIA_UDMA_66) { /* Enable Clk66 */ - pci_read_config_dword(dev, VIA_UDMA_TIMING, &u); pci_write_config_dword(dev, VIA_UDMA_TIMING, u|0x80008); } else if (via_config->flags & VIA_BAD_CLK66) { /* Would cause trouble on 596a and 686 */ - pci_read_config_dword(dev, VIA_UDMA_TIMING, &u); pci_write_config_dword(dev, VIA_UDMA_TIMING, u & ~0x80008); } @@ -367,75 +423,11 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const return 0; } -/* - * Check and handle 80-wire cable presence - */ -static void __devinit via_cable_detect(struct pci_dev *dev, struct via82cxxx_dev *vdev) -{ - unsigned int u; - int i; - pci_read_config_dword(dev, VIA_UDMA_TIMING, &u); - - switch (vdev->via_config->flags & VIA_UDMA) { - - case VIA_UDMA_66: - for (i = 24; i >= 0; i -= 8) - if (((u >> (i & 16)) & 8) && - ((u >> i) & 0x20) && - (((u >> i) & 7) < 2)) { - /* - * 2x PCI clock and - * UDMA w/ < 3T/cycle - */ - vdev->via_80w |= (1 << (1 - (i >> 4))); - } - break; - - case VIA_UDMA_100: - for (i = 24; i >= 0; i -= 8) - if (((u >> i) & 0x10) || - (((u >> i) & 0x20) && - (((u >> i) & 7) < 4))) { - /* BIOS 80-wire bit or - * UDMA w/ < 60ns/cycle - */ - vdev->via_80w |= (1 << (1 - (i >> 4))); - } - break; - - case VIA_UDMA_133: - for (i = 24; i >= 0; i -= 8) - if (((u >> i) & 0x10) || - (((u >> i) & 0x20) && - (((u >> i) & 7) < 6))) { - /* BIOS 80-wire bit or - * UDMA w/ < 60ns/cycle - */ - vdev->via_80w |= (1 << (1 - (i >> 4))); - } - break; - - } -} - static void __devinit init_hwif_via82cxxx(ide_hwif_t *hwif) { - struct via82cxxx_dev *vdev = kmalloc(sizeof(struct via82cxxx_dev), - GFP_KERNEL); - struct pci_dev *isa = NULL; + struct via82cxxx_dev *vdev = pci_get_drvdata(hwif->pci_dev); int i; - if (vdev == NULL) { - printk(KERN_ERR "VP_IDE: out of memory :(\n"); - return; - } - - memset(vdev, 0, sizeof(struct via82cxxx_dev)); - ide_set_hwifdata(hwif, vdev); - - vdev->via_config = via_config_find(&isa); - via_cable_detect(hwif->pci_dev, vdev); - hwif->autodma = 0; hwif->tuneproc = &via82cxxx_tune_drive; diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 100df6f38d9..91e0c75aca8 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -52,6 +52,8 @@ #define KVM_MAX_VCPUS 1 #define KVM_MEMORY_SLOTS 4 #define KVM_NUM_MMU_PAGES 256 +#define KVM_MIN_FREE_MMU_PAGES 5 +#define KVM_REFILL_PAGES 25 #define FX_IMAGE_SIZE 512 #define FX_IMAGE_ALIGN 16 @@ -89,14 +91,54 @@ typedef unsigned long hva_t; typedef u64 hpa_t; typedef unsigned long hfn_t; +#define NR_PTE_CHAIN_ENTRIES 5 + +struct kvm_pte_chain { + u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; + struct hlist_node link; +}; + +/* + * kvm_mmu_page_role, below, is defined as: + * + * bits 0:3 - total guest paging levels (2-4, or zero for real mode) + * bits 4:7 - page table level for this shadow (1-4) + * bits 8:9 - page table quadrant for 2-level guests + * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) + */ +union kvm_mmu_page_role { + unsigned word; + struct { + unsigned glevels : 4; + unsigned level : 4; + unsigned quadrant : 2; + unsigned pad_for_nice_hex_output : 6; + unsigned metaphysical : 1; + }; +}; + struct kvm_mmu_page { struct list_head link; + struct hlist_node hash_link; + + /* + * The following two entries are used to key the shadow page in the + * hash table. + */ + gfn_t gfn; + union kvm_mmu_page_role role; + hpa_t page_hpa; unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. */ int global; /* Set if all ptes in this page are global */ - u64 *parent_pte; + int multimapped; /* More than one parent_pte? */ + int root_count; /* Currently serving as active root */ + union { + u64 *parent_pte; /* !multimapped */ + struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ + }; }; struct vmcs { @@ -117,14 +159,26 @@ struct kvm_vcpu; struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); - void (*inval_page)(struct kvm_vcpu *vcpu, gva_t gva); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); hpa_t root_hpa; int root_level; int shadow_root_level; + + u64 *pae_root; +}; + +#define KVM_NR_MEM_OBJS 20 + +struct kvm_mmu_memory_cache { + int nobjs; + void *objects[KVM_NR_MEM_OBJS]; }; +/* + * We don't want allocation failures within the mmu code, so we preallocate + * enough memory for a single page fault in a cache. + */ struct kvm_guest_debug { int enabled; unsigned long bp[4]; @@ -173,6 +227,7 @@ struct kvm_vcpu { struct mutex mutex; int cpu; int launched; + int interrupt_window_open; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) unsigned long irq_pending[NR_IRQ_WORDS]; @@ -184,6 +239,7 @@ struct kvm_vcpu { unsigned long cr3; unsigned long cr4; unsigned long cr8; + u64 pdptrs[4]; /* pae */ u64 shadow_efer; u64 apic_base; int nmsrs; @@ -194,6 +250,12 @@ struct kvm_vcpu { struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES]; struct kvm_mmu mmu; + struct kvm_mmu_memory_cache mmu_pte_chain_cache; + struct kvm_mmu_memory_cache mmu_rmap_desc_cache; + + gfn_t last_pt_write_gfn; + int last_pt_write_count; + struct kvm_guest_debug guest_debug; char fx_buf[FX_BUF_SIZE]; @@ -231,10 +293,16 @@ struct kvm { spinlock_t lock; /* protects everything except vcpus */ int nmemslots; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; + /* + * Hash table of struct kvm_mmu_page. + */ struct list_head active_mmu_pages; + int n_free_mmu_pages; + struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; int memory_config_version; int busy; + unsigned long rmap_overflow; }; struct kvm_stat { @@ -247,6 +315,9 @@ struct kvm_stat { u32 io_exits; u32 mmio_exits; u32 signal_exits; + u32 irq_window_exits; + u32 halt_exits; + u32 request_irq_exits; u32 irq_exits; }; @@ -279,6 +350,7 @@ struct kvm_arch_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*decache_cr0_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu, unsigned long cr0); @@ -323,7 +395,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot); hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) @@ -396,6 +468,19 @@ int kvm_write_guest(struct kvm_vcpu *vcpu, unsigned long segment_base(u16 selector); +void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); +void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); + +static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) +{ + if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + kvm_mmu_free_some_pages(vcpu); + return vcpu->mmu.page_fault(vcpu, gva, error_code); +} + static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); @@ -541,19 +626,4 @@ static inline u32 get_rdx_init_val(void) #define TSS_REDIRECTION_SIZE (256 / 8) #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) -#ifdef CONFIG_X86_64 - -/* - * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. Therefore - * we need to allocate shadow page tables in the first 4GB of memory, which - * happens to fit the DMA32 zone. - */ -#define GFP_KVM_MMU (GFP_KERNEL | __GFP_DMA32) - -#else - -#define GFP_KVM_MMU GFP_KERNEL - -#endif - #endif diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index ce7fe640f18..67c1154960f 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -58,6 +58,9 @@ static struct kvm_stats_debugfs_item { { "io_exits", &kvm_stat.io_exits }, { "mmio_exits", &kvm_stat.mmio_exits }, { "signal_exits", &kvm_stat.signal_exits }, + { "irq_window", &kvm_stat.irq_window_exits }, + { "halt_exits", &kvm_stat.halt_exits }, + { "request_irq", &kvm_stat.request_irq_exits }, { "irq_exits", &kvm_stat.irq_exits }, { 0, 0 } }; @@ -227,6 +230,7 @@ static int kvm_dev_open(struct inode *inode, struct file *filp) struct kvm_vcpu *vcpu = &kvm->vcpus[i]; mutex_init(&vcpu->mutex); + vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; INIT_LIST_HEAD(&vcpu->free_pages); } @@ -268,8 +272,8 @@ static void kvm_free_physmem(struct kvm *kvm) static void kvm_free_vcpu(struct kvm_vcpu *vcpu) { - kvm_arch_ops->vcpu_free(vcpu); kvm_mmu_destroy(vcpu); + kvm_arch_ops->vcpu_free(vcpu); } static void kvm_free_vcpus(struct kvm *kvm) @@ -295,14 +299,17 @@ static void inject_gp(struct kvm_vcpu *vcpu) kvm_arch_ops->inject_gp(vcpu, 0); } -static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu, - unsigned long cr3) +/* + * Load the pae pdptrs. Return true is they are all valid. + */ +static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; - unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5; + unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; u64 pdpte; u64 *pdpt; + int ret; struct kvm_memory_slot *memslot; spin_lock(&vcpu->kvm->lock); @@ -310,16 +317,23 @@ static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu, /* FIXME: !memslot - emulate? 0xff? */ pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0); + ret = 1; for (i = 0; i < 4; ++i) { pdpte = pdpt[offset + i]; - if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) - break; + if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) { + ret = 0; + goto out; + } } + for (i = 0; i < 4; ++i) + vcpu->pdptrs[i] = pdpt[offset + i]; + +out: kunmap_atomic(pdpt, KM_USER0); spin_unlock(&vcpu->kvm->lock); - return i != 4; + return ret; } void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) @@ -365,8 +379,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } } else #endif - if (is_pae(vcpu) && - pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { printk(KERN_DEBUG "set_cr0: #GP, pdptrs " "reserved bits\n"); inject_gp(vcpu); @@ -387,6 +400,7 @@ EXPORT_SYMBOL_GPL(set_cr0); void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { + kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(lmsw); @@ -407,7 +421,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return; } } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK) - && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { + && !load_pdptrs(vcpu, vcpu->cr3)) { printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); inject_gp(vcpu); } @@ -439,7 +453,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return; } if (is_paging(vcpu) && is_pae(vcpu) && - pdptrs_have_reserved_bits_set(vcpu, cr3)) { + !load_pdptrs(vcpu, cr3)) { printk(KERN_DEBUG "set_cr3: #GP, pdptrs " "reserved bits\n"); inject_gp(vcpu); @@ -449,7 +463,19 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->cr3 = cr3; spin_lock(&vcpu->kvm->lock); - vcpu->mmu.new_cr3(vcpu); + /* + * Does the new cr3 value map to physical memory? (Note, we + * catch an invalid cr3 even in real-mode, because it would + * cause trouble later on when we turn on paging anyway.) + * + * A real CPU would silently accept an invalid cr3 and would + * attempt to use it - with largely undefined (and often hard + * to debug) behavior on the guest side. + */ + if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) + inject_gp(vcpu); + else + vcpu->mmu.new_cr3(vcpu); spin_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr3); @@ -517,7 +543,6 @@ static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n) vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; vcpu->cpu = -1; /* First load will set up TR */ - vcpu->kvm = kvm; r = kvm_arch_ops->vcpu_create(vcpu); if (r < 0) goto out_free_vcpus; @@ -634,6 +659,7 @@ raced: | __GFP_ZERO); if (!new.phys_mem[i]) goto out_free; + new.phys_mem[i]->private = 0; } } @@ -688,6 +714,13 @@ out: return r; } +static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot) +{ + spin_lock(&vcpu->kvm->lock); + kvm_mmu_slot_remove_write_access(vcpu, slot); + spin_unlock(&vcpu->kvm->lock); +} + /* * Get (and clear) the dirty memory log for a memory slot. */ @@ -697,6 +730,7 @@ static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; int r, i; int n; + int cleared; unsigned long any = 0; spin_lock(&kvm->lock); @@ -727,15 +761,17 @@ static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm, if (any) { - spin_lock(&kvm->lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - spin_unlock(&kvm->lock); - memset(memslot->dirty_bitmap, 0, n); + cleared = 0; for (i = 0; i < KVM_MAX_VCPUS; ++i) { struct kvm_vcpu *vcpu = vcpu_load(kvm, i); if (!vcpu) continue; + if (!cleared) { + do_remove_write_access(vcpu, log->slot); + memset(memslot->dirty_bitmap, 0, n); + cleared = 1; + } kvm_arch_ops->tlb_flush(vcpu); vcpu_put(vcpu); } @@ -863,6 +899,27 @@ static int emulator_read_emulated(unsigned long addr, } } +static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + unsigned long val, int bytes) +{ + struct kvm_memory_slot *m; + struct page *page; + void *virt; + + if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) + return 0; + m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!m) + return 0; + page = gfn_to_page(m, gpa >> PAGE_SHIFT); + kvm_mmu_pre_write(vcpu, gpa, bytes); + virt = kmap_atomic(page, KM_USER0); + memcpy(virt + offset_in_page(gpa), &val, bytes); + kunmap_atomic(virt, KM_USER0); + kvm_mmu_post_write(vcpu, gpa, bytes); + return 1; +} + static int emulator_write_emulated(unsigned long addr, unsigned long val, unsigned int bytes, @@ -874,6 +931,9 @@ static int emulator_write_emulated(unsigned long addr, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return X86EMUL_CONTINUE; + vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; @@ -898,6 +958,30 @@ static int emulator_cmpxchg_emulated(unsigned long addr, return emulator_write_emulated(addr, new, bytes, ctxt); } +#ifdef CONFIG_X86_32 + +static int emulator_cmpxchg8b_emulated(unsigned long addr, + unsigned long old_lo, + unsigned long old_hi, + unsigned long new_lo, + unsigned long new_hi, + struct x86_emulate_ctxt *ctxt) +{ + static int reported; + int r; + + if (!reported) { + reported = 1; + printk(KERN_WARNING "kvm: emulating exchange8b as write\n"); + } + r = emulator_write_emulated(addr, new_lo, 4, ctxt); + if (r != X86EMUL_CONTINUE) + return r; + return emulator_write_emulated(addr+4, new_hi, 4, ctxt); +} + +#endif + static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_arch_ops->get_segment_base(vcpu, seg); @@ -905,18 +989,15 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) { - spin_lock(&vcpu->kvm->lock); - vcpu->mmu.inval_page(vcpu, address); - spin_unlock(&vcpu->kvm->lock); - kvm_arch_ops->invlpg(vcpu, address); return X86EMUL_CONTINUE; } int emulate_clts(struct kvm_vcpu *vcpu) { - unsigned long cr0 = vcpu->cr0; + unsigned long cr0; - cr0 &= ~CR0_TS_MASK; + kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); + cr0 = vcpu->cr0 & ~CR0_TS_MASK; kvm_arch_ops->set_cr0(vcpu, cr0); return X86EMUL_CONTINUE; } @@ -975,6 +1056,9 @@ struct x86_emulate_ops emulate_ops = { .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, +#ifdef CONFIG_X86_32 + .cmpxchg8b_emulated = emulator_cmpxchg8b_emulated, +#endif }; int emulate_instruction(struct kvm_vcpu *vcpu, @@ -1024,6 +1108,8 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; if (!vcpu->mmio_needed) { report_emulation_failure(&emulate_ctxt); return EMULATE_FAIL; @@ -1069,6 +1155,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { + kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); switch (cr) { case 0: return vcpu->cr0; @@ -1403,6 +1490,7 @@ static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs) sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; + kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); sregs->cr0 = vcpu->cr0; sregs->cr2 = vcpu->cr2; sregs->cr3 = vcpu->cr3; @@ -1467,11 +1555,15 @@ static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs) #endif vcpu->apic_base = sregs->apic_base; + kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); + mmu_reset_needed |= vcpu->cr0 != sregs->cr0; kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0); mmu_reset_needed |= vcpu->cr4 != sregs->cr4; kvm_arch_ops->set_cr4(vcpu, sregs->cr4); + if (!is_long_mode(vcpu) && is_pae(vcpu)) + load_pdptrs(vcpu, vcpu->cr3); if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); @@ -1693,12 +1785,12 @@ static long kvm_dev_ioctl(struct file *filp, if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run)) goto out; r = kvm_dev_ioctl_run(kvm, &kvm_run); - if (r < 0) + if (r < 0 && r != -EINTR) goto out; - r = -EFAULT; - if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) + if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) { + r = -EFAULT; goto out; - r = 0; + } break; } case KVM_GET_REGS: { @@ -1842,6 +1934,7 @@ static long kvm_dev_ioctl(struct file *filp, num_msrs_to_save * sizeof(u32))) goto out; r = 0; + break; } default: ; @@ -1944,17 +2037,17 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) return -EEXIST; } - kvm_arch_ops = ops; - - if (!kvm_arch_ops->cpu_has_kvm_support()) { + if (!ops->cpu_has_kvm_support()) { printk(KERN_ERR "kvm: no hardware support\n"); return -EOPNOTSUPP; } - if (kvm_arch_ops->disabled_by_bios()) { + if (ops->disabled_by_bios()) { printk(KERN_ERR "kvm: disabled by bios\n"); return -EOPNOTSUPP; } + kvm_arch_ops = ops; + r = kvm_arch_ops->hardware_setup(); if (r < 0) return r; diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 790423c5f23..c6f972914f0 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -26,7 +26,31 @@ #include "vmx.h" #include "kvm.h" +#undef MMU_DEBUG + +#undef AUDIT + +#ifdef AUDIT +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} +#endif + +#ifdef MMU_DEBUG + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) + +#else + #define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) do { } while (0) + +#endif + +#if defined(MMU_DEBUG) || defined(AUDIT) +static int dbg = 1; +#endif #define ASSERT(x) \ if (!(x)) { \ @@ -34,8 +58,10 @@ __FILE__, __LINE__, #x); \ } -#define PT64_ENT_PER_PAGE 512 -#define PT32_ENT_PER_PAGE 1024 +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) #define PT_WRITABLE_SHIFT 1 @@ -125,6 +151,13 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 +#define RMAP_EXT 4 + +struct kvm_rmap_desc { + u64 *shadow_ptes[RMAP_EXT]; + struct kvm_rmap_desc *more; +}; + static int is_write_protection(struct kvm_vcpu *vcpu) { return vcpu->cr0 & CR0_WP_MASK; @@ -150,32 +183,272 @@ static int is_io_pte(unsigned long pte) return pte & PT_SHADOW_IO_MARK; } +static int is_rmap_pte(u64 pte) +{ + return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK)) + == (PT_WRITABLE_MASK | PT_PRESENT_MASK); +} + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + size_t objsize, int min) +{ + void *obj; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + obj = kzalloc(objsize, GFP_NOWAIT); + if (!obj) + return -ENOMEM; + cache->objects[cache->nobjs++] = obj; + } + return 0; +} + +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + kfree(mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r; + + r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, + sizeof(struct kvm_pte_chain), 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, + sizeof(struct kvm_rmap_desc), 1); +out: + return r; +} + +static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); + mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, + size_t size) +{ + void *p; + + BUG_ON(!mc->nobjs); + p = mc->objects[--mc->nobjs]; + memset(p, 0, size); + return p; +} + +static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj) +{ + if (mc->nobjs < KVM_NR_MEM_OBJS) + mc->objects[mc->nobjs++] = obj; + else + kfree(obj); +} + +static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache, + sizeof(struct kvm_pte_chain)); +} + +static void mmu_free_pte_chain(struct kvm_vcpu *vcpu, + struct kvm_pte_chain *pc) +{ + mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc); +} + +static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache, + sizeof(struct kvm_rmap_desc)); +} + +static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu, + struct kvm_rmap_desc *rd) +{ + mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd); +} + +/* + * Reverse mapping data structures: + * + * If page->private bit zero is zero, then page->private points to the + * shadow page table entry that points to page_address(page). + * + * If page->private bit zero is one, (then page->private & ~1) points + * to a struct kvm_rmap_desc containing more mappings. + */ +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte) +{ + struct page *page; + struct kvm_rmap_desc *desc; + int i; + + if (!is_rmap_pte(*spte)) + return; + page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + if (!page->private) { + rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); + page->private = (unsigned long)spte; + } else if (!(page->private & 1)) { + rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); + desc = mmu_alloc_rmap_desc(vcpu); + desc->shadow_ptes[0] = (u64 *)page->private; + desc->shadow_ptes[1] = spte; + page->private = (unsigned long)desc | 1; + } else { + rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(page->private & ~1ul); + while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + desc = desc->more; + if (desc->shadow_ptes[RMAP_EXT-1]) { + desc->more = mmu_alloc_rmap_desc(vcpu); + desc = desc->more; + } + for (i = 0; desc->shadow_ptes[i]; ++i) + ; + desc->shadow_ptes[i] = spte; + } +} + +static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu, + struct page *page, + struct kvm_rmap_desc *desc, + int i, + struct kvm_rmap_desc *prev_desc) +{ + int j; + + for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) + ; + desc->shadow_ptes[i] = desc->shadow_ptes[j]; + desc->shadow_ptes[j] = 0; + if (j != 0) + return; + if (!prev_desc && !desc->more) + page->private = (unsigned long)desc->shadow_ptes[0]; + else + if (prev_desc) + prev_desc->more = desc->more; + else + page->private = (unsigned long)desc->more | 1; + mmu_free_rmap_desc(vcpu, desc); +} + +static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte) +{ + struct page *page; + struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; + int i; + + if (!is_rmap_pte(*spte)) + return; + page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + if (!page->private) { + printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); + BUG(); + } else if (!(page->private & 1)) { + rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); + if ((u64 *)page->private != spte) { + printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", + spte, *spte); + BUG(); + } + page->private = 0; + } else { + rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(page->private & ~1ul); + prev_desc = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) + if (desc->shadow_ptes[i] == spte) { + rmap_desc_remove_entry(vcpu, page, + desc, i, + prev_desc); + return; + } + prev_desc = desc; + desc = desc->more; + } + BUG(); + } +} + +static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +{ + struct kvm *kvm = vcpu->kvm; + struct page *page; + struct kvm_memory_slot *slot; + struct kvm_rmap_desc *desc; + u64 *spte; + + slot = gfn_to_memslot(kvm, gfn); + BUG_ON(!slot); + page = gfn_to_page(slot, gfn); + + while (page->private) { + if (!(page->private & 1)) + spte = (u64 *)page->private; + else { + desc = (struct kvm_rmap_desc *)(page->private & ~1ul); + spte = desc->shadow_ptes[0]; + } + BUG_ON(!spte); + BUG_ON((*spte & PT64_BASE_ADDR_MASK) != + page_to_pfn(page) << PAGE_SHIFT); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + BUG_ON(!(*spte & PT_WRITABLE_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); + rmap_remove(vcpu, spte); + kvm_arch_ops->tlb_flush(vcpu); + *spte &= ~(u64)PT_WRITABLE_MASK; + } +} + +static int is_empty_shadow_page(hpa_t page_hpa) +{ + u64 *pos; + u64 *end; + + for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64); + pos != end; pos++) + if (*pos != 0) { + printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, + pos, *pos); + return 0; + } + return 1; +} + static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) { struct kvm_mmu_page *page_head = page_header(page_hpa); + ASSERT(is_empty_shadow_page(page_hpa)); list_del(&page_head->link); page_head->page_hpa = page_hpa; list_add(&page_head->link, &vcpu->free_pages); + ++vcpu->kvm->n_free_mmu_pages; } -static int is_empty_shadow_page(hpa_t page_hpa) +static unsigned kvm_page_table_hashfn(gfn_t gfn) { - u32 *pos; - u32 *end; - for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32); - pos != end; pos++) - if (*pos != 0) - return 0; - return 1; + return gfn; } -static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte) +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, + u64 *parent_pte) { struct kvm_mmu_page *page; if (list_empty(&vcpu->free_pages)) - return INVALID_PAGE; + return NULL; page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); list_del(&page->link); @@ -183,8 +456,239 @@ static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte) ASSERT(is_empty_shadow_page(page->page_hpa)); page->slot_bitmap = 0; page->global = 1; + page->multimapped = 0; page->parent_pte = parent_pte; - return page->page_hpa; + --vcpu->kvm->n_free_mmu_pages; + return page; +} + +static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!parent_pte) + return; + if (!page->multimapped) { + u64 *old = page->parent_pte; + + if (!old) { + page->parent_pte = parent_pte; + return; + } + page->multimapped = 1; + pte_chain = mmu_alloc_pte_chain(vcpu); + INIT_HLIST_HEAD(&page->parent_ptes); + hlist_add_head(&pte_chain->link, &page->parent_ptes); + pte_chain->parent_ptes[0] = old; + } + hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) { + if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) + continue; + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) + if (!pte_chain->parent_ptes[i]) { + pte_chain->parent_ptes[i] = parent_pte; + return; + } + } + pte_chain = mmu_alloc_pte_chain(vcpu); + BUG_ON(!pte_chain); + hlist_add_head(&pte_chain->link, &page->parent_ptes); + pte_chain->parent_ptes[0] = parent_pte; +} + +static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!page->multimapped) { + BUG_ON(page->parent_pte != parent_pte); + page->parent_pte = NULL; + return; + } + hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + if (pte_chain->parent_ptes[i] != parent_pte) + continue; + while (i + 1 < NR_PTE_CHAIN_ENTRIES + && pte_chain->parent_ptes[i + 1]) { + pte_chain->parent_ptes[i] + = pte_chain->parent_ptes[i + 1]; + ++i; + } + pte_chain->parent_ptes[i] = NULL; + if (i == 0) { + hlist_del(&pte_chain->link); + mmu_free_pte_chain(vcpu, pte_chain); + if (hlist_empty(&page->parent_ptes)) { + page->multimapped = 0; + page->parent_pte = NULL; + } + } + return; + } + BUG(); +} + +static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu, + gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *page; + struct hlist_node *node; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry(page, node, bucket, hash_link) + if (page->gfn == gfn && !page->role.metaphysical) { + pgprintk("%s: found role %x\n", + __FUNCTION__, page->role.word); + return page; + } + return NULL; +} + +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + gva_t gaddr, + unsigned level, + int metaphysical, + u64 *parent_pte) +{ + union kvm_mmu_page_role role; + unsigned index; + unsigned quadrant; + struct hlist_head *bucket; + struct kvm_mmu_page *page; + struct hlist_node *node; + + role.word = 0; + role.glevels = vcpu->mmu.root_level; + role.level = level; + role.metaphysical = metaphysical; + if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { + quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); + quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; + role.quadrant = quadrant; + } + pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, + gfn, role.word); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry(page, node, bucket, hash_link) + if (page->gfn == gfn && page->role.word == role.word) { + mmu_page_add_parent_pte(vcpu, page, parent_pte); + pgprintk("%s: found\n", __FUNCTION__); + return page; + } + page = kvm_mmu_alloc_page(vcpu, parent_pte); + if (!page) + return page; + pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); + page->gfn = gfn; + page->role = role; + hlist_add_head(&page->hash_link, bucket); + if (!metaphysical) + rmap_write_protect(vcpu, gfn); + return page; +} + +static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page) +{ + unsigned i; + u64 *pt; + u64 ent; + + pt = __va(page->page_hpa); + + if (page->role.level == PT_PAGE_TABLE_LEVEL) { + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] & PT_PRESENT_MASK) + rmap_remove(vcpu, &pt[i]); + pt[i] = 0; + } + kvm_arch_ops->tlb_flush(vcpu); + return; + } + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + ent = pt[i]; + + pt[i] = 0; + if (!(ent & PT_PRESENT_MASK)) + continue; + ent &= PT64_BASE_ADDR_MASK; + mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]); + } +} + +static void kvm_mmu_put_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *parent_pte) +{ + mmu_page_remove_parent_pte(vcpu, page, parent_pte); +} + +static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page) +{ + u64 *parent_pte; + + while (page->multimapped || page->parent_pte) { + if (!page->multimapped) + parent_pte = page->parent_pte; + else { + struct kvm_pte_chain *chain; + + chain = container_of(page->parent_ptes.first, + struct kvm_pte_chain, link); + parent_pte = chain->parent_ptes[0]; + } + BUG_ON(!parent_pte); + kvm_mmu_put_page(vcpu, page, parent_pte); + *parent_pte = 0; + } + kvm_mmu_page_unlink_children(vcpu, page); + if (!page->root_count) { + hlist_del(&page->hash_link); + kvm_mmu_free_page(vcpu, page->page_hpa); + } else { + list_del(&page->link); + list_add(&page->link, &vcpu->kvm->active_mmu_pages); + } +} + +static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *page; + struct hlist_node *node, *n; + int r; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + r = 0; + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry_safe(page, node, n, bucket, hash_link) + if (page->gfn == gfn && !page->role.metaphysical) { + pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, + page->role.word); + kvm_mmu_zap_page(vcpu, page); + r = 1; + } + return r; } static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) @@ -225,35 +729,6 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) return gpa_to_hpa(vcpu, gpa); } - -static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa, - int level) -{ - ASSERT(vcpu); - ASSERT(VALID_PAGE(page_hpa)); - ASSERT(level <= PT64_ROOT_LEVEL && level > 0); - - if (level == 1) - memset(__va(page_hpa), 0, PAGE_SIZE); - else { - u64 *pos; - u64 *end; - - for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE; - pos != end; pos++) { - u64 current_ent = *pos; - - *pos = 0; - if (is_present_pte(current_ent)) - release_pt_page_64(vcpu, - current_ent & - PT64_BASE_ADDR_MASK, - level - 1); - } - } - kvm_mmu_free_page(vcpu, page_hpa); -} - static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } @@ -266,52 +741,109 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) for (; ; level--) { u32 index = PT64_INDEX(v, level); u64 *table; + u64 pte; ASSERT(VALID_PAGE(table_addr)); table = __va(table_addr); if (level == 1) { + pte = table[index]; + if (is_present_pte(pte) && is_writeble_pte(pte)) + return 0; mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); page_header_update_slot(vcpu->kvm, table, v); table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + rmap_add(vcpu, &table[index]); return 0; } if (table[index] == 0) { - hpa_t new_table = kvm_mmu_alloc_page(vcpu, - &table[index]); - - if (!VALID_PAGE(new_table)) { + struct kvm_mmu_page *new_table; + gfn_t pseudo_gfn; + + pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) + >> PAGE_SHIFT; + new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, + v, level - 1, + 1, &table[index]); + if (!new_table) { pgprintk("nonpaging_map: ENOMEM\n"); return -ENOMEM; } - if (level == PT32E_ROOT_LEVEL) - table[index] = new_table | PT_PRESENT_MASK; - else - table[index] = new_table | PT_PRESENT_MASK | - PT_WRITABLE_MASK | PT_USER_MASK; + table[index] = new_table->page_hpa | PT_PRESENT_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; } table_addr = table[index] & PT64_BASE_ADDR_MASK; } } -static void nonpaging_flush(struct kvm_vcpu *vcpu) +static void mmu_free_roots(struct kvm_vcpu *vcpu) { - hpa_t root = vcpu->mmu.root_hpa; + int i; + struct kvm_mmu_page *page; - ++kvm_stat.tlb_flush; - pgprintk("nonpaging_flush\n"); - ASSERT(VALID_PAGE(root)); - release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); - root = kvm_mmu_alloc_page(vcpu, NULL); - ASSERT(VALID_PAGE(root)); - vcpu->mmu.root_hpa = root; - if (is_paging(vcpu)) - root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)); - kvm_arch_ops->set_cr3(vcpu, root); - kvm_arch_ops->tlb_flush(vcpu); +#ifdef CONFIG_X86_64 + if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->mmu.root_hpa; + + ASSERT(VALID_PAGE(root)); + page = page_header(root); + --page->root_count; + vcpu->mmu.root_hpa = INVALID_PAGE; + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->mmu.pae_root[i]; + + ASSERT(VALID_PAGE(root)); + root &= PT64_BASE_ADDR_MASK; + page = page_header(root); + --page->root_count; + vcpu->mmu.pae_root[i] = INVALID_PAGE; + } + vcpu->mmu.root_hpa = INVALID_PAGE; +} + +static void mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + int i; + gfn_t root_gfn; + struct kvm_mmu_page *page; + + root_gfn = vcpu->cr3 >> PAGE_SHIFT; + +#ifdef CONFIG_X86_64 + if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->mmu.root_hpa; + + ASSERT(!VALID_PAGE(root)); + page = kvm_mmu_get_page(vcpu, root_gfn, 0, + PT64_ROOT_LEVEL, 0, NULL); + root = page->page_hpa; + ++page->root_count; + vcpu->mmu.root_hpa = root; + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) + root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; + else if (vcpu->mmu.root_level == 0) + root_gfn = 0; + page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, !is_paging(vcpu), + NULL); + root = page->page_hpa; + ++page->root_count; + vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) @@ -322,43 +854,29 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { - int ret; gpa_t addr = gva; + hpa_t paddr; + int r; + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); - for (;;) { - hpa_t paddr; - - paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); - if (is_error_hpa(paddr)) - return 1; + paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); - ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr); - if (ret) { - nonpaging_flush(vcpu); - continue; - } - break; - } - return ret; -} + if (is_error_hpa(paddr)) + return 1; -static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) -{ + return nonpaging_map(vcpu, addr & PAGE_MASK, paddr); } static void nonpaging_free(struct kvm_vcpu *vcpu) { - hpa_t root; - - ASSERT(vcpu); - root = vcpu->mmu.root_hpa; - if (VALID_PAGE(root)) - release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); - vcpu->mmu.root_hpa = INVALID_PAGE; + mmu_free_roots(vcpu); } static int nonpaging_init_context(struct kvm_vcpu *vcpu) @@ -367,40 +885,31 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; - context->inval_page = nonpaging_inval_page; context->gva_to_gpa = nonpaging_gva_to_gpa; context->free = nonpaging_free; - context->root_level = PT32E_ROOT_LEVEL; + context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); + mmu_alloc_roots(vcpu); ASSERT(VALID_PAGE(context->root_hpa)); kvm_arch_ops->set_cr3(vcpu, context->root_hpa); return 0; } - static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - struct kvm_mmu_page *page, *npage; - - list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages, - link) { - if (page->global) - continue; - - if (!page->parent_pte) - continue; - - *page->parent_pte = 0; - release_pt_page_64(vcpu, page->page_hpa, 1); - } ++kvm_stat.tlb_flush; kvm_arch_ops->tlb_flush(vcpu); } static void paging_new_cr3(struct kvm_vcpu *vcpu) { + pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); + mmu_free_roots(vcpu); + if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + kvm_mmu_free_some_pages(vcpu); + mmu_alloc_roots(vcpu); kvm_mmu_flush_tlb(vcpu); + kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); } static void mark_pagetable_nonglobal(void *shadow_pte) @@ -412,7 +921,8 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu, u64 *shadow_pte, gpa_t gaddr, int dirty, - u64 access_bits) + u64 access_bits, + gfn_t gfn) { hpa_t paddr; @@ -420,13 +930,10 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu, if (!dirty) access_bits &= ~PT_WRITABLE_MASK; - if (access_bits & PT_WRITABLE_MASK) - mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); + paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); *shadow_pte |= access_bits; - paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); - if (!(*shadow_pte & PT_GLOBAL_MASK)) mark_pagetable_nonglobal(shadow_pte); @@ -434,10 +941,31 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu, *shadow_pte |= gaddr; *shadow_pte |= PT_SHADOW_IO_MARK; *shadow_pte &= ~PT_PRESENT_MASK; - } else { - *shadow_pte |= paddr; - page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); + return; + } + + *shadow_pte |= paddr; + + if (access_bits & PT_WRITABLE_MASK) { + struct kvm_mmu_page *shadow; + + shadow = kvm_mmu_lookup_page(vcpu, gfn); + if (shadow) { + pgprintk("%s: found shadow page for %lx, marking ro\n", + __FUNCTION__, gfn); + access_bits &= ~PT_WRITABLE_MASK; + if (is_writeble_pte(*shadow_pte)) { + *shadow_pte &= ~PT_WRITABLE_MASK; + kvm_arch_ops->tlb_flush(vcpu); + } + } } + + if (access_bits & PT_WRITABLE_MASK) + mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); + + page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); + rmap_add(vcpu, shadow_pte); } static void inject_page_fault(struct kvm_vcpu *vcpu, @@ -474,41 +1002,6 @@ static int may_access(u64 pte, int write, int user) return 1; } -/* - * Remove a shadow pte. - */ -static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) -{ - hpa_t page_addr = vcpu->mmu.root_hpa; - int level = vcpu->mmu.shadow_root_level; - - ++kvm_stat.invlpg; - - for (; ; level--) { - u32 index = PT64_INDEX(addr, level); - u64 *table = __va(page_addr); - - if (level == PT_PAGE_TABLE_LEVEL ) { - table[index] = 0; - return; - } - - if (!is_present_pte(table[index])) - return; - - page_addr = table[index] & PT64_BASE_ADDR_MASK; - - if (level == PT_DIRECTORY_LEVEL && - (table[index] & PT_SHADOW_PS_MARK)) { - table[index] = 0; - release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL); - - kvm_arch_ops->tlb_flush(vcpu); - return; - } - } -} - static void paging_free(struct kvm_vcpu *vcpu) { nonpaging_free(vcpu); @@ -522,37 +1015,40 @@ static void paging_free(struct kvm_vcpu *vcpu) #include "paging_tmpl.h" #undef PTTYPE -static int paging64_init_context(struct kvm_vcpu *vcpu) +static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) { struct kvm_mmu *context = &vcpu->mmu; ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; context->page_fault = paging64_page_fault; - context->inval_page = paging_inval_page; context->gva_to_gpa = paging64_gva_to_gpa; context->free = paging_free; - context->root_level = PT64_ROOT_LEVEL; - context->shadow_root_level = PT64_ROOT_LEVEL; - context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); + context->root_level = level; + context->shadow_root_level = level; + mmu_alloc_roots(vcpu); ASSERT(VALID_PAGE(context->root_hpa)); kvm_arch_ops->set_cr3(vcpu, context->root_hpa | (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); return 0; } +static int paging64_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); +} + static int paging32_init_context(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->mmu; context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; - context->inval_page = paging_inval_page; context->gva_to_gpa = paging32_gva_to_gpa; context->free = paging_free; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); + mmu_alloc_roots(vcpu); ASSERT(VALID_PAGE(context->root_hpa)); kvm_arch_ops->set_cr3(vcpu, context->root_hpa | (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); @@ -561,14 +1057,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) static int paging32E_init_context(struct kvm_vcpu *vcpu) { - int ret; - - if ((ret = paging64_init_context(vcpu))) - return ret; - - vcpu->mmu.root_level = PT32E_ROOT_LEVEL; - vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL; - return 0; + return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); } static int init_kvm_mmu(struct kvm_vcpu *vcpu) @@ -597,41 +1086,161 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { + int r; + destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); + r = init_kvm_mmu(vcpu); + if (r < 0) + goto out; + r = mmu_topup_memory_caches(vcpu); +out: + return r; } -static void free_mmu_pages(struct kvm_vcpu *vcpu) +void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) { - while (!list_empty(&vcpu->free_pages)) { + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm_mmu_page *page; + struct kvm_mmu_page *child; + struct hlist_node *node, *n; + struct hlist_head *bucket; + unsigned index; + u64 *spte; + u64 pte; + unsigned offset = offset_in_page(gpa); + unsigned pte_size; + unsigned page_offset; + unsigned misaligned; + int level; + int flooded = 0; + + pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + if (gfn == vcpu->last_pt_write_gfn) { + ++vcpu->last_pt_write_count; + if (vcpu->last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->last_pt_write_gfn = gfn; + vcpu->last_pt_write_count = 1; + } + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry_safe(page, node, n, bucket, hash_link) { + if (page->gfn != gfn || page->role.metaphysical) + continue; + pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + if (misaligned || flooded) { + /* + * Misaligned accesses are too much trouble to fix + * up; also, they usually indicate a page is not used + * as a page table. + * + * If we're seeing too many writes to a page, + * it may no longer be a page table, or we may be + * forking, in which case it is better to unmap the + * page. + */ + pgprintk("misaligned: gpa %llx bytes %d role %x\n", + gpa, bytes, page->role.word); + kvm_mmu_zap_page(vcpu, page); + continue; + } + page_offset = offset; + level = page->role.level; + if (page->role.glevels == PT32_ROOT_LEVEL) { + page_offset <<= 1; /* 32->64 */ + page_offset &= ~PAGE_MASK; + } + spte = __va(page->page_hpa); + spte += page_offset / sizeof(*spte); + pte = *spte; + if (is_present_pte(pte)) { + if (level == PT_PAGE_TABLE_LEVEL) + rmap_remove(vcpu, spte); + else { + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(vcpu, child, spte); + } + } + *spte = 0; + } +} + +void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) +{ +} + +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT); +} + +void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) { struct kvm_mmu_page *page; + page = container_of(vcpu->kvm->active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu, page); + } +} +EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages); + +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *page; + + while (!list_empty(&vcpu->kvm->active_mmu_pages)) { + page = container_of(vcpu->kvm->active_mmu_pages.next, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu, page); + } + while (!list_empty(&vcpu->free_pages)) { page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); list_del(&page->link); __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); page->page_hpa = INVALID_PAGE; } + free_page((unsigned long)vcpu->mmu.pae_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) { + struct page *page; int i; ASSERT(vcpu); for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { - struct page *page; struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i]; INIT_LIST_HEAD(&page_header->link); - if ((page = alloc_page(GFP_KVM_MMU)) == NULL) + if ((page = alloc_page(GFP_KERNEL)) == NULL) goto error_1; page->private = (unsigned long)page_header; page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; memset(__va(page_header->page_hpa), 0, PAGE_SIZE); list_add(&page_header->link, &vcpu->free_pages); + ++vcpu->kvm->n_free_mmu_pages; } + + /* + * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. + * Therefore we need to allocate shadow page tables in the first + * 4GB of memory, which happens to fit the DMA32 zone. + */ + page = alloc_page(GFP_KERNEL | __GFP_DMA32); + if (!page) + goto error_1; + vcpu->mmu.pae_root = page_address(page); + for (i = 0; i < 4; ++i) + vcpu->mmu.pae_root[i] = INVALID_PAGE; + return 0; error_1: @@ -663,10 +1272,12 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) destroy_kvm_mmu(vcpu); free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot) { + struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *page; list_for_each_entry(page, &kvm->active_mmu_pages, link) { @@ -679,8 +1290,169 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) pt = __va(page->page_hpa); for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ - if (pt[i] & PT_WRITABLE_MASK) + if (pt[i] & PT_WRITABLE_MASK) { + rmap_remove(vcpu, &pt[i]); pt[i] &= ~PT_WRITABLE_MASK; + } + } +} + +#ifdef AUDIT + +static const char *audit_msg; + +static gva_t canonicalize(gva_t gva) +{ +#ifdef CONFIG_X86_64 + gva = (long long)(gva << 16) >> 16; +#endif + return gva; +} +static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, + gva_t va, int level) +{ + u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); + int i; + gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { + u64 ent = pt[i]; + + if (!ent & PT_PRESENT_MASK) + continue; + + va = canonicalize(va); + if (level > 1) + audit_mappings_page(vcpu, ent, va, level - 1); + else { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); + hpa_t hpa = gpa_to_hpa(vcpu, gpa); + + if ((ent & PT_PRESENT_MASK) + && (ent & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "audit error: (%s) levels %d" + " gva %lx gpa %llx hpa %llx ent %llx\n", + audit_msg, vcpu->mmu.root_level, + va, gpa, hpa, ent); + } } } + +static void audit_mappings(struct kvm_vcpu *vcpu) +{ + int i; + + if (vcpu->mmu.root_level == 4) + audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); + else + for (i = 0; i < 4; ++i) + if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK) + audit_mappings_page(vcpu, + vcpu->mmu.pae_root[i], + i << 30, + 2); +} + +static int count_rmaps(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + int i, j, k; + + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_rmap_desc *d; + + for (j = 0; j < m->npages; ++j) { + struct page *page = m->phys_mem[j]; + + if (!page->private) + continue; + if (!(page->private & 1)) { + ++nmaps; + continue; + } + d = (struct kvm_rmap_desc *)(page->private & ~1ul); + while (d) { + for (k = 0; k < RMAP_EXT; ++k) + if (d->shadow_ptes[k]) + ++nmaps; + else + break; + d = d->more; + } + } + } + return nmaps; +} + +static int count_writable_mappings(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + struct kvm_mmu_page *page; + int i; + + list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { + u64 *pt = __va(page->page_hpa); + + if (page->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = pt[i]; + + if (!(ent & PT_PRESENT_MASK)) + continue; + if (!(ent & PT_WRITABLE_MASK)) + continue; + ++nmaps; + } + } + return nmaps; +} + +static void audit_rmap(struct kvm_vcpu *vcpu) +{ + int n_rmap = count_rmaps(vcpu); + int n_actual = count_writable_mappings(vcpu); + + if (n_rmap != n_actual) + printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", + __FUNCTION__, audit_msg, n_rmap, n_actual); +} + +static void audit_write_protection(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *page; + + list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { + hfn_t hfn; + struct page *pg; + + if (page->role.metaphysical) + continue; + + hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT) + >> PAGE_SHIFT; + pg = pfn_to_page(hfn); + if (pg->private) + printk(KERN_ERR "%s: (%s) shadow page has writable" + " mappings: gfn %lx role %x\n", + __FUNCTION__, audit_msg, page->gfn, + page->role.word); + } +} + +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +{ + int olddbg = dbg; + + dbg = 0; + audit_msg = msg; + audit_rmap(vcpu); + audit_write_protection(vcpu); + audit_mappings(vcpu); + dbg = olddbg; +} + +#endif diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 09bb9b4ed12..2dbf4307ed9 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -32,6 +32,11 @@ #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK + #ifdef CONFIG_X86_64 + #define PT_MAX_FULL_LEVELS 4 + #else + #define PT_MAX_FULL_LEVELS 2 + #endif #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 @@ -42,6 +47,7 @@ #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK + #define PT_MAX_FULL_LEVELS 2 #else #error Invalid PTTYPE value #endif @@ -52,93 +58,126 @@ */ struct guest_walker { int level; + gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t *table; + pt_element_t *ptep; pt_element_t inherited_ar; + gfn_t gfn; }; -static void FNAME(init_walker)(struct guest_walker *walker, - struct kvm_vcpu *vcpu) +/* + * Fetch a guest pte for a guest virtual address + */ +static void FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr) { hpa_t hpa; struct kvm_memory_slot *slot; + pt_element_t *ptep; + pt_element_t root; + gfn_t table_gfn; + pgprintk("%s: addr %lx\n", __FUNCTION__, addr); walker->level = vcpu->mmu.root_level; - slot = gfn_to_memslot(vcpu->kvm, - (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK); + walker->table = NULL; + root = vcpu->cr3; +#if PTTYPE == 64 + if (!is_long_mode(vcpu)) { + walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; + root = *walker->ptep; + if (!(root & PT_PRESENT_MASK)) + return; + --walker->level; + } +#endif + table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + walker->table_gfn[walker->level - 1] = table_gfn; + pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, + walker->level - 1, table_gfn); + slot = gfn_to_memslot(vcpu->kvm, table_gfn); + hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0); - walker->table = (pt_element_t *)( (unsigned long)walker->table | - (unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) ); walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; + + for (;;) { + int index = PT_INDEX(addr, walker->level); + hpa_t paddr; + + ptep = &walker->table[index]; + ASSERT(((unsigned long)walker->table & PAGE_MASK) == + ((unsigned long)ptep & PAGE_MASK)); + + if (is_present_pte(*ptep) && !(*ptep & PT_ACCESSED_MASK)) + *ptep |= PT_ACCESSED_MASK; + + if (!is_present_pte(*ptep)) + break; + + if (walker->level == PT_PAGE_TABLE_LEVEL) { + walker->gfn = (*ptep & PT_BASE_ADDR_MASK) + >> PAGE_SHIFT; + break; + } + + if (walker->level == PT_DIRECTORY_LEVEL + && (*ptep & PT_PAGE_SIZE_MASK) + && (PTTYPE == 64 || is_pse(vcpu))) { + walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK) + >> PAGE_SHIFT; + walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); + break; + } + + if (walker->level != 3 || is_long_mode(vcpu)) + walker->inherited_ar &= walker->table[index]; + table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; + paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); + kunmap_atomic(walker->table, KM_USER0); + walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), + KM_USER0); + --walker->level; + walker->table_gfn[walker->level - 1 ] = table_gfn; + pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, + walker->level - 1, table_gfn); + } + walker->ptep = ptep; + pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); } static void FNAME(release_walker)(struct guest_walker *walker) { - kunmap_atomic(walker->table, KM_USER0); + if (walker->table) + kunmap_atomic(walker->table, KM_USER0); } static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, - u64 *shadow_pte, u64 access_bits) + u64 *shadow_pte, u64 access_bits, gfn_t gfn) { ASSERT(*shadow_pte == 0); access_bits &= guest_pte; *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, - guest_pte & PT_DIRTY_MASK, access_bits); + guest_pte & PT_DIRTY_MASK, access_bits, gfn); } static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, - u64 *shadow_pte, u64 access_bits, - int index) + u64 *shadow_pte, u64 access_bits, gfn_t gfn) { gpa_t gaddr; ASSERT(*shadow_pte == 0); access_bits &= guest_pde; - gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index; + gaddr = (gpa_t)gfn << PAGE_SHIFT; if (PTTYPE == 32 && is_cpuid_PSE36()) gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT); *shadow_pte = guest_pde & PT_PTE_COPY_MASK; set_pte_common(vcpu, shadow_pte, gaddr, - guest_pde & PT_DIRTY_MASK, access_bits); -} - -/* - * Fetch a guest pte from a specific level in the paging hierarchy. - */ -static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, - int level, - gva_t addr) -{ - - ASSERT(level > 0 && level <= walker->level); - - for (;;) { - int index = PT_INDEX(addr, walker->level); - hpa_t paddr; - - ASSERT(((unsigned long)walker->table & PAGE_MASK) == - ((unsigned long)&walker->table[index] & PAGE_MASK)); - if (level == walker->level || - !is_present_pte(walker->table[index]) || - (walker->level == PT_DIRECTORY_LEVEL && - (walker->table[index] & PT_PAGE_SIZE_MASK) && - (PTTYPE == 64 || is_pse(vcpu)))) - return &walker->table[index]; - if (walker->level != 3 || is_long_mode(vcpu)) - walker->inherited_ar &= walker->table[index]; - paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK); - kunmap_atomic(walker->table, KM_USER0); - walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), - KM_USER0); - --walker->level; - } + guest_pde & PT_DIRTY_MASK, access_bits, gfn); } /* @@ -150,15 +189,26 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, hpa_t shadow_addr; int level; u64 *prev_shadow_ent = NULL; + pt_element_t *guest_ent = walker->ptep; + + if (!is_present_pte(*guest_ent)) + return NULL; shadow_addr = vcpu->mmu.root_hpa; level = vcpu->mmu.shadow_root_level; + if (level == PT32E_ROOT_LEVEL) { + shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3]; + shadow_addr &= PT64_BASE_ADDR_MASK; + --level; + } for (; ; level--) { u32 index = SHADOW_PT_INDEX(addr, level); u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; - pt_element_t *guest_ent; + struct kvm_mmu_page *shadow_page; u64 shadow_pte; + int metaphysical; + gfn_t table_gfn; if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { if (level == PT_PAGE_TABLE_LEVEL) @@ -168,21 +218,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, continue; } - if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) { - ASSERT(level == PT32E_ROOT_LEVEL); - guest_ent = FNAME(fetch_guest)(vcpu, walker, - PT32_ROOT_LEVEL, addr); - } else - guest_ent = FNAME(fetch_guest)(vcpu, walker, - level, addr); - - if (!is_present_pte(*guest_ent)) - return NULL; - - /* Don't set accessed bit on PAE PDPTRs */ - if (vcpu->mmu.root_level != 3 || walker->level != 3) - *guest_ent |= PT_ACCESSED_MASK; - if (level == PT_PAGE_TABLE_LEVEL) { if (walker->level == PT_DIRECTORY_LEVEL) { @@ -190,21 +225,30 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, *prev_shadow_ent |= PT_SHADOW_PS_MARK; FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar, - PT_INDEX(addr, PT_PAGE_TABLE_LEVEL)); + walker->gfn); } else { ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); - FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar); + FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, + walker->inherited_ar, + walker->gfn); } return shadow_ent; } - shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent); - if (!VALID_PAGE(shadow_addr)) - return ERR_PTR(-ENOMEM); - shadow_pte = shadow_addr | PT_PRESENT_MASK; - if (vcpu->mmu.root_level > 3 || level != 3) - shadow_pte |= PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; + if (level - 1 == PT_PAGE_TABLE_LEVEL + && walker->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + table_gfn = (*guest_ent & PT_BASE_ADDR_MASK) + >> PAGE_SHIFT; + } else { + metaphysical = 0; + table_gfn = walker->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + metaphysical, shadow_ent); + shadow_addr = shadow_page->page_hpa; + shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; *shadow_ent = shadow_pte; prev_shadow_ent = shadow_ent; } @@ -221,11 +265,13 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, u64 *shadow_ent, struct guest_walker *walker, gva_t addr, - int user) + int user, + int *write_pt) { pt_element_t *guest_ent; int writable_shadow; gfn_t gfn; + struct kvm_mmu_page *page; if (is_writeble_pte(*shadow_ent)) return 0; @@ -250,17 +296,35 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, *shadow_ent &= ~PT_USER_MASK; } - guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr); + guest_ent = walker->ptep; if (!is_present_pte(*guest_ent)) { *shadow_ent = 0; return 0; } - gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + gfn = walker->gfn; + + if (user) { + /* + * Usermode page faults won't be for page table updates. + */ + while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { + pgprintk("%s: zap %lx %x\n", + __FUNCTION__, gfn, page->role.word); + kvm_mmu_zap_page(vcpu, page); + } + } else if (kvm_mmu_lookup_page(vcpu, gfn)) { + pgprintk("%s: found shadow page for %lx, marking ro\n", + __FUNCTION__, gfn); + *guest_ent |= PT_DIRTY_MASK; + *write_pt = 1; + return 0; + } mark_page_dirty(vcpu->kvm, gfn); *shadow_ent |= PT_WRITABLE_MASK; *guest_ent |= PT_DIRTY_MASK; + rmap_add(vcpu, shadow_ent); return 1; } @@ -276,7 +340,8 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, * - normal guest page fault due to the guest pte marked not present, not * writable, or not executable * - * Returns: 1 if we need to emulate the instruction, 0 otherwise + * Returns: 1 if we need to emulate the instruction, 0 otherwise, or + * a negative value on error. */ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code) @@ -287,39 +352,47 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker walker; u64 *shadow_pte; int fixed; + int write_pt = 0; + int r; + + pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); + kvm_mmu_audit(vcpu, "pre page fault"); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; /* * Look up the shadow pte for the faulting address. */ - for (;;) { - FNAME(init_walker)(&walker, vcpu); - shadow_pte = FNAME(fetch)(vcpu, addr, &walker); - if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */ - nonpaging_flush(vcpu); - FNAME(release_walker)(&walker); - continue; - } - break; - } + FNAME(walk_addr)(&walker, vcpu, addr); + shadow_pte = FNAME(fetch)(vcpu, addr, &walker); /* * The page is not mapped by the guest. Let the guest handle it. */ if (!shadow_pte) { + pgprintk("%s: not mapped\n", __FUNCTION__); inject_page_fault(vcpu, addr, error_code); FNAME(release_walker)(&walker); return 0; } + pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, + shadow_pte, *shadow_pte); + /* * Update the shadow pte. */ if (write_fault) fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, - user_fault); + user_fault, &write_pt); else fixed = fix_read_pf(shadow_pte); + pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__, + shadow_pte, *shadow_pte); + FNAME(release_walker)(&walker); /* @@ -331,20 +404,23 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pgprintk("%s: io work, no access\n", __FUNCTION__); inject_page_fault(vcpu, addr, error_code | PFERR_PRESENT_MASK); + kvm_mmu_audit(vcpu, "post page fault (io)"); return 0; } /* * pte not present, guest page fault. */ - if (pte_present && !fixed) { + if (pte_present && !fixed && !write_pt) { inject_page_fault(vcpu, addr, error_code); + kvm_mmu_audit(vcpu, "post page fault (guest)"); return 0; } ++kvm_stat.pf_fixed; + kvm_mmu_audit(vcpu, "post page fault (fixed)"); - return 0; + return write_pt; } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) @@ -353,9 +429,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) pt_element_t guest_pte; gpa_t gpa; - FNAME(init_walker)(&walker, vcpu); - guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL, - vaddr); + FNAME(walk_addr)(&walker, vcpu, vaddr); + guest_pte = *walker.ptep; FNAME(release_walker)(&walker); if (!is_present_pte(guest_pte)) @@ -389,3 +464,4 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) #undef PT_PTE_COPY_MASK #undef PT_NON_PTE_COPY_MASK #undef PT_DIR_BASE_ADDR_MASK +#undef PT_MAX_FULL_LEVELS diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index fa042873571..ccc06b1b91b 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -235,6 +235,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip; vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; + + vcpu->interrupt_window_open = 1; } static int has_svm(void) @@ -495,7 +497,6 @@ static void init_vmcb(struct vmcb *vmcb) /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_HLT) | - (1ULL << INTERCEPT_INVLPG) | (1ULL << INTERCEPT_INVLPGA) | (1ULL << INTERCEPT_IOIO_PROT) | (1ULL << INTERCEPT_MSR_PROT) | @@ -700,6 +701,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) vcpu->svm->vmcb->save.gdtr.base = dt->base ; } +static void svm_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu) +{ +} + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { #ifdef CONFIG_X86_64 @@ -847,6 +852,7 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u64 fault_address; u32 error_code; enum emulation_result er; + int r; if (is_external_interrupt(exit_int_info)) push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); @@ -855,7 +861,12 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) fault_address = vcpu->svm->vmcb->control.exit_info_2; error_code = vcpu->svm->vmcb->control.exit_info_1; - if (!vcpu->mmu.page_fault(vcpu, fault_address, error_code)) { + r = kvm_mmu_page_fault(vcpu, fault_address, error_code); + if (r < 0) { + spin_unlock(&vcpu->kvm->lock); + return r; + } + if (!r) { spin_unlock(&vcpu->kvm->lock); return 1; } @@ -1031,10 +1042,11 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; skip_emulated_instruction(vcpu); - if (vcpu->irq_summary && (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)) + if (vcpu->irq_summary) return 1; kvm_run->exit_reason = KVM_EXIT_HLT; + ++kvm_stat.halt_exits; return 0; } @@ -1186,6 +1198,23 @@ static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return rdmsr_interception(vcpu, kvm_run); } +static int interrupt_window_interception(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + /* + * If the user space waits to inject interrupts, exit as soon as + * possible + */ + if (kvm_run->request_interrupt_window && + !vcpu->irq_summary) { + ++kvm_stat.irq_window_exits; + kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + return 0; + } + + return 1; +} + static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) = { [SVM_EXIT_READ_CR0] = emulate_on_interception, @@ -1210,6 +1239,7 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, [SVM_EXIT_NMI] = nop_on_interception, [SVM_EXIT_SMI] = nop_on_interception, [SVM_EXIT_INIT] = nop_on_interception, + [SVM_EXIT_VINTR] = interrupt_window_interception, /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_HLT] = halt_interception, @@ -1278,15 +1308,11 @@ static void pre_svm_run(struct kvm_vcpu *vcpu) } -static inline void kvm_try_inject_irq(struct kvm_vcpu *vcpu) +static inline void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { struct vmcb_control_area *control; - if (!vcpu->irq_summary) - return; - control = &vcpu->svm->vmcb->control; - control->int_vector = pop_irq(vcpu); control->int_ctl &= ~V_INTR_PRIO_MASK; control->int_ctl |= V_IRQ_MASK | @@ -1301,6 +1327,59 @@ static void kvm_reput_irq(struct kvm_vcpu *vcpu) control->int_ctl &= ~V_IRQ_MASK; push_irq(vcpu, control->int_vector); } + + vcpu->interrupt_window_open = + !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); +} + +static void do_interrupt_requests(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + struct vmcb_control_area *control = &vcpu->svm->vmcb->control; + + vcpu->interrupt_window_open = + (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && + (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); + + if (vcpu->interrupt_window_open && vcpu->irq_summary) + /* + * If interrupts enabled, and not blocked by sti or mov ss. Good. + */ + kvm_do_inject_irq(vcpu); + + /* + * Interrupts blocked. Wait for unblock. + */ + if (!vcpu->interrupt_window_open && + (vcpu->irq_summary || kvm_run->request_interrupt_window)) { + control->intercept |= 1ULL << INTERCEPT_VINTR; + } else + control->intercept &= ~(1ULL << INTERCEPT_VINTR); +} + +static void post_kvm_run_save(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && + vcpu->irq_summary == 0); + kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = vcpu->cr8; + kvm_run->apic_base = vcpu->apic_base; +} + +/* + * Check if userspace requested an interrupt window, and that the + * interrupt window is open. + * + * No need to exit to userspace if we already have an interrupt queued. + */ +static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + return (!vcpu->irq_summary && + kvm_run->request_interrupt_window && + vcpu->interrupt_window_open && + (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); } static void save_db_regs(unsigned long *db_regs) @@ -1324,9 +1403,10 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 fs_selector; u16 gs_selector; u16 ldt_selector; + int r; again: - kvm_try_inject_irq(vcpu); + do_interrupt_requests(vcpu, kvm_run); clgi(); @@ -1487,18 +1567,28 @@ again: if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; + post_kvm_run_save(vcpu, kvm_run); return 0; } - if (handle_exit(vcpu, kvm_run)) { + r = handle_exit(vcpu, kvm_run); + if (r > 0) { if (signal_pending(current)) { ++kvm_stat.signal_exits; + post_kvm_run_save(vcpu, kvm_run); + return -EINTR; + } + + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + ++kvm_stat.request_irq_exits; + post_kvm_run_save(vcpu, kvm_run); return -EINTR; } kvm_resched(vcpu); goto again; } - return 0; + post_kvm_run_save(vcpu, kvm_run); + return r; } static void svm_flush_tlb(struct kvm_vcpu *vcpu) @@ -1565,6 +1655,7 @@ static struct kvm_arch_ops svm_arch_ops = { .get_segment = svm_get_segment, .set_segment = svm_set_segment, .get_cs_db_l_bits = svm_get_cs_db_l_bits, + .decache_cr0_cr4_guest_bits = svm_decache_cr0_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr0_no_modeswitch = svm_set_cr0, .set_cr3 = svm_set_cr3, diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index d0a2c2d5342..d4701cb4c65 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -116,7 +116,7 @@ static void vmcs_clear(struct vmcs *vmcs) static void __vcpu_clear(void *arg) { struct kvm_vcpu *vcpu = arg; - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); if (vcpu->cpu == cpu) vmcs_clear(vcpu->vmcs); @@ -152,15 +152,21 @@ static u64 vmcs_read64(unsigned long field) #endif } +static noinline void vmwrite_error(unsigned long field, unsigned long value) +{ + printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); + dump_stack(); +} + static void vmcs_writel(unsigned long field, unsigned long value) { u8 error; asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" : "=q"(error) : "a"(value), "d"(field) : "cc" ); - if (error) - printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); + if (unlikely(error)) + vmwrite_error(field, value); } static void vmcs_write16(unsigned long field, u16 value) @@ -263,6 +269,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) if (interruptibility & 3) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3); + vcpu->interrupt_window_open = 1; } static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) @@ -541,7 +548,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) static struct vmcs *alloc_vmcs(void) { - return alloc_vmcs_cpu(smp_processor_id()); + return alloc_vmcs_cpu(raw_smp_processor_id()); } static void free_vmcs(struct vmcs *vmcs) @@ -736,6 +743,15 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif +static void vmx_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu) +{ + vcpu->cr0 &= KVM_GUEST_CR0_MASK; + vcpu->cr0 |= vmcs_readl(GUEST_CR0) & ~KVM_GUEST_CR0_MASK; + + vcpu->cr4 &= KVM_GUEST_CR4_MASK; + vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; +} + static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) @@ -1011,8 +1027,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, 0xfff0); vmcs_writel(GUEST_RSP, 0); - vmcs_writel(GUEST_CR3, 0); - //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 vmcs_writel(GUEST_DR7, 0x400); @@ -1049,7 +1063,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ - | CPU_BASED_INVDPG_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ ); @@ -1094,14 +1107,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) rdmsrl(MSR_IA32_SYSENTER_EIP, a); vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ - ret = -ENOMEM; - vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vcpu->guest_msrs) - goto out; - vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vcpu->host_msrs) - goto out_free_guest_msrs; - for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; @@ -1155,8 +1160,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) return 0; -out_free_guest_msrs: - kfree(vcpu->guest_msrs); out: return ret; } @@ -1224,21 +1227,34 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } -static void kvm_try_inject_irq(struct kvm_vcpu *vcpu) + +static void do_interrupt_requests(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) { - if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) - && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0) + u32 cpu_based_vm_exec_control; + + vcpu->interrupt_window_open = + ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); + + if (vcpu->interrupt_window_open && + vcpu->irq_summary && + !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) /* - * Interrupts enabled, and not blocked by sti or mov ss. Good. + * If interrupts enabled, and not blocked by sti or mov ss. Good. */ kvm_do_inject_irq(vcpu); - else + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + if (!vcpu->interrupt_window_open && + (vcpu->irq_summary || kvm_run->request_interrupt_window)) /* * Interrupts blocked. Wait for unblock. */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) - | CPU_BASED_VIRTUAL_INTR_PENDING); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; + else + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) @@ -1277,6 +1293,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long cr2, rip; u32 vect_info; enum emulation_result er; + int r; vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); intr_info = vmcs_read32(VM_EXIT_INTR_INFO); @@ -1305,7 +1322,12 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) cr2 = vmcs_readl(EXIT_QUALIFICATION); spin_lock(&vcpu->kvm->lock); - if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) { + r = kvm_mmu_page_fault(vcpu, cr2, error_code); + if (r < 0) { + spin_unlock(&vcpu->kvm->lock); + return r; + } + if (!r) { spin_unlock(&vcpu->kvm->lock); return 1; } @@ -1425,17 +1447,6 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } -static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - u64 address = vmcs_read64(EXIT_QUALIFICATION); - int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - spin_lock(&vcpu->kvm->lock); - vcpu->mmu.inval_page(vcpu, address); - spin_unlock(&vcpu->kvm->lock); - vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length); - return 1; -} - static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u64 exit_qualification; @@ -1575,23 +1586,40 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static void post_kvm_run_save(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = vcpu->cr8; + kvm_run->apic_base = vcpu->apic_base; + kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && + vcpu->irq_summary == 0); +} + static int handle_interrupt_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - /* Turn off interrupt window reporting. */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) - & ~CPU_BASED_VIRTUAL_INTR_PENDING); + /* + * If the user space waits to inject interrupts, exit as soon as + * possible + */ + if (kvm_run->request_interrupt_window && + !vcpu->irq_summary) { + kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + ++kvm_stat.irq_window_exits; + return 0; + } return 1; } static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); - if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) + if (vcpu->irq_summary) return 1; kvm_run->exit_reason = KVM_EXIT_HLT; + ++kvm_stat.halt_exits; return 0; } @@ -1605,7 +1633,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_IO_INSTRUCTION] = handle_io, - [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_CR_ACCESS] = handle_cr, [EXIT_REASON_DR_ACCESS] = handle_dr, [EXIT_REASON_CPUID] = handle_cpuid, @@ -1642,11 +1669,27 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } +/* + * Check if userspace requested an interrupt window, and that the + * interrupt window is open. + * + * No need to exit to userspace if we already have an interrupt queued. + */ +static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + return (!vcpu->irq_summary && + kvm_run->request_interrupt_window && + vcpu->interrupt_window_open && + (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); +} + static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u8 fail; u16 fs_sel, gs_sel, ldt_sel; int fs_gs_ldt_reload_needed; + int r; again: /* @@ -1673,9 +1716,7 @@ again: vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); #endif - if (vcpu->irq_summary && - !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) - kvm_try_inject_irq(vcpu); + do_interrupt_requests(vcpu, kvm_run); if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); @@ -1812,6 +1853,7 @@ again: fx_save(vcpu->guest_fx_image); fx_restore(vcpu->host_fx_image); + vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; #ifndef CONFIG_X86_64 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); @@ -1821,6 +1863,7 @@ again: if (fail) { kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + r = 0; } else { if (fs_gs_ldt_reload_needed) { load_ldt(ldt_sel); @@ -1840,17 +1883,28 @@ again: } vcpu->launched = 1; kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; - if (kvm_handle_exit(kvm_run, vcpu)) { + r = kvm_handle_exit(kvm_run, vcpu); + if (r > 0) { /* Give scheduler a change to reschedule. */ if (signal_pending(current)) { ++kvm_stat.signal_exits; + post_kvm_run_save(vcpu, kvm_run); + return -EINTR; + } + + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + ++kvm_stat.request_irq_exits; + post_kvm_run_save(vcpu, kvm_run); return -EINTR; } + kvm_resched(vcpu); goto again; } } - return 0; + + post_kvm_run_save(vcpu, kvm_run); + return r; } static void vmx_flush_tlb(struct kvm_vcpu *vcpu) @@ -1906,13 +1960,33 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { struct vmcs *vmcs; + vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vcpu->guest_msrs) + return -ENOMEM; + + vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vcpu->host_msrs) + goto out_free_guest_msrs; + vmcs = alloc_vmcs(); if (!vmcs) - return -ENOMEM; + goto out_free_msrs; + vmcs_clear(vmcs); vcpu->vmcs = vmcs; vcpu->launched = 0; + return 0; + +out_free_msrs: + kfree(vcpu->host_msrs); + vcpu->host_msrs = NULL; + +out_free_guest_msrs: + kfree(vcpu->guest_msrs); + vcpu->guest_msrs = NULL; + + return -ENOMEM; } static struct kvm_arch_ops vmx_arch_ops = { @@ -1936,6 +2010,7 @@ static struct kvm_arch_ops vmx_arch_ops = { .get_segment = vmx_get_segment, .set_segment = vmx_set_segment, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, .set_cr3 = vmx_set_cr3, diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 1bff3e925fd..be70795b482 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -1323,7 +1323,7 @@ twobyte_special_insn: ctxt)) != 0)) goto done; if ((old_lo != _regs[VCPU_REGS_RAX]) - || (old_hi != _regs[VCPU_REGS_RDI])) { + || (old_hi != _regs[VCPU_REGS_RDX])) { _regs[VCPU_REGS_RAX] = old_lo; _regs[VCPU_REGS_RDX] = old_hi; _eflags &= ~EFLG_ZF; diff --git a/drivers/leds/leds-s3c24xx.c b/drivers/leds/leds-s3c24xx.c index fb1edc1c9ed..50914439d86 100644 --- a/drivers/leds/leds-s3c24xx.c +++ b/drivers/leds/leds-s3c24xx.c @@ -16,7 +16,7 @@ #include <linux/platform_device.h> #include <linux/leds.h> -#include <asm/arch/hardware.h> +#include <asm/hardware.h> #include <asm/arch/regs-gpio.h> #include <asm/arch/leds-gpio.h> diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index c8558d4ed50..8ca75e52f63 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -44,6 +44,7 @@ #include <linux/sysdev.h> #include <linux/freezer.h> #include <linux/syscalls.h> +#include <linux/suspend.h> #include <linux/cpu.h> #include <asm/prom.h> #include <asm/machdep.h> diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 45f2b20ef51..fab381ed853 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -193,6 +193,18 @@ static struct pci_dev * pci_find_subsys(unsigned int vendor, struct pci_dev *dev; WARN_ON(in_interrupt()); + + /* + * pci_find_subsys() can be called on the ide_setup() path, super-early + * in boot. But the down_read() will enable local interrupts, which + * can cause some machines to crash. So here we detect and flag that + * situation and bail out early. + */ + if (unlikely(list_empty(&pci_devices))) { + printk(KERN_INFO "pci_find_subsys() called while pci_devices " + "is still empty\n"); + return NULL; + } down_read(&pci_bus_sem); n = from ? from->global_list.next : pci_devices.next; @@ -259,6 +271,18 @@ pci_get_subsys(unsigned int vendor, unsigned int device, struct pci_dev *dev; WARN_ON(in_interrupt()); + + /* + * pci_get_subsys() can potentially be called by drivers super-early + * in boot. But the down_read() will enable local interrupts, which + * can cause some machines to crash. So here we detect and flag that + * situation and bail out early. + */ + if (unlikely(list_empty(&pci_devices))) { + printk(KERN_NOTICE "pci_get_subsys() called while pci_devices " + "is still empty\n"); + return NULL; + } down_read(&pci_bus_sem); n = from ? from->global_list.next : pci_devices.next; diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 4f654c901c6..a724ab49a79 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -33,6 +33,8 @@ #include <asm/mach/time.h> +#include <asm/arch/at91_rtc.h> + #define AT91_RTC_FREQ 1 #define AT91_RTC_EPOCH 1900UL /* just like arch/arm/common/rtctime.c */ diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c index 1460f6b769f..e7851e3739a 100644 --- a/drivers/rtc/rtc-rs5c372.c +++ b/drivers/rtc/rtc-rs5c372.c @@ -1,5 +1,5 @@ /* - * An I2C driver for the Ricoh RS5C372 RTC + * An I2C driver for Ricoh RS5C372 and RV5C38[67] RTCs * * Copyright (C) 2005 Pavel Mironchik <pmironchik@optifacio.net> * Copyright (C) 2006 Tower Technologies @@ -13,7 +13,7 @@ #include <linux/rtc.h> #include <linux/bcd.h> -#define DRV_VERSION "0.3" +#define DRV_VERSION "0.4" /* Addresses to scan */ static unsigned short normal_i2c[] = { /* 0x32,*/ I2C_CLIENT_END }; @@ -21,6 +21,13 @@ static unsigned short normal_i2c[] = { /* 0x32,*/ I2C_CLIENT_END }; /* Insmod parameters */ I2C_CLIENT_INSMOD; + +/* + * Ricoh has a family of I2C based RTCs, which differ only slightly from + * each other. Differences center on pinout (e.g. how many interrupts, + * output clock, etc) and how the control registers are used. The '372 + * is significant only because that's the one this driver first supported. + */ #define RS5C372_REG_SECS 0 #define RS5C372_REG_MINS 1 #define RS5C372_REG_HOURS 2 @@ -29,59 +36,142 @@ I2C_CLIENT_INSMOD; #define RS5C372_REG_MONTH 5 #define RS5C372_REG_YEAR 6 #define RS5C372_REG_TRIM 7 +# define RS5C372_TRIM_XSL 0x80 +# define RS5C372_TRIM_MASK 0x7F + +#define RS5C_REG_ALARM_A_MIN 8 /* or ALARM_W */ +#define RS5C_REG_ALARM_A_HOURS 9 +#define RS5C_REG_ALARM_A_WDAY 10 + +#define RS5C_REG_ALARM_B_MIN 11 /* or ALARM_D */ +#define RS5C_REG_ALARM_B_HOURS 12 +#define RS5C_REG_ALARM_B_WDAY 13 /* (ALARM_B only) */ + +#define RS5C_REG_CTRL1 14 +# define RS5C_CTRL1_AALE (1 << 7) /* or WALE */ +# define RS5C_CTRL1_BALE (1 << 6) /* or DALE */ +# define RV5C387_CTRL1_24 (1 << 5) +# define RS5C372A_CTRL1_SL1 (1 << 5) +# define RS5C_CTRL1_CT_MASK (7 << 0) +# define RS5C_CTRL1_CT0 (0 << 0) /* no periodic irq */ +# define RS5C_CTRL1_CT4 (4 << 0) /* 1 Hz level irq */ +#define RS5C_REG_CTRL2 15 +# define RS5C372_CTRL2_24 (1 << 5) +# define RS5C_CTRL2_XSTP (1 << 4) +# define RS5C_CTRL2_CTFG (1 << 2) +# define RS5C_CTRL2_AAFG (1 << 1) /* or WAFG */ +# define RS5C_CTRL2_BAFG (1 << 0) /* or DAFG */ + + +/* to read (style 1) or write registers starting at R */ +#define RS5C_ADDR(R) (((R) << 4) | 0) + + +enum rtc_type { + rtc_undef = 0, + rtc_rs5c372a, + rtc_rs5c372b, + rtc_rv5c386, + rtc_rv5c387a, +}; -#define RS5C372_TRIM_XSL 0x80 -#define RS5C372_TRIM_MASK 0x7F +/* REVISIT: this assumes that: + * - we're in the 21st century, so it's safe to ignore the century + * bit for rv5c38[67] (REG_MONTH bit 7); + * - we should use ALARM_A not ALARM_B (may be wrong on some boards) + */ +struct rs5c372 { + struct i2c_client *client; + struct rtc_device *rtc; + enum rtc_type type; + unsigned time24:1; + unsigned has_irq:1; + char buf[17]; + char *regs; + + /* on conversion to a "new style" i2c driver, this vanishes */ + struct i2c_client dev; +}; -#define RS5C372_REG_BASE 0 +static int rs5c_get_regs(struct rs5c372 *rs5c) +{ + struct i2c_client *client = rs5c->client; + struct i2c_msg msgs[] = { + { client->addr, I2C_M_RD, sizeof rs5c->buf, rs5c->buf }, + }; + + /* This implements the third reading method from the datasheet, using + * an internal address that's reset after each transaction (by STOP) + * to 0x0f ... so we read extra registers, and skip the first one. + * + * The first method doesn't work with the iop3xx adapter driver, on at + * least 80219 chips; this works around that bug. + */ + if ((i2c_transfer(client->adapter, msgs, 1)) != 1) { + pr_debug("%s: can't read registers\n", rs5c->rtc->name); + return -EIO; + } -static int rs5c372_attach(struct i2c_adapter *adapter); -static int rs5c372_detach(struct i2c_client *client); -static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind); + dev_dbg(&client->dev, + "%02x %02x %02x (%02x) %02x %02x %02x (%02x), " + "%02x %02x %02x, %02x %02x %02x; %02x %02x\n", + rs5c->regs[0], rs5c->regs[1], rs5c->regs[2], rs5c->regs[3], + rs5c->regs[4], rs5c->regs[5], rs5c->regs[6], rs5c->regs[7], + rs5c->regs[8], rs5c->regs[9], rs5c->regs[10], rs5c->regs[11], + rs5c->regs[12], rs5c->regs[13], rs5c->regs[14], rs5c->regs[15]); -struct rs5c372 { - u8 reg_addr; - u8 regs[17]; - struct i2c_msg msg[1]; - struct i2c_client client; - struct rtc_device *rtc; -}; + return 0; +} -static struct i2c_driver rs5c372_driver = { - .driver = { - .name = "rs5c372", - }, - .attach_adapter = &rs5c372_attach, - .detach_client = &rs5c372_detach, -}; +static unsigned rs5c_reg2hr(struct rs5c372 *rs5c, unsigned reg) +{ + unsigned hour; -static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm) + if (rs5c->time24) + return BCD2BIN(reg & 0x3f); + + hour = BCD2BIN(reg & 0x1f); + if (hour == 12) + hour = 0; + if (reg & 0x20) + hour += 12; + return hour; +} + +static unsigned rs5c_hr2reg(struct rs5c372 *rs5c, unsigned hour) { + if (rs5c->time24) + return BIN2BCD(hour); + + if (hour > 12) + return 0x20 | BIN2BCD(hour - 12); + if (hour == 12) + return 0x20 | BIN2BCD(12); + if (hour == 0) + return BIN2BCD(12); + return BIN2BCD(hour); +} - struct rs5c372 *rs5c372 = i2c_get_clientdata(client); - u8 *buf = &(rs5c372->regs[1]); +static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm) +{ + struct rs5c372 *rs5c = i2c_get_clientdata(client); + int status = rs5c_get_regs(rs5c); - /* this implements the 3rd reading method, according - * to the datasheet. rs5c372 defaults to internal - * address 0xF, so 0x0 is in regs[1] - */ + if (status < 0) + return status; - if ((i2c_transfer(client->adapter, rs5c372->msg, 1)) != 1) { - dev_err(&client->dev, "%s: read error\n", __FUNCTION__); - return -EIO; - } + tm->tm_sec = BCD2BIN(rs5c->regs[RS5C372_REG_SECS] & 0x7f); + tm->tm_min = BCD2BIN(rs5c->regs[RS5C372_REG_MINS] & 0x7f); + tm->tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C372_REG_HOURS]); - tm->tm_sec = BCD2BIN(buf[RS5C372_REG_SECS] & 0x7f); - tm->tm_min = BCD2BIN(buf[RS5C372_REG_MINS] & 0x7f); - tm->tm_hour = BCD2BIN(buf[RS5C372_REG_HOURS] & 0x3f); - tm->tm_wday = BCD2BIN(buf[RS5C372_REG_WDAY] & 0x07); - tm->tm_mday = BCD2BIN(buf[RS5C372_REG_DAY] & 0x3f); + tm->tm_wday = BCD2BIN(rs5c->regs[RS5C372_REG_WDAY] & 0x07); + tm->tm_mday = BCD2BIN(rs5c->regs[RS5C372_REG_DAY] & 0x3f); /* tm->tm_mon is zero-based */ - tm->tm_mon = BCD2BIN(buf[RS5C372_REG_MONTH] & 0x1f) - 1; + tm->tm_mon = BCD2BIN(rs5c->regs[RS5C372_REG_MONTH] & 0x1f) - 1; /* year is 1900 + tm->tm_year */ - tm->tm_year = BCD2BIN(buf[RS5C372_REG_YEAR]) + 100; + tm->tm_year = BCD2BIN(rs5c->regs[RS5C372_REG_YEAR]) + 100; dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d, " "mday=%d, mon=%d, year=%d, wday=%d\n", @@ -89,22 +179,25 @@ static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm) tm->tm_sec, tm->tm_min, tm->tm_hour, tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday); - return 0; + /* rtc might need initialization */ + return rtc_valid_tm(tm); } static int rs5c372_set_datetime(struct i2c_client *client, struct rtc_time *tm) { - unsigned char buf[8] = { RS5C372_REG_BASE }; + struct rs5c372 *rs5c = i2c_get_clientdata(client); + unsigned char buf[8]; - dev_dbg(&client->dev, - "%s: secs=%d, mins=%d, hours=%d " + dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d " "mday=%d, mon=%d, year=%d, wday=%d\n", - __FUNCTION__, tm->tm_sec, tm->tm_min, tm->tm_hour, + __FUNCTION__, + tm->tm_sec, tm->tm_min, tm->tm_hour, tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday); + buf[0] = RS5C_ADDR(RS5C372_REG_SECS); buf[1] = BIN2BCD(tm->tm_sec); buf[2] = BIN2BCD(tm->tm_min); - buf[3] = BIN2BCD(tm->tm_hour); + buf[3] = rs5c_hr2reg(rs5c, tm->tm_hour); buf[4] = BIN2BCD(tm->tm_wday); buf[5] = BIN2BCD(tm->tm_mday); buf[6] = BIN2BCD(tm->tm_mon + 1); @@ -118,21 +211,43 @@ static int rs5c372_set_datetime(struct i2c_client *client, struct rtc_time *tm) return 0; } +#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE) +#define NEED_TRIM +#endif + +#if defined(CONFIG_RTC_INTF_SYSFS) || defined(CONFIG_RTC_INTF_SYSFS_MODULE) +#define NEED_TRIM +#endif + +#ifdef NEED_TRIM static int rs5c372_get_trim(struct i2c_client *client, int *osc, int *trim) { struct rs5c372 *rs5c372 = i2c_get_clientdata(client); - u8 tmp = rs5c372->regs[RS5C372_REG_TRIM + 1]; + u8 tmp = rs5c372->regs[RS5C372_REG_TRIM]; if (osc) *osc = (tmp & RS5C372_TRIM_XSL) ? 32000 : 32768; if (trim) { - *trim = tmp & RS5C372_TRIM_MASK; - dev_dbg(&client->dev, "%s: raw trim=%x\n", __FUNCTION__, *trim); + dev_dbg(&client->dev, "%s: raw trim=%x\n", __FUNCTION__, tmp); + tmp &= RS5C372_TRIM_MASK; + if (tmp & 0x3e) { + int t = tmp & 0x3f; + + if (tmp & 0x40) + t = (~t | (s8)0xc0) + 1; + else + t = t - 1; + + tmp = t * 2; + } else + tmp = 0; + *trim = tmp; } return 0; } +#endif static int rs5c372_rtc_read_time(struct device *dev, struct rtc_time *tm) { @@ -144,25 +259,190 @@ static int rs5c372_rtc_set_time(struct device *dev, struct rtc_time *tm) return rs5c372_set_datetime(to_i2c_client(dev), tm); } +#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE) + +static int +rs5c_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) +{ + struct i2c_client *client = to_i2c_client(dev); + struct rs5c372 *rs5c = i2c_get_clientdata(client); + unsigned char buf[2]; + int status; + + buf[1] = rs5c->regs[RS5C_REG_CTRL1]; + switch (cmd) { + case RTC_UIE_OFF: + case RTC_UIE_ON: + /* some 327a modes use a different IRQ pin for 1Hz irqs */ + if (rs5c->type == rtc_rs5c372a + && (buf[1] & RS5C372A_CTRL1_SL1)) + return -ENOIOCTLCMD; + case RTC_AIE_OFF: + case RTC_AIE_ON: + /* these irq management calls only make sense for chips + * which are wired up to an IRQ. + */ + if (!rs5c->has_irq) + return -ENOIOCTLCMD; + break; + default: + return -ENOIOCTLCMD; + } + + status = rs5c_get_regs(rs5c); + if (status < 0) + return status; + + buf[0] = RS5C_ADDR(RS5C_REG_CTRL1); + switch (cmd) { + case RTC_AIE_OFF: /* alarm off */ + buf[1] &= ~RS5C_CTRL1_AALE; + break; + case RTC_AIE_ON: /* alarm on */ + buf[1] |= RS5C_CTRL1_AALE; + break; + case RTC_UIE_OFF: /* update off */ + buf[1] &= ~RS5C_CTRL1_CT_MASK; + break; + case RTC_UIE_ON: /* update on */ + buf[1] &= ~RS5C_CTRL1_CT_MASK; + buf[1] |= RS5C_CTRL1_CT4; + break; + } + if ((i2c_master_send(client, buf, 2)) != 2) { + printk(KERN_WARNING "%s: can't update alarm\n", + rs5c->rtc->name); + status = -EIO; + } else + rs5c->regs[RS5C_REG_CTRL1] = buf[1]; + return status; +} + +#else +#define rs5c_rtc_ioctl NULL +#endif + + +/* NOTE: Since RTC_WKALM_{RD,SET} were originally defined for EFI, + * which only exposes a polled programming interface; and since + * these calls map directly to those EFI requests; we don't demand + * we have an IRQ for this chip when we go through this API. + * + * The older x86_pc derived RTC_ALM_{READ,SET} calls require irqs + * though, managed through RTC_AIE_{ON,OFF} requests. + */ + +static int rs5c_read_alarm(struct device *dev, struct rtc_wkalrm *t) +{ + struct i2c_client *client = to_i2c_client(dev); + struct rs5c372 *rs5c = i2c_get_clientdata(client); + int status; + + status = rs5c_get_regs(rs5c); + if (status < 0) + return status; + + /* report alarm time */ + t->time.tm_sec = 0; + t->time.tm_min = BCD2BIN(rs5c->regs[RS5C_REG_ALARM_A_MIN] & 0x7f); + t->time.tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C_REG_ALARM_A_HOURS]); + t->time.tm_mday = -1; + t->time.tm_mon = -1; + t->time.tm_year = -1; + t->time.tm_wday = -1; + t->time.tm_yday = -1; + t->time.tm_isdst = -1; + + /* ... and status */ + t->enabled = !!(rs5c->regs[RS5C_REG_CTRL1] & RS5C_CTRL1_AALE); + t->pending = !!(rs5c->regs[RS5C_REG_CTRL2] & RS5C_CTRL2_AAFG); + + return 0; +} + +static int rs5c_set_alarm(struct device *dev, struct rtc_wkalrm *t) +{ + struct i2c_client *client = to_i2c_client(dev); + struct rs5c372 *rs5c = i2c_get_clientdata(client); + int status; + unsigned char buf[4]; + + /* only handle up to 24 hours in the future, like RTC_ALM_SET */ + if (t->time.tm_mday != -1 + || t->time.tm_mon != -1 + || t->time.tm_year != -1) + return -EINVAL; + + /* REVISIT: round up tm_sec */ + + /* if needed, disable irq (clears pending status) */ + status = rs5c_get_regs(rs5c); + if (status < 0) + return status; + if (rs5c->regs[RS5C_REG_CTRL1] & RS5C_CTRL1_AALE) { + buf[0] = RS5C_ADDR(RS5C_REG_CTRL1); + buf[1] = rs5c->regs[RS5C_REG_CTRL1] & ~RS5C_CTRL1_AALE; + if (i2c_master_send(client, buf, 2) != 2) { + pr_debug("%s: can't disable alarm\n", rs5c->rtc->name); + return -EIO; + } + rs5c->regs[RS5C_REG_CTRL1] = buf[1]; + } + + /* set alarm */ + buf[0] = RS5C_ADDR(RS5C_REG_ALARM_A_MIN); + buf[1] = BIN2BCD(t->time.tm_min); + buf[2] = rs5c_hr2reg(rs5c, t->time.tm_hour); + buf[3] = 0x7f; /* any/all days */ + if ((i2c_master_send(client, buf, 4)) != 4) { + pr_debug("%s: can't set alarm time\n", rs5c->rtc->name); + return -EIO; + } + + /* ... and maybe enable its irq */ + if (t->enabled) { + buf[0] = RS5C_ADDR(RS5C_REG_CTRL1); + buf[1] = rs5c->regs[RS5C_REG_CTRL1] | RS5C_CTRL1_AALE; + if ((i2c_master_send(client, buf, 2)) != 2) + printk(KERN_WARNING "%s: can't enable alarm\n", + rs5c->rtc->name); + rs5c->regs[RS5C_REG_CTRL1] = buf[1]; + } + + return 0; +} + +#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE) + static int rs5c372_rtc_proc(struct device *dev, struct seq_file *seq) { int err, osc, trim; err = rs5c372_get_trim(to_i2c_client(dev), &osc, &trim); if (err == 0) { - seq_printf(seq, "%d.%03d KHz\n", osc / 1000, osc % 1000); - seq_printf(seq, "trim\t: %d\n", trim); + seq_printf(seq, "crystal\t\t: %d.%03d KHz\n", + osc / 1000, osc % 1000); + seq_printf(seq, "trim\t\t: %d\n", trim); } return 0; } +#else +#define rs5c372_rtc_proc NULL +#endif + static const struct rtc_class_ops rs5c372_rtc_ops = { .proc = rs5c372_rtc_proc, + .ioctl = rs5c_rtc_ioctl, .read_time = rs5c372_rtc_read_time, .set_time = rs5c372_rtc_set_time, + .read_alarm = rs5c_read_alarm, + .set_alarm = rs5c_set_alarm, }; +#if defined(CONFIG_RTC_INTF_SYSFS) || defined(CONFIG_RTC_INTF_SYSFS_MODULE) + static ssize_t rs5c372_sysfs_show_trim(struct device *dev, struct device_attribute *attr, char *buf) { @@ -172,7 +452,7 @@ static ssize_t rs5c372_sysfs_show_trim(struct device *dev, if (err) return err; - return sprintf(buf, "0x%2x\n", trim); + return sprintf(buf, "%d\n", trim); } static DEVICE_ATTR(trim, S_IRUGO, rs5c372_sysfs_show_trim, NULL); @@ -189,16 +469,35 @@ static ssize_t rs5c372_sysfs_show_osc(struct device *dev, } static DEVICE_ATTR(osc, S_IRUGO, rs5c372_sysfs_show_osc, NULL); -static int rs5c372_attach(struct i2c_adapter *adapter) +static int rs5c_sysfs_register(struct device *dev) { - return i2c_probe(adapter, &addr_data, rs5c372_probe); + int err; + + err = device_create_file(dev, &dev_attr_trim); + if (err) + return err; + err = device_create_file(dev, &dev_attr_osc); + if (err) + device_remove_file(dev, &dev_attr_trim); + + return err; +} + +#else +static int rs5c_sysfs_register(struct device *dev) +{ + return 0; } +#endif /* SYSFS */ + +static struct i2c_driver rs5c372_driver; static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind) { int err = 0; struct i2c_client *client; struct rs5c372 *rs5c372; + struct rtc_time tm; dev_dbg(adapter->class_dev.dev, "%s\n", __FUNCTION__); @@ -211,7 +510,15 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind) err = -ENOMEM; goto exit; } - client = &rs5c372->client; + + /* we read registers 0x0f then 0x00-0x0f; skip the first one */ + rs5c372->regs=&rs5c372->buf[1]; + + /* On conversion to a "new style" i2c driver, we'll be handed + * the i2c_client (we won't create it) + */ + client = &rs5c372->dev; + rs5c372->client = client; /* I2C client */ client->addr = address; @@ -222,16 +529,99 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind) i2c_set_clientdata(client, rs5c372); - rs5c372->msg[0].addr = address; - rs5c372->msg[0].flags = I2C_M_RD; - rs5c372->msg[0].len = sizeof(rs5c372->regs); - rs5c372->msg[0].buf = rs5c372->regs; - /* Inform the i2c layer */ if ((err = i2c_attach_client(client))) goto exit_kfree; - dev_info(&client->dev, "chip found, driver version " DRV_VERSION "\n"); + err = rs5c_get_regs(rs5c372); + if (err < 0) + goto exit_detach; + + /* For "new style" drivers, irq is in i2c_client and chip type + * info comes from i2c_client.dev.platform_data. Meanwhile: + * + * STICK BOARD-SPECIFIC SETUP CODE RIGHT HERE + */ + if (rs5c372->type == rtc_undef) { + rs5c372->type = rtc_rs5c372b; + dev_warn(&client->dev, "assuming rs5c372b\n"); + } + + /* clock may be set for am/pm or 24 hr time */ + switch (rs5c372->type) { + case rtc_rs5c372a: + case rtc_rs5c372b: + /* alarm uses ALARM_A; and nINTRA on 372a, nINTR on 372b. + * so does periodic irq, except some 327a modes. + */ + if (rs5c372->regs[RS5C_REG_CTRL2] & RS5C372_CTRL2_24) + rs5c372->time24 = 1; + break; + case rtc_rv5c386: + case rtc_rv5c387a: + if (rs5c372->regs[RS5C_REG_CTRL1] & RV5C387_CTRL1_24) + rs5c372->time24 = 1; + /* alarm uses ALARM_W; and nINTRB for alarm and periodic + * irq, on both 386 and 387 + */ + break; + default: + dev_err(&client->dev, "unknown RTC type\n"); + goto exit_detach; + } + + /* if the oscillator lost power and no other software (like + * the bootloader) set it up, do it here. + */ + if (rs5c372->regs[RS5C_REG_CTRL2] & RS5C_CTRL2_XSTP) { + unsigned char buf[3]; + + rs5c372->regs[RS5C_REG_CTRL2] &= ~RS5C_CTRL2_XSTP; + + buf[0] = RS5C_ADDR(RS5C_REG_CTRL1); + buf[1] = rs5c372->regs[RS5C_REG_CTRL1]; + buf[2] = rs5c372->regs[RS5C_REG_CTRL2]; + + /* use 24hr mode */ + switch (rs5c372->type) { + case rtc_rs5c372a: + case rtc_rs5c372b: + buf[2] |= RS5C372_CTRL2_24; + rs5c372->time24 = 1; + break; + case rtc_rv5c386: + case rtc_rv5c387a: + buf[1] |= RV5C387_CTRL1_24; + rs5c372->time24 = 1; + break; + default: + /* impossible */ + break; + } + + if ((i2c_master_send(client, buf, 3)) != 3) { + dev_err(&client->dev, "setup error\n"); + goto exit_detach; + } + rs5c372->regs[RS5C_REG_CTRL1] = buf[1]; + rs5c372->regs[RS5C_REG_CTRL2] = buf[2]; + } + + if (rs5c372_get_datetime(client, &tm) < 0) + dev_warn(&client->dev, "clock needs to be set\n"); + + dev_info(&client->dev, "%s found, %s, driver version " DRV_VERSION "\n", + ({ char *s; switch (rs5c372->type) { + case rtc_rs5c372a: s = "rs5c372a"; break; + case rtc_rs5c372b: s = "rs5c372b"; break; + case rtc_rv5c386: s = "rv5c386"; break; + case rtc_rv5c387a: s = "rv5c387a"; break; + default: s = "chip"; break; + }; s;}), + rs5c372->time24 ? "24hr" : "am/pm" + ); + + /* FIXME when client->irq exists, use it to register alarm irq */ rs5c372->rtc = rtc_device_register(rs5c372_driver.driver.name, &client->dev, &rs5c372_rtc_ops, THIS_MODULE); @@ -241,18 +631,12 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind) goto exit_detach; } - err = device_create_file(&client->dev, &dev_attr_trim); + err = rs5c_sysfs_register(&client->dev); if (err) goto exit_devreg; - err = device_create_file(&client->dev, &dev_attr_osc); - if (err) - goto exit_trim; return 0; -exit_trim: - device_remove_file(&client->dev, &dev_attr_trim); - exit_devreg: rtc_device_unregister(rs5c372->rtc); @@ -266,6 +650,11 @@ exit: return err; } +static int rs5c372_attach(struct i2c_adapter *adapter) +{ + return i2c_probe(adapter, &addr_data, rs5c372_probe); +} + static int rs5c372_detach(struct i2c_client *client) { int err; @@ -274,6 +663,8 @@ static int rs5c372_detach(struct i2c_client *client) if (rs5c372->rtc) rtc_device_unregister(rs5c372->rtc); + /* REVISIT properly destroy the sysfs files ... */ + if ((err = i2c_detach_client(client))) return err; @@ -281,6 +672,14 @@ static int rs5c372_detach(struct i2c_client *client) return 0; } +static struct i2c_driver rs5c372_driver = { + .driver = { + .name = "rtc-rs5c372", + }, + .attach_adapter = &rs5c372_attach, + .detach_client = &rs5c372_detach, +}; + static __init int rs5c372_init(void) { return i2c_add_driver(&rs5c372_driver); |