diff options
Diffstat (limited to 'drivers')
51 files changed, 11694 insertions, 620 deletions
diff --git a/drivers/Kconfig b/drivers/Kconfig index 4929e923b5c..e7da9fa724e 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -80,4 +80,6 @@ source "drivers/rtc/Kconfig" source "drivers/dma/Kconfig" +source "drivers/kvm/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 50f76da598c..0dd96d1afd3 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_SPI) += spi/ obj-$(CONFIG_PCCARD) += pcmcia/ obj-$(CONFIG_DIO) += dio/ obj-$(CONFIG_SBUS) += sbus/ +obj-$(CONFIG_KVM) += kvm/ obj-$(CONFIG_ZORRO) += zorro/ obj-$(CONFIG_MAC) += macintosh/ obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ diff --git a/drivers/char/ipmi/ipmi_bt_sm.c b/drivers/char/ipmi/ipmi_bt_sm.c index 6c59baa887a..e736119b649 100644 --- a/drivers/char/ipmi/ipmi_bt_sm.c +++ b/drivers/char/ipmi/ipmi_bt_sm.c @@ -37,8 +37,10 @@ #define BT_DEBUG_ENABLE 1 /* Generic messages */ #define BT_DEBUG_MSG 2 /* Prints all request/response buffers */ #define BT_DEBUG_STATES 4 /* Verbose look at state changes */ +/* BT_DEBUG_OFF must be zero to correspond to the default uninitialized + value */ -static int bt_debug = BT_DEBUG_OFF; +static int bt_debug; /* 0 == BT_DEBUG_OFF */ module_param(bt_debug, int, 0644); MODULE_PARM_DESC(bt_debug, "debug bitmask, 1=enable, 2=messages, 4=states"); diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c index e257835a9a7..ff2d052177c 100644 --- a/drivers/char/ipmi/ipmi_devintf.c +++ b/drivers/char/ipmi/ipmi_devintf.c @@ -834,7 +834,7 @@ static const struct file_operations ipmi_fops = { #define DEVICE_NAME "ipmidev" -static int ipmi_major = 0; +static int ipmi_major; module_param(ipmi_major, int, 0); MODULE_PARM_DESC(ipmi_major, "Sets the major number of the IPMI device. By" " default, or if you set it to zero, it will choose the next" diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 5703ee28e1c..4e4691a5389 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -53,10 +53,10 @@ static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void); static int ipmi_init_msghandler(void); -static int initialized = 0; +static int initialized; #ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_ipmi_root = NULL; +static struct proc_dir_entry *proc_ipmi_root; #endif /* CONFIG_PROC_FS */ /* Remain in auto-maintenance mode for this amount of time (in ms). */ @@ -2142,8 +2142,7 @@ cleanup_bmc_device(struct kref *ref) bmc = container_of(ref, struct bmc_device, refcount); remove_files(bmc); - if (bmc->dev) - platform_device_unregister(bmc->dev); + platform_device_unregister(bmc->dev); kfree(bmc); } @@ -2341,8 +2340,7 @@ static int ipmi_bmc_register(ipmi_smi_t intf, int ifnum, while (ipmi_find_bmc_prod_dev_id(&ipmidriver, bmc->id.product_id, - bmc->id.device_id)) - { + bmc->id.device_id)) { if (!warn_printed) { printk(KERN_WARNING PFX "This machine has two different BMCs" @@ -4043,7 +4041,7 @@ static void send_panic_events(char *str) } #endif /* CONFIG_IPMI_PANIC_EVENT */ -static int has_panicked = 0; +static int has_panicked; static int panic_event(struct notifier_block *this, unsigned long event, diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c index 597eb4f88b8..9d23136e598 100644 --- a/drivers/char/ipmi/ipmi_poweroff.c +++ b/drivers/char/ipmi/ipmi_poweroff.c @@ -58,10 +58,10 @@ static int poweroff_powercycle; static int ifnum_to_use = -1; /* Our local state. */ -static int ready = 0; +static int ready; static ipmi_user_t ipmi_user; static int ipmi_ifnum; -static void (*specific_poweroff_func)(ipmi_user_t user) = NULL; +static void (*specific_poweroff_func)(ipmi_user_t user); /* Holds the old poweroff function so we can restore it on removal. */ static void (*old_poweroff_func)(void); @@ -182,7 +182,7 @@ static int ipmi_request_in_rc_mode(ipmi_user_t user, #define IPMI_MOTOROLA_MANUFACTURER_ID 0x0000A1 #define IPMI_MOTOROLA_PPS_IPMC_PRODUCT_ID 0x0051 -static void (*atca_oem_poweroff_hook)(ipmi_user_t user) = NULL; +static void (*atca_oem_poweroff_hook)(ipmi_user_t user); static void pps_poweroff_atca (ipmi_user_t user) { diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 81a0c89598e..f1afd26a509 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -845,7 +845,7 @@ static void request_events(void *send_info) atomic_set(&smi_info->req_events, 1); } -static int initialized = 0; +static int initialized; static void smi_timeout(unsigned long data) { @@ -1018,17 +1018,17 @@ static int num_ports; static int irqs[SI_MAX_PARMS]; static int num_irqs; static int regspacings[SI_MAX_PARMS]; -static int num_regspacings = 0; +static int num_regspacings; static int regsizes[SI_MAX_PARMS]; -static int num_regsizes = 0; +static int num_regsizes; static int regshifts[SI_MAX_PARMS]; -static int num_regshifts = 0; +static int num_regshifts; static int slave_addrs[SI_MAX_PARMS]; -static int num_slave_addrs = 0; +static int num_slave_addrs; #define IPMI_IO_ADDR_SPACE 0 #define IPMI_MEM_ADDR_SPACE 1 -static char *addr_space_to_str[] = { "I/O", "mem" }; +static char *addr_space_to_str[] = { "i/o", "mem" }; static int hotmod_handler(const char *val, struct kernel_param *kp); @@ -1397,20 +1397,7 @@ static struct hotmod_vals hotmod_as[] = { { "i/o", IPMI_IO_ADDR_SPACE }, { NULL } }; -static int ipmi_strcasecmp(const char *s1, const char *s2) -{ - while (*s1 || *s2) { - if (!*s1) - return -1; - if (!*s2) - return 1; - if (*s1 != *s2) - return *s1 - *s2; - s1++; - s2++; - } - return 0; -} + static int parse_str(struct hotmod_vals *v, int *val, char *name, char **curr) { char *s; @@ -1424,7 +1411,7 @@ static int parse_str(struct hotmod_vals *v, int *val, char *name, char **curr) *s = '\0'; s++; for (i = 0; hotmod_ops[i].name; i++) { - if (ipmi_strcasecmp(*curr, v[i].name) == 0) { + if (strcmp(*curr, v[i].name) == 0) { *val = v[i].val; *curr = s; return 0; @@ -1435,10 +1422,34 @@ static int parse_str(struct hotmod_vals *v, int *val, char *name, char **curr) return -EINVAL; } +static int check_hotmod_int_op(const char *curr, const char *option, + const char *name, int *val) +{ + char *n; + + if (strcmp(curr, name) == 0) { + if (!option) { + printk(KERN_WARNING PFX + "No option given for '%s'\n", + curr); + return -EINVAL; + } + *val = simple_strtoul(option, &n, 0); + if ((*n != '\0') || (*option == '\0')) { + printk(KERN_WARNING PFX + "Bad option given for '%s'\n", + curr); + return -EINVAL; + } + return 1; + } + return 0; +} + static int hotmod_handler(const char *val, struct kernel_param *kp) { char *str = kstrdup(val, GFP_KERNEL); - int rv = -EINVAL; + int rv; char *next, *curr, *s, *n, *o; enum hotmod_op op; enum si_type si_type; @@ -1450,13 +1461,15 @@ static int hotmod_handler(const char *val, struct kernel_param *kp) int irq; int ipmb; int ival; + int len; struct smi_info *info; if (!str) return -ENOMEM; /* Kill any trailing spaces, as we can get a "\n" from echo. */ - ival = strlen(str) - 1; + len = strlen(str); + ival = len - 1; while ((ival >= 0) && isspace(str[ival])) { str[ival] = '\0'; ival--; @@ -1513,35 +1526,37 @@ static int hotmod_handler(const char *val, struct kernel_param *kp) *o = '\0'; o++; } -#define HOTMOD_INT_OPT(name, val) \ - if (ipmi_strcasecmp(curr, name) == 0) { \ - if (!o) { \ - printk(KERN_WARNING PFX \ - "No option given for '%s'\n", \ - curr); \ - goto out; \ - } \ - val = simple_strtoul(o, &n, 0); \ - if ((*n != '\0') || (*o == '\0')) { \ - printk(KERN_WARNING PFX \ - "Bad option given for '%s'\n", \ - curr); \ - goto out; \ - } \ - } - - HOTMOD_INT_OPT("rsp", regspacing) - else HOTMOD_INT_OPT("rsi", regsize) - else HOTMOD_INT_OPT("rsh", regshift) - else HOTMOD_INT_OPT("irq", irq) - else HOTMOD_INT_OPT("ipmb", ipmb) - else { - printk(KERN_WARNING PFX - "Invalid hotmod option '%s'\n", - curr); + rv = check_hotmod_int_op(curr, o, "rsp", ®spacing); + if (rv < 0) goto out; - } -#undef HOTMOD_INT_OPT + else if (rv) + continue; + rv = check_hotmod_int_op(curr, o, "rsi", ®size); + if (rv < 0) + goto out; + else if (rv) + continue; + rv = check_hotmod_int_op(curr, o, "rsh", ®shift); + if (rv < 0) + goto out; + else if (rv) + continue; + rv = check_hotmod_int_op(curr, o, "irq", &irq); + if (rv < 0) + goto out; + else if (rv) + continue; + rv = check_hotmod_int_op(curr, o, "ipmb", &ipmb); + if (rv < 0) + goto out; + else if (rv) + continue; + + rv = -EINVAL; + printk(KERN_WARNING PFX + "Invalid hotmod option '%s'\n", + curr); + goto out; } if (op == HM_ADD) { @@ -1590,6 +1605,7 @@ static int hotmod_handler(const char *val, struct kernel_param *kp) mutex_unlock(&smi_infos_lock); } } + rv = len; out: kfree(str); return rv; @@ -1610,11 +1626,11 @@ static __devinit void hardcode_find_bmc(void) info->addr_source = "hardcoded"; - if (!si_type[i] || ipmi_strcasecmp(si_type[i], "kcs") == 0) { + if (!si_type[i] || strcmp(si_type[i], "kcs") == 0) { info->si_type = SI_KCS; - } else if (ipmi_strcasecmp(si_type[i], "smic") == 0) { + } else if (strcmp(si_type[i], "smic") == 0) { info->si_type = SI_SMIC; - } else if (ipmi_strcasecmp(si_type[i], "bt") == 0) { + } else if (strcmp(si_type[i], "bt") == 0) { info->si_type = SI_BT; } else { printk(KERN_WARNING @@ -1668,7 +1684,7 @@ static __devinit void hardcode_find_bmc(void) /* Once we get an ACPI failure, we don't try any more, because we go through the tables sequentially. Once we don't find a table, there are no more. */ -static int acpi_failure = 0; +static int acpi_failure; /* For GPE-type interrupts. */ static u32 ipmi_acpi_gpe(void *context) @@ -1779,7 +1795,6 @@ struct SPMITable { static __devinit int try_init_acpi(struct SPMITable *spmi) { struct smi_info *info; - char *io_type; u8 addr_space; if (spmi->IPMIlegacy != 1) { @@ -1843,11 +1858,9 @@ static __devinit int try_init_acpi(struct SPMITable *spmi) info->io.regshift = spmi->addr.register_bit_offset; if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) { - io_type = "memory"; info->io_setup = mem_setup; info->io.addr_type = IPMI_IO_ADDR_SPACE; } else if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_IO) { - io_type = "I/O"; info->io_setup = port_setup; info->io.addr_type = IPMI_MEM_ADDR_SPACE; } else { @@ -2773,8 +2786,7 @@ static __devinit int init_ipmi_si(void) #endif #ifdef CONFIG_ACPI - if (si_trydefaults) - acpi_find_bmc(); + acpi_find_bmc(); #endif #ifdef CONFIG_PCI diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 90fb2a54191..78280380a90 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -134,14 +134,14 @@ static int nowayout = WATCHDOG_NOWAYOUT; -static ipmi_user_t watchdog_user = NULL; +static ipmi_user_t watchdog_user; static int watchdog_ifnum; /* Default the timeout to 10 seconds. */ static int timeout = 10; /* The pre-timeout is disabled by default. */ -static int pretimeout = 0; +static int pretimeout; /* Default action is to reset the board on a timeout. */ static unsigned char action_val = WDOG_TIMEOUT_RESET; @@ -156,10 +156,10 @@ static unsigned char preop_val = WDOG_PREOP_NONE; static char preop[16] = "preop_none"; static DEFINE_SPINLOCK(ipmi_read_lock); -static char data_to_read = 0; +static char data_to_read; static DECLARE_WAIT_QUEUE_HEAD(read_q); -static struct fasync_struct *fasync_q = NULL; -static char pretimeout_since_last_heartbeat = 0; +static struct fasync_struct *fasync_q; +static char pretimeout_since_last_heartbeat; static char expect_close; static int ifnum_to_use = -1; @@ -177,7 +177,7 @@ static void ipmi_unregister_watchdog(int ipmi_intf); /* If true, the driver will start running as soon as it is configured and ready. */ -static int start_now = 0; +static int start_now; static int set_param_int(const char *val, struct kernel_param *kp) { @@ -300,16 +300,16 @@ MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started " static unsigned char ipmi_watchdog_state = WDOG_TIMEOUT_NONE; /* If shutting down via IPMI, we ignore the heartbeat. */ -static int ipmi_ignore_heartbeat = 0; +static int ipmi_ignore_heartbeat; /* Is someone using the watchdog? Only one user is allowed. */ -static unsigned long ipmi_wdog_open = 0; +static unsigned long ipmi_wdog_open; /* If set to 1, the heartbeat command will set the state to reset and start the timer. The timer doesn't normally run when the driver is first opened until the heartbeat is set the first time, this variable is used to accomplish this. */ -static int ipmi_start_timer_on_heartbeat = 0; +static int ipmi_start_timer_on_heartbeat; /* IPMI version of the BMC. */ static unsigned char ipmi_version_major; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 089020e0ee5..4f1813e0475 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -646,7 +646,8 @@ static inline size_t read_zero_pagealigned(char __user * buf, size_t size) count = size; zap_page_range(vma, addr, count, NULL); - zeromap_page_range(vma, addr, count, PAGE_COPY); + if (zeromap_page_range(vma, addr, count, PAGE_COPY)) + break; size -= count; buf += count; @@ -713,11 +714,14 @@ out: static int mmap_zero(struct file * file, struct vm_area_struct * vma) { + int err; + if (vma->vm_flags & VM_SHARED) return shmem_zero_setup(vma); - if (zeromap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) - return -EAGAIN; - return 0; + err = zeromap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, vma->vm_page_prot); + BUG_ON(err == -EEXIST); + return err; } #else /* CONFIG_MMU */ static ssize_t read_zero(struct file * file, char * buf, diff --git a/drivers/char/random.c b/drivers/char/random.c index 092a01cc02d..13d0b1350a6 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1203,7 +1203,7 @@ static int proc_do_uuid(ctl_table *table, int write, struct file *filp, static int uuid_strategy(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { unsigned char tmp_uuid[16], *uuid; unsigned int len; diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index 8ab61ef97b4..b6bcdbbf57b 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -77,11 +77,11 @@ static struct clocksource clocksource_acpi_pm = { #ifdef CONFIG_PCI -static int acpi_pm_good; +static int __devinitdata acpi_pm_good; static int __init acpi_pm_good_setup(char *__str) { - acpi_pm_good = 1; - return 1; + acpi_pm_good = 1; + return 1; } __setup("acpi_pm_good", acpi_pm_good_setup); diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index e816535ab30..879250d3d06 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -53,7 +53,7 @@ config CRYPTO_DEV_PADLOCK_SHA config CRYPTO_DEV_GEODE tristate "Support for the Geode LX AES engine" - depends on CRYPTO && X86_32 + depends on CRYPTO && X86_32 && PCI select CRYPTO_ALGAPI select CRYPTO_BLKCIPHER default m diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 88214943d00..5969cec58dc 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -687,8 +687,15 @@ static void ide_dump_status_no_sense(ide_drive_t *drive, const char *msg, u8 sta static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret) { struct request *rq = HWGROUP(drive)->rq; + ide_hwif_t *hwif = HWIF(drive); int stat, err, sense_key; + /* We may have bogus DMA interrupts in PIO state here */ + if (HWIF(drive)->dma_status && hwif->atapi_irq_bogon) { + stat = hwif->INB(hwif->dma_status); + /* Should we force the bit as well ? */ + hwif->OUTB(stat, hwif->dma_status); + } /* Check for errors. */ stat = HWIF(drive)->INB(IDE_STATUS_REG); if (stat_ret) diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c index 3ca581063f7..7cb48576e47 100644 --- a/drivers/ide/pci/pdc202xx_new.c +++ b/drivers/ide/pci/pdc202xx_new.c @@ -39,6 +39,14 @@ #define PDC202_DEBUG_CABLE 0 +#undef DEBUG + +#ifdef DEBUG +#define DBG(fmt, args...) printk("%s: " fmt, __FUNCTION__, ## args) +#else +#define DBG(fmt, args...) +#endif + static const char *pdc_quirk_drives[] = { "QUANTUM FIREBALLlct08 08", "QUANTUM FIREBALLP KA6.4", @@ -51,37 +59,11 @@ static const char *pdc_quirk_drives[] = { NULL }; -#define set_2regs(a, b) \ - do { \ - hwif->OUTB((a + adj), indexreg); \ - hwif->OUTB(b, datareg); \ - } while(0) - -#define set_ultra(a, b, c) \ - do { \ - set_2regs(0x10,(a)); \ - set_2regs(0x11,(b)); \ - set_2regs(0x12,(c)); \ - } while(0) - -#define set_ata2(a, b) \ - do { \ - set_2regs(0x0e,(a)); \ - set_2regs(0x0f,(b)); \ - } while(0) - -#define set_pio(a, b, c) \ - do { \ - set_2regs(0x0c,(a)); \ - set_2regs(0x0d,(b)); \ - set_2regs(0x13,(c)); \ - } while(0) - -static u8 pdcnew_ratemask (ide_drive_t *drive) +static u8 max_dma_rate(struct pci_dev *pdev) { u8 mode; - switch(HWIF(drive)->pci_dev->device) { + switch(pdev->device) { case PCI_DEVICE_ID_PROMISE_20277: case PCI_DEVICE_ID_PROMISE_20276: case PCI_DEVICE_ID_PROMISE_20275: @@ -96,12 +78,21 @@ static u8 pdcnew_ratemask (ide_drive_t *drive) default: return 0; } - if (!eighty_ninty_three(drive)) - mode = min(mode, (u8)1); + return mode; } -static int check_in_drive_lists (ide_drive_t *drive, const char **list) +static u8 pdcnew_ratemask(ide_drive_t *drive) +{ + u8 mode = max_dma_rate(HWIF(drive)->pci_dev); + + if (!eighty_ninty_three(drive)) + mode = min_t(u8, mode, 1); + + return mode; +} + +static int check_in_drive_lists(ide_drive_t *drive, const char **list) { struct hd_driveid *id = drive->id; @@ -121,43 +112,141 @@ static int check_in_drive_lists (ide_drive_t *drive, const char **list) return 0; } -static int pdcnew_new_tune_chipset (ide_drive_t *drive, u8 xferspeed) +/** + * get_indexed_reg - Get indexed register + * @hwif: for the port address + * @index: index of the indexed register + */ +static u8 get_indexed_reg(ide_hwif_t *hwif, u8 index) +{ + u8 value; + + hwif->OUTB(index, hwif->dma_vendor1); + value = hwif->INB(hwif->dma_vendor3); + + DBG("index[%02X] value[%02X]\n", index, value); + return value; +} + +/** + * set_indexed_reg - Set indexed register + * @hwif: for the port address + * @index: index of the indexed register + */ +static void set_indexed_reg(ide_hwif_t *hwif, u8 index, u8 value) +{ + hwif->OUTB(index, hwif->dma_vendor1); + hwif->OUTB(value, hwif->dma_vendor3); + DBG("index[%02X] value[%02X]\n", index, value); +} + +/* + * ATA Timing Tables based on 133 MHz PLL output clock. + * + * If the PLL outputs 100 MHz clock, the ASIC hardware will set + * the timing registers automatically when "set features" command is + * issued to the device. However, if the PLL output clock is 133 MHz, + * the following tables must be used. + */ +static struct pio_timing { + u8 reg0c, reg0d, reg13; +} pio_timings [] = { + { 0xfb, 0x2b, 0xac }, /* PIO mode 0, IORDY off, Prefetch off */ + { 0x46, 0x29, 0xa4 }, /* PIO mode 1, IORDY off, Prefetch off */ + { 0x23, 0x26, 0x64 }, /* PIO mode 2, IORDY off, Prefetch off */ + { 0x27, 0x0d, 0x35 }, /* PIO mode 3, IORDY on, Prefetch off */ + { 0x23, 0x09, 0x25 }, /* PIO mode 4, IORDY on, Prefetch off */ +}; + +static struct mwdma_timing { + u8 reg0e, reg0f; +} mwdma_timings [] = { + { 0xdf, 0x5f }, /* MWDMA mode 0 */ + { 0x6b, 0x27 }, /* MWDMA mode 1 */ + { 0x69, 0x25 }, /* MWDMA mode 2 */ +}; + +static struct udma_timing { + u8 reg10, reg11, reg12; +} udma_timings [] = { + { 0x4a, 0x0f, 0xd5 }, /* UDMA mode 0 */ + { 0x3a, 0x0a, 0xd0 }, /* UDMA mode 1 */ + { 0x2a, 0x07, 0xcd }, /* UDMA mode 2 */ + { 0x1a, 0x05, 0xcd }, /* UDMA mode 3 */ + { 0x1a, 0x03, 0xcd }, /* UDMA mode 4 */ + { 0x1a, 0x02, 0xcb }, /* UDMA mode 5 */ + { 0x1a, 0x01, 0xcb }, /* UDMA mode 6 */ +}; + +static int pdcnew_tune_chipset(ide_drive_t *drive, u8 speed) { ide_hwif_t *hwif = HWIF(drive); - unsigned long indexreg = hwif->dma_vendor1; - unsigned long datareg = hwif->dma_vendor3; - u8 thold = 0x10; - u8 adj = (drive->dn%2) ? 0x08 : 0x00; - u8 speed = ide_rate_filter(pdcnew_ratemask(drive), xferspeed); - - if (speed == XFER_UDMA_2) { - hwif->OUTB((thold + adj), indexreg); - hwif->OUTB((hwif->INB(datareg) & 0x7f), datareg); - } + u8 adj = (drive->dn & 1) ? 0x08 : 0x00; + int err; - switch (speed) { - case XFER_UDMA_7: - speed = XFER_UDMA_6; - case XFER_UDMA_6: set_ultra(0x1a, 0x01, 0xcb); break; - case XFER_UDMA_5: set_ultra(0x1a, 0x02, 0xcb); break; - case XFER_UDMA_4: set_ultra(0x1a, 0x03, 0xcd); break; - case XFER_UDMA_3: set_ultra(0x1a, 0x05, 0xcd); break; - case XFER_UDMA_2: set_ultra(0x2a, 0x07, 0xcd); break; - case XFER_UDMA_1: set_ultra(0x3a, 0x0a, 0xd0); break; - case XFER_UDMA_0: set_ultra(0x4a, 0x0f, 0xd5); break; - case XFER_MW_DMA_2: set_ata2(0x69, 0x25); break; - case XFER_MW_DMA_1: set_ata2(0x6b, 0x27); break; - case XFER_MW_DMA_0: set_ata2(0xdf, 0x5f); break; - case XFER_PIO_4: set_pio(0x23, 0x09, 0x25); break; - case XFER_PIO_3: set_pio(0x27, 0x0d, 0x35); break; - case XFER_PIO_2: set_pio(0x23, 0x26, 0x64); break; - case XFER_PIO_1: set_pio(0x46, 0x29, 0xa4); break; - case XFER_PIO_0: set_pio(0xfb, 0x2b, 0xac); break; - default: - ; - } + speed = ide_rate_filter(pdcnew_ratemask(drive), speed); + + /* + * Issue SETFEATURES_XFER to the drive first. PDC202xx hardware will + * automatically set the timing registers based on 100 MHz PLL output. + */ + err = ide_config_drive_speed(drive, speed); + + /* + * As we set up the PLL to output 133 MHz for UltraDMA/133 capable + * chips, we must override the default register settings... + */ + if (max_dma_rate(hwif->pci_dev) == 4) { + u8 mode = speed & 0x07; + + switch (speed) { + case XFER_UDMA_6: + case XFER_UDMA_5: + case XFER_UDMA_4: + case XFER_UDMA_3: + case XFER_UDMA_2: + case XFER_UDMA_1: + case XFER_UDMA_0: + set_indexed_reg(hwif, 0x10 + adj, + udma_timings[mode].reg10); + set_indexed_reg(hwif, 0x11 + adj, + udma_timings[mode].reg11); + set_indexed_reg(hwif, 0x12 + adj, + udma_timings[mode].reg12); + break; + + case XFER_MW_DMA_2: + case XFER_MW_DMA_1: + case XFER_MW_DMA_0: + set_indexed_reg(hwif, 0x0e + adj, + mwdma_timings[mode].reg0e); + set_indexed_reg(hwif, 0x0f + adj, + mwdma_timings[mode].reg0f); + break; + case XFER_PIO_4: + case XFER_PIO_3: + case XFER_PIO_2: + case XFER_PIO_1: + case XFER_PIO_0: + set_indexed_reg(hwif, 0x0c + adj, + pio_timings[mode].reg0c); + set_indexed_reg(hwif, 0x0d + adj, + pio_timings[mode].reg0d); + set_indexed_reg(hwif, 0x13 + adj, + pio_timings[mode].reg13); + break; + default: + printk(KERN_ERR "pdc202xx_new: " + "Unknown speed %d ignored\n", speed); + } + } else if (speed == XFER_UDMA_2) { + /* Set tHOLD bit to 0 if using UDMA mode 2 */ + u8 tmp = get_indexed_reg(hwif, 0x10 + adj); - return (ide_config_drive_speed(drive, speed)); + set_indexed_reg(hwif, 0x10 + adj, tmp & 0x7f); + } + + return err; } /* 0 1 2 3 4 5 6 7 8 @@ -170,36 +259,42 @@ static int pdcnew_new_tune_chipset (ide_drive_t *drive, u8 xferspeed) static void pdcnew_tune_drive(ide_drive_t *drive, u8 pio) { pio = ide_get_best_pio_mode(drive, pio, 4, NULL); - (void)pdcnew_new_tune_chipset(drive, XFER_PIO_0 + pio); + (void)pdcnew_tune_chipset(drive, XFER_PIO_0 + pio); } -static u8 pdcnew_new_cable_detect (ide_hwif_t *hwif) +static u8 pdcnew_cable_detect(ide_hwif_t *hwif) { - hwif->OUTB(0x0b, hwif->dma_vendor1); - return ((u8)((hwif->INB(hwif->dma_vendor3) & 0x04))); + return get_indexed_reg(hwif, 0x0b) & 0x04; } -static int config_chipset_for_dma (ide_drive_t *drive) + +static int config_chipset_for_dma(ide_drive_t *drive) { struct hd_driveid *id = drive->id; ide_hwif_t *hwif = HWIF(drive); - u8 speed = -1; - u8 cable; - - u8 ultra_66 = ((id->dma_ultra & 0x0010) || - (id->dma_ultra & 0x0008)) ? 1 : 0; - - cable = pdcnew_new_cable_detect(hwif); + u8 ultra_66 = (id->dma_ultra & 0x0078) ? 1 : 0; + u8 cable = pdcnew_cable_detect(hwif); + u8 speed; if (ultra_66 && cable) { - printk(KERN_WARNING "Warning: %s channel requires an 80-pin cable for operation.\n", hwif->channel ? "Secondary":"Primary"); + printk(KERN_WARNING "Warning: %s channel " + "requires an 80-pin cable for operation.\n", + hwif->channel ? "Secondary" : "Primary"); printk(KERN_WARNING "%s reduced to Ultra33 mode.\n", drive->name); } if (drive->media != ide_disk) return 0; - if (id->capability & 4) { /* IORDY_EN & PREFETCH_EN */ - hwif->OUTB((0x13 + ((drive->dn%2) ? 0x08 : 0x00)), hwif->dma_vendor1); - hwif->OUTB((hwif->INB(hwif->dma_vendor3)|0x03), hwif->dma_vendor3); + + if (id->capability & 4) { + /* + * Set IORDY_EN & PREFETCH_EN (this seems to have + * NO real effect since this register is reloaded + * by hardware when the transfer mode is selected) + */ + u8 tmp, adj = (drive->dn & 1) ? 0x08 : 0x00; + + tmp = get_indexed_reg(hwif, 0x13 + adj); + set_indexed_reg(hwif, 0x13 + adj, tmp | 0x03); } speed = ide_dma_speed(drive, pdcnew_ratemask(drive)); @@ -211,7 +306,7 @@ static int config_chipset_for_dma (ide_drive_t *drive) return ide_dma_enable(drive); } -static int pdcnew_config_drive_xfer_rate (ide_drive_t *drive) +static int pdcnew_config_drive_xfer_rate(ide_drive_t *drive) { ide_hwif_t *hwif = HWIF(drive); struct hd_driveid *id = drive->id; @@ -236,9 +331,9 @@ fast_ata_pio: return 0; } -static int pdcnew_quirkproc (ide_drive_t *drive) +static int pdcnew_quirkproc(ide_drive_t *drive) { - return ((int) check_in_drive_lists(drive, pdc_quirk_drives)); + return check_in_drive_lists(drive, pdc_quirk_drives); } static int pdcnew_ide_dma_lostirq(ide_drive_t *drive) @@ -255,21 +350,100 @@ static int pdcnew_ide_dma_timeout(ide_drive_t *drive) return __ide_dma_timeout(drive); } -static void pdcnew_new_reset (ide_drive_t *drive) +static void pdcnew_reset(ide_drive_t *drive) { /* * Deleted this because it is redundant from the caller. */ - printk(KERN_WARNING "PDC202XX: %s channel reset.\n", + printk(KERN_WARNING "pdc202xx_new: %s channel reset.\n", HWIF(drive)->channel ? "Secondary" : "Primary"); } +/** + * read_counter - Read the byte count registers + * @dma_base: for the port address + */ +static long __devinit read_counter(u32 dma_base) +{ + u32 pri_dma_base = dma_base, sec_dma_base = dma_base + 0x08; + u8 cnt0, cnt1, cnt2, cnt3; + long count = 0, last; + int retry = 3; + + do { + last = count; + + /* Read the current count */ + outb(0x20, pri_dma_base + 0x01); + cnt0 = inb(pri_dma_base + 0x03); + outb(0x21, pri_dma_base + 0x01); + cnt1 = inb(pri_dma_base + 0x03); + outb(0x20, sec_dma_base + 0x01); + cnt2 = inb(sec_dma_base + 0x03); + outb(0x21, sec_dma_base + 0x01); + cnt3 = inb(sec_dma_base + 0x03); + + count = (cnt3 << 23) | (cnt2 << 15) | (cnt1 << 8) | cnt0; + + /* + * The 30-bit decrementing counter is read in 4 pieces. + * Incorrect value may be read when the most significant bytes + * are changing... + */ + } while (retry-- && (((last ^ count) & 0x3fff8000) || last < count)); + + DBG("cnt0[%02X] cnt1[%02X] cnt2[%02X] cnt3[%02X]\n", + cnt0, cnt1, cnt2, cnt3); + + return count; +} + +/** + * detect_pll_input_clock - Detect the PLL input clock in Hz. + * @dma_base: for the port address + * E.g. 16949000 on 33 MHz PCI bus, i.e. half of the PCI clock. + */ +static long __devinit detect_pll_input_clock(unsigned long dma_base) +{ + long start_count, end_count; + long pll_input; + u8 scr1; + + start_count = read_counter(dma_base); + + /* Start the test mode */ + outb(0x01, dma_base + 0x01); + scr1 = inb(dma_base + 0x03); + DBG("scr1[%02X]\n", scr1); + outb(scr1 | 0x40, dma_base + 0x03); + + /* Let the counter run for 10 ms. */ + mdelay(10); + + end_count = read_counter(dma_base); + + /* Stop the test mode */ + outb(0x01, dma_base + 0x01); + scr1 = inb(dma_base + 0x03); + DBG("scr1[%02X]\n", scr1); + outb(scr1 & ~0x40, dma_base + 0x03); + + /* + * Calculate the input clock in Hz + * (the clock counter is 30 bit wide and counts down) + */ + pll_input = ((start_count - end_count) & 0x3ffffff) * 100; + + DBG("start[%ld] end[%ld]\n", start_count, end_count); + + return pll_input; +} + #ifdef CONFIG_PPC_PMAC static void __devinit apple_kiwi_init(struct pci_dev *pdev) { struct device_node *np = pci_device_to_OF_node(pdev); unsigned int class_rev = 0; - void __iomem *mmio; u8 conf; if (np == NULL || !device_is_compatible(np, "kiwi-root")) @@ -280,30 +454,20 @@ static void __devinit apple_kiwi_init(struct pci_dev *pdev) if (class_rev >= 0x03) { /* Setup chip magic config stuff (from darwin) */ - pci_read_config_byte(pdev, 0x40, &conf); - pci_write_config_byte(pdev, 0x40, conf | 0x01); - } - mmio = ioremap(pci_resource_start(pdev, 5), - pci_resource_len(pdev, 5)); - - /* Setup some PLL stuffs */ - switch (pdev->device) { - case PCI_DEVICE_ID_PROMISE_20270: - writew(0x0d2b, mmio + 0x1202); - mdelay(30); - break; - case PCI_DEVICE_ID_PROMISE_20271: - writew(0x0826, mmio + 0x1202); - mdelay(30); - break; + pci_read_config_byte (pdev, 0x40, &conf); + pci_write_config_byte(pdev, 0x40, (conf | 0x01)); } - - iounmap(mmio); } #endif /* CONFIG_PPC_PMAC */ static unsigned int __devinit init_chipset_pdcnew(struct pci_dev *dev, const char *name) { + unsigned long dma_base = pci_resource_start(dev, 4); + unsigned long sec_dma_base = dma_base + 0x08; + long pll_input, pll_output, ratio; + int f, r; + u8 pll_ctl0, pll_ctl1; + if (dev->resource[PCI_ROM_RESOURCE].start) { pci_write_config_dword(dev, PCI_ROM_ADDRESS, dev->resource[PCI_ROM_RESOURCE].start | PCI_ROM_ADDRESS_ENABLE); @@ -315,6 +479,106 @@ static unsigned int __devinit init_chipset_pdcnew(struct pci_dev *dev, const cha apple_kiwi_init(dev); #endif + /* Calculate the required PLL output frequency */ + switch(max_dma_rate(dev)) { + case 4: /* it's 133 MHz for Ultra133 chips */ + pll_output = 133333333; + break; + case 3: /* and 100 MHz for Ultra100 chips */ + default: + pll_output = 100000000; + break; + } + + /* + * Detect PLL input clock. + * On some systems, where PCI bus is running at non-standard clock rate + * (e.g. 25 or 40 MHz), we have to adjust the cycle time. + * PDC20268 and newer chips employ PLL circuit to help correct timing + * registers setting. + */ + pll_input = detect_pll_input_clock(dma_base); + printk("%s: PLL input clock is %ld kHz\n", name, pll_input / 1000); + + /* Sanity check */ + if (unlikely(pll_input < 5000000L || pll_input > 70000000L)) { + printk(KERN_ERR "%s: Bad PLL input clock %ld Hz, giving up!\n", + name, pll_input); + goto out; + } + +#ifdef DEBUG + DBG("pll_output is %ld Hz\n", pll_output); + + /* Show the current clock value of PLL control register + * (maybe already configured by the BIOS) + */ + outb(0x02, sec_dma_base + 0x01); + pll_ctl0 = inb(sec_dma_base + 0x03); + outb(0x03, sec_dma_base + 0x01); + pll_ctl1 = inb(sec_dma_base + 0x03); + + DBG("pll_ctl[%02X][%02X]\n", pll_ctl0, pll_ctl1); +#endif + + /* + * Calculate the ratio of F, R and NO + * POUT = (F + 2) / (( R + 2) * NO) + */ + ratio = pll_output / (pll_input / 1000); + if (ratio < 8600L) { /* 8.6x */ + /* Using NO = 0x01, R = 0x0d */ + r = 0x0d; + } else if (ratio < 12900L) { /* 12.9x */ + /* Using NO = 0x01, R = 0x08 */ + r = 0x08; + } else if (ratio < 16100L) { /* 16.1x */ + /* Using NO = 0x01, R = 0x06 */ + r = 0x06; + } else if (ratio < 64000L) { /* 64x */ + r = 0x00; + } else { + /* Invalid ratio */ + printk(KERN_ERR "%s: Bad ratio %ld, giving up!\n", name, ratio); + goto out; + } + + f = (ratio * (r + 2)) / 1000 - 2; + + DBG("F[%d] R[%d] ratio*1000[%ld]\n", f, r, ratio); + + if (unlikely(f < 0 || f > 127)) { + /* Invalid F */ + printk(KERN_ERR "%s: F[%d] invalid!\n", name, f); + goto out; + } + + pll_ctl0 = (u8) f; + pll_ctl1 = (u8) r; + + DBG("Writing pll_ctl[%02X][%02X]\n", pll_ctl0, pll_ctl1); + + outb(0x02, sec_dma_base + 0x01); + outb(pll_ctl0, sec_dma_base + 0x03); + outb(0x03, sec_dma_base + 0x01); + outb(pll_ctl1, sec_dma_base + 0x03); + + /* Wait the PLL circuit to be stable */ + mdelay(30); + +#ifdef DEBUG + /* + * Show the current clock value of PLL control register + */ + outb(0x02, sec_dma_base + 0x01); + pll_ctl0 = inb(sec_dma_base + 0x03); + outb(0x03, sec_dma_base + 0x01); + pll_ctl1 = inb(sec_dma_base + 0x03); + + DBG("pll_ctl[%02X][%02X]\n", pll_ctl0, pll_ctl1); +#endif + + out: return dev->irq; } @@ -324,8 +588,8 @@ static void __devinit init_hwif_pdc202new(ide_hwif_t *hwif) hwif->tuneproc = &pdcnew_tune_drive; hwif->quirkproc = &pdcnew_quirkproc; - hwif->speedproc = &pdcnew_new_tune_chipset; - hwif->resetproc = &pdcnew_new_reset; + hwif->speedproc = &pdcnew_tune_chipset; + hwif->resetproc = &pdcnew_reset; hwif->drives[0].autotune = hwif->drives[1].autotune = 1; @@ -337,11 +601,14 @@ static void __devinit init_hwif_pdc202new(ide_hwif_t *hwif) hwif->ide_dma_check = &pdcnew_config_drive_xfer_rate; hwif->ide_dma_lostirq = &pdcnew_ide_dma_lostirq; hwif->ide_dma_timeout = &pdcnew_ide_dma_timeout; - if (!(hwif->udma_four)) - hwif->udma_four = (pdcnew_new_cable_detect(hwif)) ? 0 : 1; + + if (!hwif->udma_four) + hwif->udma_four = pdcnew_cable_detect(hwif) ? 0 : 1; + if (!noautodma) hwif->autodma = 1; hwif->drives[0].autodma = hwif->drives[1].autodma = hwif->autodma; + #if PDC202_DEBUG_CABLE printk(KERN_DEBUG "%s: %s-pin cable\n", hwif->name, hwif->udma_four ? "80" : "40"); diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c index cdc3aab9ebc..b1e9a8eba6b 100644 --- a/drivers/ide/pci/piix.c +++ b/drivers/ide/pci/piix.c @@ -505,6 +505,10 @@ static void __devinit init_hwif_piix(ide_hwif_t *hwif) /* This is a painful system best to let it self tune for now */ return; } + /* ESB2 appears to generate spurious DMA interrupts in PIO mode + when in native mode */ + if (hwif->pci_dev->device == PCI_DEVICE_ID_INTEL_ESB2_18) + hwif->atapi_irq_bogon = 1; hwif->autodma = 0; hwif->tuneproc = &piix_tune_drive; diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 0719b648482..695e23904d3 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -844,11 +844,11 @@ void __init ide_scan_pcibus (int scan_direction) pre_init = 0; if (!scan_direction) { - while ((dev = pci_find_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { ide_scan_pcidev(dev); } } else { - while ((dev = pci_find_device_reverse(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + while ((dev = pci_get_device_reverse(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { ide_scan_pcidev(dev); } } diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index 29ca0ab0acb..3d5f1965803 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -146,7 +146,7 @@ config TOUCHSCREEN_TOUCHWIN config TOUCHSCREEN_UCB1400 tristate "Philips UCB1400 touchscreen" - select SND_AC97_BUS + depends on SND_AC97_BUS help This enables support for the Philips UCB1400 touchscreen interface. The UCB1400 is an AC97 audio codec. The touchscreen interface diff --git a/drivers/isdn/hisax/Kconfig b/drivers/isdn/hisax/Kconfig index 6fa12cc8e4f..34ab5f7dcab 100644 --- a/drivers/isdn/hisax/Kconfig +++ b/drivers/isdn/hisax/Kconfig @@ -110,7 +110,7 @@ config HISAX_16_3 config HISAX_TELESPCI bool "Teles PCI" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV)) help This enables HiSax support for the Teles PCI. See <file:Documentation/isdn/README.HiSax> on how to configure it. @@ -238,7 +238,7 @@ config HISAX_MIC config HISAX_NETJET bool "NETjet card" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV)) help This enables HiSax support for the NetJet from Traverse Technologies. @@ -249,7 +249,7 @@ config HISAX_NETJET config HISAX_NETJET_U bool "NETspider U card" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV)) help This enables HiSax support for the Netspider U interface ISDN card from Traverse Technologies. @@ -317,7 +317,7 @@ config HISAX_GAZEL config HISAX_HFC_PCI bool "HFC PCI-Bus cards" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV)) help This enables HiSax support for the HFC-S PCI 2BDS0 based cards. @@ -344,7 +344,7 @@ config HISAX_HFC_SX config HISAX_ENTERNOW_PCI bool "Formula-n enter:now PCI card" - depends on HISAX_NETJET && PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) + depends on HISAX_NETJET && PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV)) help This enables HiSax support for the Formula-n enter:now PCI ISDN card. diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig new file mode 100644 index 00000000000..36412e90f09 --- /dev/null +++ b/drivers/kvm/Kconfig @@ -0,0 +1,33 @@ +# +# KVM configuration +# +config KVM + tristate "Kernel-based Virtual Machine (KVM) support" + depends on X86 && EXPERIMENTAL + ---help--- + Support hosting fully virtualized guest machines using hardware + virtualization extensions. You will need a fairly recent + processor equipped with virtualization extensions. You will also + need to select one or more of the processor modules below. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + To compile this as a module, choose M here: the module + will be called kvm. + + If unsure, say N. + +config KVM_INTEL + tristate "KVM for Intel processors support" + depends on KVM + ---help--- + Provides support for KVM on Intel processors equipped with the VT + extensions. + +config KVM_AMD + tristate "KVM for AMD processors support" + depends on KVM + ---help--- + Provides support for KVM on AMD processors equipped with the AMD-V + (SVM) extensions. diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile new file mode 100644 index 00000000000..c0a789fa9d6 --- /dev/null +++ b/drivers/kvm/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for Kernel-based Virtual Machine module +# + +kvm-objs := kvm_main.o mmu.o x86_emulate.o +obj-$(CONFIG_KVM) += kvm.o +kvm-intel-objs = vmx.o +obj-$(CONFIG_KVM_INTEL) += kvm-intel.o +kvm-amd-objs = svm.o +obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h new file mode 100644 index 00000000000..5785d0870ab --- /dev/null +++ b/drivers/kvm/kvm.h @@ -0,0 +1,551 @@ +#ifndef __KVM_H +#define __KVM_H + +/* + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/mm.h> + +#include "vmx.h" +#include <linux/kvm.h> + +#define CR0_PE_MASK (1ULL << 0) +#define CR0_TS_MASK (1ULL << 3) +#define CR0_NE_MASK (1ULL << 5) +#define CR0_WP_MASK (1ULL << 16) +#define CR0_NW_MASK (1ULL << 29) +#define CR0_CD_MASK (1ULL << 30) +#define CR0_PG_MASK (1ULL << 31) + +#define CR3_WPT_MASK (1ULL << 3) +#define CR3_PCD_MASK (1ULL << 4) + +#define CR3_RESEVED_BITS 0x07ULL +#define CR3_L_MODE_RESEVED_BITS (~((1ULL << 40) - 1) | 0x0fe7ULL) +#define CR3_FLAGS_MASK ((1ULL << 5) - 1) + +#define CR4_VME_MASK (1ULL << 0) +#define CR4_PSE_MASK (1ULL << 4) +#define CR4_PAE_MASK (1ULL << 5) +#define CR4_PGE_MASK (1ULL << 7) +#define CR4_VMXE_MASK (1ULL << 13) + +#define KVM_GUEST_CR0_MASK \ + (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ + | CR0_NW_MASK | CR0_CD_MASK) +#define KVM_VM_CR0_ALWAYS_ON \ + (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK) +#define KVM_GUEST_CR4_MASK \ + (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) +#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK | CR4_VME_MASK) + +#define INVALID_PAGE (~(hpa_t)0) +#define UNMAPPED_GVA (~(gpa_t)0) + +#define KVM_MAX_VCPUS 1 +#define KVM_MEMORY_SLOTS 4 +#define KVM_NUM_MMU_PAGES 256 + +#define FX_IMAGE_SIZE 512 +#define FX_IMAGE_ALIGN 16 +#define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN) + +#define DE_VECTOR 0 +#define DF_VECTOR 8 +#define TS_VECTOR 10 +#define NP_VECTOR 11 +#define SS_VECTOR 12 +#define GP_VECTOR 13 +#define PF_VECTOR 14 + +#define SELECTOR_TI_MASK (1 << 2) +#define SELECTOR_RPL_MASK 0x03 + +#define IOPL_SHIFT 12 + +/* + * Address types: + * + * gva - guest virtual address + * gpa - guest physical address + * gfn - guest frame number + * hva - host virtual address + * hpa - host physical address + * hfn - host frame number + */ + +typedef unsigned long gva_t; +typedef u64 gpa_t; +typedef unsigned long gfn_t; + +typedef unsigned long hva_t; +typedef u64 hpa_t; +typedef unsigned long hfn_t; + +struct kvm_mmu_page { + struct list_head link; + hpa_t page_hpa; + unsigned long slot_bitmap; /* One bit set per slot which has memory + * in this shadow page. + */ + int global; /* Set if all ptes in this page are global */ + u64 *parent_pte; +}; + +struct vmcs { + u32 revision_id; + u32 abort; + char data[0]; +}; + +#define vmx_msr_entry kvm_msr_entry + +struct kvm_vcpu; + +/* + * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level + * 32-bit). The kvm_mmu structure abstracts the details of the current mmu + * mode. + */ +struct kvm_mmu { + void (*new_cr3)(struct kvm_vcpu *vcpu); + int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); + void (*inval_page)(struct kvm_vcpu *vcpu, gva_t gva); + void (*free)(struct kvm_vcpu *vcpu); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + hpa_t root_hpa; + int root_level; + int shadow_root_level; +}; + +struct kvm_guest_debug { + int enabled; + unsigned long bp[4]; + int singlestep; +}; + +enum { + VCPU_REGS_RAX = 0, + VCPU_REGS_RCX = 1, + VCPU_REGS_RDX = 2, + VCPU_REGS_RBX = 3, + VCPU_REGS_RSP = 4, + VCPU_REGS_RBP = 5, + VCPU_REGS_RSI = 6, + VCPU_REGS_RDI = 7, +#ifdef __x86_64__ + VCPU_REGS_R8 = 8, + VCPU_REGS_R9 = 9, + VCPU_REGS_R10 = 10, + VCPU_REGS_R11 = 11, + VCPU_REGS_R12 = 12, + VCPU_REGS_R13 = 13, + VCPU_REGS_R14 = 14, + VCPU_REGS_R15 = 15, +#endif + NR_VCPU_REGS +}; + +enum { + VCPU_SREG_CS, + VCPU_SREG_DS, + VCPU_SREG_ES, + VCPU_SREG_FS, + VCPU_SREG_GS, + VCPU_SREG_SS, + VCPU_SREG_TR, + VCPU_SREG_LDTR, +}; + +struct kvm_vcpu { + struct kvm *kvm; + union { + struct vmcs *vmcs; + struct vcpu_svm *svm; + }; + struct mutex mutex; + int cpu; + int launched; + unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ +#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) + unsigned long irq_pending[NR_IRQ_WORDS]; + unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ + unsigned long rip; /* needs vcpu_load_rsp_rip() */ + + unsigned long cr0; + unsigned long cr2; + unsigned long cr3; + unsigned long cr4; + unsigned long cr8; + u64 shadow_efer; + u64 apic_base; + int nmsrs; + struct vmx_msr_entry *guest_msrs; + struct vmx_msr_entry *host_msrs; + + struct list_head free_pages; + struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES]; + struct kvm_mmu mmu; + + struct kvm_guest_debug guest_debug; + + char fx_buf[FX_BUF_SIZE]; + char *host_fx_image; + char *guest_fx_image; + + int mmio_needed; + int mmio_read_completed; + int mmio_is_write; + int mmio_size; + unsigned char mmio_data[8]; + gpa_t mmio_phys_addr; + + struct { + int active; + u8 save_iopl; + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } tr, es, ds, fs, gs; + } rmode; +}; + +struct kvm_memory_slot { + gfn_t base_gfn; + unsigned long npages; + unsigned long flags; + struct page **phys_mem; + unsigned long *dirty_bitmap; +}; + +struct kvm { + spinlock_t lock; /* protects everything except vcpus */ + int nmemslots; + struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; + struct list_head active_mmu_pages; + struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; + int memory_config_version; + int busy; +}; + +struct kvm_stat { + u32 pf_fixed; + u32 pf_guest; + u32 tlb_flush; + u32 invlpg; + + u32 exits; + u32 io_exits; + u32 mmio_exits; + u32 signal_exits; + u32 irq_exits; +}; + +struct descriptor_table { + u16 limit; + unsigned long base; +} __attribute__((packed)); + +struct kvm_arch_ops { + int (*cpu_has_kvm_support)(void); /* __init */ + int (*disabled_by_bios)(void); /* __init */ + void (*hardware_enable)(void *dummy); /* __init */ + void (*hardware_disable)(void *dummy); + int (*hardware_setup)(void); /* __init */ + void (*hardware_unsetup)(void); /* __exit */ + + int (*vcpu_create)(struct kvm_vcpu *vcpu); + void (*vcpu_free)(struct kvm_vcpu *vcpu); + + struct kvm_vcpu *(*vcpu_load)(struct kvm_vcpu *vcpu); + void (*vcpu_put)(struct kvm_vcpu *vcpu); + + int (*set_guest_debug)(struct kvm_vcpu *vcpu, + struct kvm_debug_guest *dbg); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); + void (*get_segment)(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); + void (*set_segment)(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); + int (*is_long_mode)(struct kvm_vcpu *vcpu); + void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); + void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu, + unsigned long cr0); + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); + void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); + void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); + unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); + void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, + int *exception); + void (*cache_regs)(struct kvm_vcpu *vcpu); + void (*decache_regs)(struct kvm_vcpu *vcpu); + unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); + void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t addr); + void (*tlb_flush)(struct kvm_vcpu *vcpu); + void (*inject_page_fault)(struct kvm_vcpu *vcpu, + unsigned long addr, u32 err_code); + + void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); + + int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); + int (*vcpu_setup)(struct kvm_vcpu *vcpu); + void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); +}; + +extern struct kvm_stat kvm_stat; +extern struct kvm_arch_ops *kvm_arch_ops; + +#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) +#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) + +int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module); +void kvm_exit_arch(void); + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu); +int kvm_mmu_init(struct kvm_vcpu *vcpu); + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); + +hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); +#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) +#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) +static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } +hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); + +void kvm_emulator_want_group7_invlpg(void); + +extern hpa_t bad_page_address; + +static inline struct page *gfn_to_page(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return slot->phys_mem[gfn - slot->base_gfn]; +} + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty(struct kvm *kvm, gfn_t gfn); + +enum emulation_result { + EMULATE_DONE, /* no further processing */ + EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ + EMULATE_FAIL, /* can't emulate this instruction */ +}; + +int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, + unsigned long cr2, u16 error_code); +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); +void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags); + +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, + unsigned long *rflags); + +struct x86_emulate_ctxt; + +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); +int emulate_clts(struct kvm_vcpu *vcpu); +int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, + unsigned long *dest); +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long value); + +void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0); +void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0); +void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0); +void lmsw(struct kvm_vcpu *vcpu, unsigned long msw); + +#ifdef __x86_64__ +void set_efer(struct kvm_vcpu *vcpu, u64 efer); +#endif + +void fx_init(struct kvm_vcpu *vcpu); + +void load_msrs(struct vmx_msr_entry *e, int n); +void save_msrs(struct vmx_msr_entry *e, int n); +void kvm_resched(struct kvm_vcpu *vcpu); + +int kvm_read_guest(struct kvm_vcpu *vcpu, + gva_t addr, + unsigned long size, + void *dest); + +int kvm_write_guest(struct kvm_vcpu *vcpu, + gva_t addr, + unsigned long size, + void *data); + +unsigned long segment_base(u16 selector); + +static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : NULL; +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return vcpu->cr4 & CR4_PAE_MASK; +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return vcpu->cr4 & CR4_PSE_MASK; +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return vcpu->cr0 & CR0_PG_MASK; +} + +static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + return slot - kvm->memslots; +} + +static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) +{ + struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page->private; +} + +static inline u16 read_fs(void) +{ + u16 seg; + asm ("mov %%fs, %0" : "=g"(seg)); + return seg; +} + +static inline u16 read_gs(void) +{ + u16 seg; + asm ("mov %%gs, %0" : "=g"(seg)); + return seg; +} + +static inline u16 read_ldt(void) +{ + u16 ldt; + asm ("sldt %0" : "=g"(ldt)); + return ldt; +} + +static inline void load_fs(u16 sel) +{ + asm ("mov %0, %%fs" : : "rm"(sel)); +} + +static inline void load_gs(u16 sel) +{ + asm ("mov %0, %%gs" : : "rm"(sel)); +} + +#ifndef load_ldt +static inline void load_ldt(u16 sel) +{ + asm ("lldt %0" : : "g"(sel)); +} +#endif + +static inline void get_idt(struct descriptor_table *table) +{ + asm ("sidt %0" : "=m"(*table)); +} + +static inline void get_gdt(struct descriptor_table *table) +{ + asm ("sgdt %0" : "=m"(*table)); +} + +static inline unsigned long read_tr_base(void) +{ + u16 tr; + asm ("str %0" : "=g"(tr)); + return segment_base(tr); +} + +#ifdef __x86_64__ +static inline unsigned long read_msr(unsigned long msr) +{ + u64 value; + + rdmsrl(msr, value); + return value; +} +#endif + +static inline void fx_save(void *image) +{ + asm ("fxsave (%0)":: "r" (image)); +} + +static inline void fx_restore(void *image) +{ + asm ("fxrstor (%0)":: "r" (image)); +} + +static inline void fpu_init(void) +{ + asm ("finit"); +} + +static inline u32 get_rdx_init_val(void) +{ + return 0x600; /* P6 family */ +} + +#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" +#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" +#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" +#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" +#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" +#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" +#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" + +#define MSR_IA32_TIME_STAMP_COUNTER 0x010 + +#define TSS_IOPB_BASE_OFFSET 0x66 +#define TSS_BASE_SIZE 0x68 +#define TSS_IOPB_SIZE (65536 / 8) +#define TSS_REDIRECTION_SIZE (256 / 8) +#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) + +#ifdef __x86_64__ + +/* + * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. Therefore + * we need to allocate shadow page tables in the first 4GB of memory, which + * happens to fit the DMA32 zone. + */ +#define GFP_KVM_MMU (GFP_KERNEL | __GFP_DMA32) + +#else + +#define GFP_KVM_MMU GFP_KERNEL + +#endif + +#endif diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c new file mode 100644 index 00000000000..b6b8a41b5ec --- /dev/null +++ b/drivers/kvm/kvm_main.c @@ -0,0 +1,1935 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "kvm.h" + +#include <linux/kvm.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <asm/processor.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <asm/msr.h> +#include <linux/mm.h> +#include <linux/miscdevice.h> +#include <linux/vmalloc.h> +#include <asm/uaccess.h> +#include <linux/reboot.h> +#include <asm/io.h> +#include <linux/debugfs.h> +#include <linux/highmem.h> +#include <linux/file.h> +#include <asm/desc.h> + +#include "x86_emulate.h" +#include "segment_descriptor.h" + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +struct kvm_arch_ops *kvm_arch_ops; +struct kvm_stat kvm_stat; +EXPORT_SYMBOL_GPL(kvm_stat); + +static struct kvm_stats_debugfs_item { + const char *name; + u32 *data; + struct dentry *dentry; +} debugfs_entries[] = { + { "pf_fixed", &kvm_stat.pf_fixed }, + { "pf_guest", &kvm_stat.pf_guest }, + { "tlb_flush", &kvm_stat.tlb_flush }, + { "invlpg", &kvm_stat.invlpg }, + { "exits", &kvm_stat.exits }, + { "io_exits", &kvm_stat.io_exits }, + { "mmio_exits", &kvm_stat.mmio_exits }, + { "signal_exits", &kvm_stat.signal_exits }, + { "irq_exits", &kvm_stat.irq_exits }, + { 0, 0 } +}; + +static struct dentry *debugfs_dir; + +#define MAX_IO_MSRS 256 + +#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL +#define LMSW_GUEST_MASK 0x0eULL +#define CR4_RESEVED_BITS (~((1ULL << 11) - 1)) +#define CR8_RESEVED_BITS (~0x0fULL) +#define EFER_RESERVED_BITS 0xfffffffffffff2fe + +struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) +{ + int i; + + for (i = 0; i < vcpu->nmsrs; ++i) + if (vcpu->guest_msrs[i].index == msr) + return &vcpu->guest_msrs[i]; + return 0; +} +EXPORT_SYMBOL_GPL(find_msr_entry); + +#ifdef __x86_64__ +// LDT or TSS descriptor in the GDT. 16 bytes. +struct segment_descriptor_64 { + struct segment_descriptor s; + u32 base_higher; + u32 pad_zero; +}; + +#endif + +unsigned long segment_base(u16 selector) +{ + struct descriptor_table gdt; + struct segment_descriptor *d; + unsigned long table_base; + typedef unsigned long ul; + unsigned long v; + + if (selector == 0) + return 0; + + asm ("sgdt %0" : "=m"(gdt)); + table_base = gdt.base; + + if (selector & 4) { /* from ldt */ + u16 ldt_selector; + + asm ("sldt %0" : "=g"(ldt_selector)); + table_base = segment_base(ldt_selector); + } + d = (struct segment_descriptor *)(table_base + (selector & ~7)); + v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); +#ifdef __x86_64__ + if (d->system == 0 + && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; +#endif + return v; +} +EXPORT_SYMBOL_GPL(segment_base); + +int kvm_read_guest(struct kvm_vcpu *vcpu, + gva_t addr, + unsigned long size, + void *dest) +{ + unsigned char *host_buf = dest; + unsigned long req_size = size; + + while (size) { + hpa_t paddr; + unsigned now; + unsigned offset; + hva_t guest_buf; + + paddr = gva_to_hpa(vcpu, addr); + + if (is_error_hpa(paddr)) + break; + + guest_buf = (hva_t)kmap_atomic( + pfn_to_page(paddr >> PAGE_SHIFT), + KM_USER0); + offset = addr & ~PAGE_MASK; + guest_buf |= offset; + now = min(size, PAGE_SIZE - offset); + memcpy(host_buf, (void*)guest_buf, now); + host_buf += now; + addr += now; + size -= now; + kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); + } + return req_size - size; +} +EXPORT_SYMBOL_GPL(kvm_read_guest); + +int kvm_write_guest(struct kvm_vcpu *vcpu, + gva_t addr, + unsigned long size, + void *data) +{ + unsigned char *host_buf = data; + unsigned long req_size = size; + + while (size) { + hpa_t paddr; + unsigned now; + unsigned offset; + hva_t guest_buf; + + paddr = gva_to_hpa(vcpu, addr); + + if (is_error_hpa(paddr)) + break; + + guest_buf = (hva_t)kmap_atomic( + pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0); + offset = addr & ~PAGE_MASK; + guest_buf |= offset; + now = min(size, PAGE_SIZE - offset); + memcpy((void*)guest_buf, host_buf, now); + host_buf += now; + addr += now; + size -= now; + kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); + } + return req_size - size; +} +EXPORT_SYMBOL_GPL(kvm_write_guest); + +static int vcpu_slot(struct kvm_vcpu *vcpu) +{ + return vcpu - vcpu->kvm->vcpus; +} + +/* + * Switches to specified vcpu, until a matching vcpu_put() + */ +static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot) +{ + struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot]; + + mutex_lock(&vcpu->mutex); + if (unlikely(!vcpu->vmcs)) { + mutex_unlock(&vcpu->mutex); + return 0; + } + return kvm_arch_ops->vcpu_load(vcpu); +} + +static void vcpu_put(struct kvm_vcpu *vcpu) +{ + kvm_arch_ops->vcpu_put(vcpu); + put_cpu(); + mutex_unlock(&vcpu->mutex); +} + +static int kvm_dev_open(struct inode *inode, struct file *filp) +{ + struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + int i; + + if (!kvm) + return -ENOMEM; + + spin_lock_init(&kvm->lock); + INIT_LIST_HEAD(&kvm->active_mmu_pages); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + struct kvm_vcpu *vcpu = &kvm->vcpus[i]; + + mutex_init(&vcpu->mutex); + vcpu->mmu.root_hpa = INVALID_PAGE; + INIT_LIST_HEAD(&vcpu->free_pages); + } + filp->private_data = kvm; + return 0; +} + +/* + * Free any memory in @free but not in @dont. + */ +static void kvm_free_physmem_slot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + if (!dont || free->phys_mem != dont->phys_mem) + if (free->phys_mem) { + for (i = 0; i < free->npages; ++i) + __free_page(free->phys_mem[i]); + vfree(free->phys_mem); + } + + if (!dont || free->dirty_bitmap != dont->dirty_bitmap) + vfree(free->dirty_bitmap); + + free->phys_mem = 0; + free->npages = 0; + free->dirty_bitmap = 0; +} + +static void kvm_free_physmem(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->nmemslots; ++i) + kvm_free_physmem_slot(&kvm->memslots[i], 0); +} + +static void kvm_free_vcpu(struct kvm_vcpu *vcpu) +{ + kvm_arch_ops->vcpu_free(vcpu); + kvm_mmu_destroy(vcpu); +} + +static void kvm_free_vcpus(struct kvm *kvm) +{ + unsigned int i; + + for (i = 0; i < KVM_MAX_VCPUS; ++i) + kvm_free_vcpu(&kvm->vcpus[i]); +} + +static int kvm_dev_release(struct inode *inode, struct file *filp) +{ + struct kvm *kvm = filp->private_data; + + kvm_free_vcpus(kvm); + kvm_free_physmem(kvm); + kfree(kvm); + return 0; +} + +static void inject_gp(struct kvm_vcpu *vcpu) +{ + kvm_arch_ops->inject_gp(vcpu, 0); +} + +static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu, + unsigned long cr3) +{ + gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; + unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5; + int i; + u64 pdpte; + u64 *pdpt; + struct kvm_memory_slot *memslot; + + spin_lock(&vcpu->kvm->lock); + memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn); + /* FIXME: !memslot - emulate? 0xff? */ + pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0); + + for (i = 0; i < 4; ++i) { + pdpte = pdpt[offset + i]; + if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) + break; + } + + kunmap_atomic(pdpt, KM_USER0); + spin_unlock(&vcpu->kvm->lock); + + return i != 4; +} + +void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (cr0 & CR0_RESEVED_BITS) { + printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", + cr0, vcpu->cr0); + inject_gp(vcpu); + return; + } + + if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) { + printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); + inject_gp(vcpu); + return; + } + + if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) { + printk(KERN_DEBUG "set_cr0: #GP, set PG flag " + "and a clear PE flag\n"); + inject_gp(vcpu); + return; + } + + if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { +#ifdef __x86_64__ + if ((vcpu->shadow_efer & EFER_LME)) { + int cs_db, cs_l; + + if (!is_pae(vcpu)) { + printk(KERN_DEBUG "set_cr0: #GP, start paging " + "in long mode while PAE is disabled\n"); + inject_gp(vcpu); + return; + } + kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) { + printk(KERN_DEBUG "set_cr0: #GP, start paging " + "in long mode while CS.L == 1\n"); + inject_gp(vcpu); + return; + + } + } else +#endif + if (is_pae(vcpu) && + pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { + printk(KERN_DEBUG "set_cr0: #GP, pdptrs " + "reserved bits\n"); + inject_gp(vcpu); + return; + } + + } + + kvm_arch_ops->set_cr0(vcpu, cr0); + vcpu->cr0 = cr0; + + spin_lock(&vcpu->kvm->lock); + kvm_mmu_reset_context(vcpu); + spin_unlock(&vcpu->kvm->lock); + return; +} +EXPORT_SYMBOL_GPL(set_cr0); + +void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +{ + set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); +} +EXPORT_SYMBOL_GPL(lmsw); + +void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (cr4 & CR4_RESEVED_BITS) { + printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); + inject_gp(vcpu); + return; + } + + if (kvm_arch_ops->is_long_mode(vcpu)) { + if (!(cr4 & CR4_PAE_MASK)) { + printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " + "in long mode\n"); + inject_gp(vcpu); + return; + } + } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK) + && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { + printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); + inject_gp(vcpu); + } + + if (cr4 & CR4_VMXE_MASK) { + printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); + inject_gp(vcpu); + return; + } + kvm_arch_ops->set_cr4(vcpu, cr4); + spin_lock(&vcpu->kvm->lock); + kvm_mmu_reset_context(vcpu); + spin_unlock(&vcpu->kvm->lock); +} +EXPORT_SYMBOL_GPL(set_cr4); + +void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (kvm_arch_ops->is_long_mode(vcpu)) { + if ( cr3 & CR3_L_MODE_RESEVED_BITS) { + printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); + inject_gp(vcpu); + return; + } + } else { + if (cr3 & CR3_RESEVED_BITS) { + printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); + inject_gp(vcpu); + return; + } + if (is_paging(vcpu) && is_pae(vcpu) && + pdptrs_have_reserved_bits_set(vcpu, cr3)) { + printk(KERN_DEBUG "set_cr3: #GP, pdptrs " + "reserved bits\n"); + inject_gp(vcpu); + return; + } + } + + vcpu->cr3 = cr3; + spin_lock(&vcpu->kvm->lock); + vcpu->mmu.new_cr3(vcpu); + spin_unlock(&vcpu->kvm->lock); +} +EXPORT_SYMBOL_GPL(set_cr3); + +void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if ( cr8 & CR8_RESEVED_BITS) { + printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); + inject_gp(vcpu); + return; + } + vcpu->cr8 = cr8; +} +EXPORT_SYMBOL_GPL(set_cr8); + +void fx_init(struct kvm_vcpu *vcpu) +{ + struct __attribute__ ((__packed__)) fx_image_s { + u16 control; //fcw + u16 status; //fsw + u16 tag; // ftw + u16 opcode; //fop + u64 ip; // fpu ip + u64 operand;// fpu dp + u32 mxcsr; + u32 mxcsr_mask; + + } *fx_image; + + fx_save(vcpu->host_fx_image); + fpu_init(); + fx_save(vcpu->guest_fx_image); + fx_restore(vcpu->host_fx_image); + + fx_image = (struct fx_image_s *)vcpu->guest_fx_image; + fx_image->mxcsr = 0x1f80; + memset(vcpu->guest_fx_image + sizeof(struct fx_image_s), + 0, FX_IMAGE_SIZE - sizeof(struct fx_image_s)); +} +EXPORT_SYMBOL_GPL(fx_init); + +/* + * Creates some virtual cpus. Good luck creating more than one. + */ +static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n) +{ + int r; + struct kvm_vcpu *vcpu; + + r = -EINVAL; + if (n < 0 || n >= KVM_MAX_VCPUS) + goto out; + + vcpu = &kvm->vcpus[n]; + + mutex_lock(&vcpu->mutex); + + if (vcpu->vmcs) { + mutex_unlock(&vcpu->mutex); + return -EEXIST; + } + + vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, + FX_IMAGE_ALIGN); + vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; + + vcpu->cpu = -1; /* First load will set up TR */ + vcpu->kvm = kvm; + r = kvm_arch_ops->vcpu_create(vcpu); + if (r < 0) + goto out_free_vcpus; + + kvm_arch_ops->vcpu_load(vcpu); + + r = kvm_arch_ops->vcpu_setup(vcpu); + if (r >= 0) + r = kvm_mmu_init(vcpu); + + vcpu_put(vcpu); + + if (r < 0) + goto out_free_vcpus; + + return 0; + +out_free_vcpus: + kvm_free_vcpu(vcpu); + mutex_unlock(&vcpu->mutex); +out: + return r; +} + +/* + * Allocate some memory and give it an address in the guest physical address + * space. + * + * Discontiguous memory is allowed, mostly for framebuffers. + */ +static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm, + struct kvm_memory_region *mem) +{ + int r; + gfn_t base_gfn; + unsigned long npages; + unsigned long i; + struct kvm_memory_slot *memslot; + struct kvm_memory_slot old, new; + int memory_config_version; + + r = -EINVAL; + /* General sanity checks */ + if (mem->memory_size & (PAGE_SIZE - 1)) + goto out; + if (mem->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (mem->slot >= KVM_MEMORY_SLOTS) + goto out; + if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) + goto out; + + memslot = &kvm->memslots[mem->slot]; + base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; + npages = mem->memory_size >> PAGE_SHIFT; + + if (!npages) + mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + +raced: + spin_lock(&kvm->lock); + + memory_config_version = kvm->memory_config_version; + new = old = *memslot; + + new.base_gfn = base_gfn; + new.npages = npages; + new.flags = mem->flags; + + /* Disallow changing a memory slot's size. */ + r = -EINVAL; + if (npages && old.npages && npages != old.npages) + goto out_unlock; + + /* Check for overlaps */ + r = -EEXIST; + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *s = &kvm->memslots[i]; + + if (s == memslot) + continue; + if (!((base_gfn + npages <= s->base_gfn) || + (base_gfn >= s->base_gfn + s->npages))) + goto out_unlock; + } + /* + * Do memory allocations outside lock. memory_config_version will + * detect any races. + */ + spin_unlock(&kvm->lock); + + /* Deallocate if slot is being removed */ + if (!npages) + new.phys_mem = 0; + + /* Free page dirty bitmap if unneeded */ + if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) + new.dirty_bitmap = 0; + + r = -ENOMEM; + + /* Allocate if a slot is being created */ + if (npages && !new.phys_mem) { + new.phys_mem = vmalloc(npages * sizeof(struct page *)); + + if (!new.phys_mem) + goto out_free; + + memset(new.phys_mem, 0, npages * sizeof(struct page *)); + for (i = 0; i < npages; ++i) { + new.phys_mem[i] = alloc_page(GFP_HIGHUSER + | __GFP_ZERO); + if (!new.phys_mem[i]) + goto out_free; + } + } + + /* Allocate page dirty bitmap if needed */ + if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { + unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; + + new.dirty_bitmap = vmalloc(dirty_bytes); + if (!new.dirty_bitmap) + goto out_free; + memset(new.dirty_bitmap, 0, dirty_bytes); + } + + spin_lock(&kvm->lock); + + if (memory_config_version != kvm->memory_config_version) { + spin_unlock(&kvm->lock); + kvm_free_physmem_slot(&new, &old); + goto raced; + } + + r = -EAGAIN; + if (kvm->busy) + goto out_unlock; + + if (mem->slot >= kvm->nmemslots) + kvm->nmemslots = mem->slot + 1; + + *memslot = new; + ++kvm->memory_config_version; + + spin_unlock(&kvm->lock); + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + struct kvm_vcpu *vcpu; + + vcpu = vcpu_load(kvm, i); + if (!vcpu) + continue; + kvm_mmu_reset_context(vcpu); + vcpu_put(vcpu); + } + + kvm_free_physmem_slot(&old, &new); + return 0; + +out_unlock: + spin_unlock(&kvm->lock); +out_free: + kvm_free_physmem_slot(&new, &old); +out: + return r; +} + +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + int r, i; + int n; + unsigned long any = 0; + + spin_lock(&kvm->lock); + + /* + * Prevent changes to guest memory configuration even while the lock + * is not taken. + */ + ++kvm->busy; + spin_unlock(&kvm->lock); + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = ALIGN(memslot->npages, 8) / 8; + + for (i = 0; !any && i < n; ++i) + any = memslot->dirty_bitmap[i]; + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + + if (any) { + spin_lock(&kvm->lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->lock); + memset(memslot->dirty_bitmap, 0, n); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + struct kvm_vcpu *vcpu = vcpu_load(kvm, i); + + if (!vcpu) + continue; + kvm_arch_ops->tlb_flush(vcpu); + vcpu_put(vcpu); + } + } + + r = 0; + +out: + spin_lock(&kvm->lock); + --kvm->busy; + spin_unlock(&kvm->lock); + return r; +} + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + int i; + + for (i = 0; i < kvm->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; + } + return 0; +} +EXPORT_SYMBOL_GPL(gfn_to_memslot); + +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_memory_slot *memslot = 0; + unsigned long rel_gfn; + + for (i = 0; i < kvm->nmemslots; ++i) { + memslot = &kvm->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) { + + if (!memslot || !memslot->dirty_bitmap) + return; + + rel_gfn = gfn - memslot->base_gfn; + + /* avoid RMW */ + if (!test_bit(rel_gfn, memslot->dirty_bitmap)) + set_bit(rel_gfn, memslot->dirty_bitmap); + return; + } + } +} + +static int emulator_read_std(unsigned long addr, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + void *data = val; + + while (bytes) { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + unsigned long pfn; + struct kvm_memory_slot *memslot; + void *page; + + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + pfn = gpa >> PAGE_SHIFT; + memslot = gfn_to_memslot(vcpu->kvm, pfn); + if (!memslot) + return X86EMUL_UNHANDLEABLE; + page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0); + + memcpy(data, page + offset, tocopy); + + kunmap_atomic(page, KM_USER0); + + bytes -= tocopy; + data += tocopy; + addr += tocopy; + } + + return X86EMUL_CONTINUE; +} + +static int emulator_write_std(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + printk(KERN_ERR "emulator_write_std: addr %lx n %d\n", + addr, bytes); + return X86EMUL_UNHANDLEABLE; +} + +static int emulator_read_emulated(unsigned long addr, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + vcpu->mmio_read_completed = 0; + return X86EMUL_CONTINUE; + } else if (emulator_read_std(addr, val, bytes, ctxt) + == X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; + else { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA) + return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT; + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return X86EMUL_UNHANDLEABLE; + } +} + +static int emulator_write_emulated(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, &val, bytes); + + return X86EMUL_CONTINUE; +} + +static int emulator_cmpxchg_emulated(unsigned long addr, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + static int reported; + + if (!reported) { + reported = 1; + printk(KERN_WARNING "kvm: emulating exchange as write\n"); + } + return emulator_write_emulated(addr, new, bytes, ctxt); +} + +static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return kvm_arch_ops->get_segment_base(vcpu, seg); +} + +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +{ + spin_lock(&vcpu->kvm->lock); + vcpu->mmu.inval_page(vcpu, address); + spin_unlock(&vcpu->kvm->lock); + kvm_arch_ops->invlpg(vcpu, address); + return X86EMUL_CONTINUE; +} + +int emulate_clts(struct kvm_vcpu *vcpu) +{ + unsigned long cr0 = vcpu->cr0; + + cr0 &= ~CR0_TS_MASK; + kvm_arch_ops->set_cr0(vcpu, cr0); + return X86EMUL_CONTINUE; +} + +int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + + switch (dr) { + case 0 ... 3: + *dest = kvm_arch_ops->get_dr(vcpu, dr); + return X86EMUL_CONTINUE; + default: + printk(KERN_DEBUG "%s: unexpected dr %u\n", + __FUNCTION__, dr); + return X86EMUL_UNHANDLEABLE; + } +} + +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +{ + unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; + int exception; + + kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); + if (exception) { + /* FIXME: better handling */ + return X86EMUL_UNHANDLEABLE; + } + return X86EMUL_CONTINUE; +} + +static void report_emulation_failure(struct x86_emulate_ctxt *ctxt) +{ + static int reported; + u8 opcodes[4]; + unsigned long rip = ctxt->vcpu->rip; + unsigned long rip_linear; + + rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS); + + if (reported) + return; + + emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt); + + printk(KERN_ERR "emulation failed but !mmio_needed?" + " rip %lx %02x %02x %02x %02x\n", + rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); + reported = 1; +} + +struct x86_emulate_ops emulate_ops = { + .read_std = emulator_read_std, + .write_std = emulator_write_std, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, +}; + +int emulate_instruction(struct kvm_vcpu *vcpu, + struct kvm_run *run, + unsigned long cr2, + u16 error_code) +{ + struct x86_emulate_ctxt emulate_ctxt; + int r; + int cs_db, cs_l; + + kvm_arch_ops->cache_regs(vcpu); + + kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + emulate_ctxt.vcpu = vcpu; + emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu); + emulate_ctxt.cr2 = cr2; + emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { + emulate_ctxt.cs_base = 0; + emulate_ctxt.ds_base = 0; + emulate_ctxt.es_base = 0; + emulate_ctxt.ss_base = 0; + } else { + emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); + emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); + emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); + emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); + } + + emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); + emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); + + vcpu->mmio_is_write = 0; + r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); + + if ((r || vcpu->mmio_is_write) && run) { + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } + + if (r) { + if (!vcpu->mmio_needed) { + report_emulation_failure(&emulate_ctxt); + return EMULATE_FAIL; + } + return EMULATE_DO_MMIO; + } + + kvm_arch_ops->decache_regs(vcpu); + kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) + return EMULATE_DO_MMIO; + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(emulate_instruction); + +static u64 mk_cr_64(u64 curr_cr, u32 new_val) +{ + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; +} + +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct descriptor_table dt = { limit, base }; + + kvm_arch_ops->set_gdt(vcpu, &dt); +} + +void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct descriptor_table dt = { limit, base }; + + kvm_arch_ops->set_idt(vcpu, &dt); +} + +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags) +{ + lmsw(vcpu, msw); + *rflags = kvm_arch_ops->get_rflags(vcpu); +} + +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) +{ + switch (cr) { + case 0: + return vcpu->cr0; + case 2: + return vcpu->cr2; + case 3: + return vcpu->cr3; + case 4: + return vcpu->cr4; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + return 0; + } +} + +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, + unsigned long *rflags) +{ + switch (cr) { + case 0: + set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); + *rflags = kvm_arch_ops->get_rflags(vcpu); + break; + case 2: + vcpu->cr2 = val; + break; + case 3: + set_cr3(vcpu, val); + break; + case 4: + set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + } +} + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + return kvm_arch_ops->get_msr(vcpu, msr_index, pdata); +} + +#ifdef __x86_64__ + +void set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vmx_msr_entry *msr; + + if (efer & EFER_RESERVED_BITS) { + printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", + efer); + inject_gp(vcpu); + return; + } + + if (is_paging(vcpu) + && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { + printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); + inject_gp(vcpu); + return; + } + + efer &= ~EFER_LMA; + efer |= vcpu->shadow_efer & EFER_LMA; + + vcpu->shadow_efer = efer; + + msr = find_msr_entry(vcpu, MSR_EFER); + + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; + msr->data = efer; +} +EXPORT_SYMBOL_GPL(set_efer); + +#endif + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + return kvm_arch_ops->set_msr(vcpu, msr_index, data); +} + +void kvm_resched(struct kvm_vcpu *vcpu) +{ + vcpu_put(vcpu); + cond_resched(); + /* Cannot fail - no vcpu unplug yet. */ + vcpu_load(vcpu->kvm, vcpu_slot(vcpu)); +} +EXPORT_SYMBOL_GPL(kvm_resched); + +void load_msrs(struct vmx_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + wrmsrl(e[i].index, e[i].data); +} +EXPORT_SYMBOL_GPL(load_msrs); + +void save_msrs(struct vmx_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + rdmsrl(e[i].index, e[i].data); +} +EXPORT_SYMBOL_GPL(save_msrs); + +static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run) +{ + struct kvm_vcpu *vcpu; + int r; + + if (kvm_run->vcpu < 0 || kvm_run->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + + vcpu = vcpu_load(kvm, kvm_run->vcpu); + if (!vcpu) + return -ENOENT; + + if (kvm_run->emulated) { + kvm_arch_ops->skip_emulated_instruction(vcpu); + kvm_run->emulated = 0; + } + + if (kvm_run->mmio_completed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + } + + vcpu->mmio_needed = 0; + + r = kvm_arch_ops->run(vcpu, kvm_run); + + vcpu_put(vcpu); + return r; +} + +static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs) +{ + struct kvm_vcpu *vcpu; + + if (regs->vcpu < 0 || regs->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + + vcpu = vcpu_load(kvm, regs->vcpu); + if (!vcpu) + return -ENOENT; + + kvm_arch_ops->cache_regs(vcpu); + + regs->rax = vcpu->regs[VCPU_REGS_RAX]; + regs->rbx = vcpu->regs[VCPU_REGS_RBX]; + regs->rcx = vcpu->regs[VCPU_REGS_RCX]; + regs->rdx = vcpu->regs[VCPU_REGS_RDX]; + regs->rsi = vcpu->regs[VCPU_REGS_RSI]; + regs->rdi = vcpu->regs[VCPU_REGS_RDI]; + regs->rsp = vcpu->regs[VCPU_REGS_RSP]; + regs->rbp = vcpu->regs[VCPU_REGS_RBP]; +#ifdef __x86_64__ + regs->r8 = vcpu->regs[VCPU_REGS_R8]; + regs->r9 = vcpu->regs[VCPU_REGS_R9]; + regs->r10 = vcpu->regs[VCPU_REGS_R10]; + regs->r11 = vcpu->regs[VCPU_REGS_R11]; + regs->r12 = vcpu->regs[VCPU_REGS_R12]; + regs->r13 = vcpu->regs[VCPU_REGS_R13]; + regs->r14 = vcpu->regs[VCPU_REGS_R14]; + regs->r15 = vcpu->regs[VCPU_REGS_R15]; +#endif + + regs->rip = vcpu->rip; + regs->rflags = kvm_arch_ops->get_rflags(vcpu); + + /* + * Don't leak debug flags in case they were set for guest debugging + */ + if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) + regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_dev_ioctl_set_regs(struct kvm *kvm, struct kvm_regs *regs) +{ + struct kvm_vcpu *vcpu; + + if (regs->vcpu < 0 || regs->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + + vcpu = vcpu_load(kvm, regs->vcpu); + if (!vcpu) + return -ENOENT; + + vcpu->regs[VCPU_REGS_RAX] = regs->rax; + vcpu->regs[VCPU_REGS_RBX] = regs->rbx; + vcpu->regs[VCPU_REGS_RCX] = regs->rcx; + vcpu->regs[VCPU_REGS_RDX] = regs->rdx; + vcpu->regs[VCPU_REGS_RSI] = regs->rsi; + vcpu->regs[VCPU_REGS_RDI] = regs->rdi; + vcpu->regs[VCPU_REGS_RSP] = regs->rsp; + vcpu->regs[VCPU_REGS_RBP] = regs->rbp; +#ifdef __x86_64__ + vcpu->regs[VCPU_REGS_R8] = regs->r8; + vcpu->regs[VCPU_REGS_R9] = regs->r9; + vcpu->regs[VCPU_REGS_R10] = regs->r10; + vcpu->regs[VCPU_REGS_R11] = regs->r11; + vcpu->regs[VCPU_REGS_R12] = regs->r12; + vcpu->regs[VCPU_REGS_R13] = regs->r13; + vcpu->regs[VCPU_REGS_R14] = regs->r14; + vcpu->regs[VCPU_REGS_R15] = regs->r15; +#endif + + vcpu->rip = regs->rip; + kvm_arch_ops->set_rflags(vcpu, regs->rflags); + + kvm_arch_ops->decache_regs(vcpu); + + vcpu_put(vcpu); + + return 0; +} + +static void get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + return kvm_arch_ops->get_segment(vcpu, var, seg); +} + +static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs) +{ + struct kvm_vcpu *vcpu; + struct descriptor_table dt; + + if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + vcpu = vcpu_load(kvm, sregs->vcpu); + if (!vcpu) + return -ENOENT; + + get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_arch_ops->get_idt(vcpu, &dt); + sregs->idt.limit = dt.limit; + sregs->idt.base = dt.base; + kvm_arch_ops->get_gdt(vcpu, &dt); + sregs->gdt.limit = dt.limit; + sregs->gdt.base = dt.base; + + sregs->cr0 = vcpu->cr0; + sregs->cr2 = vcpu->cr2; + sregs->cr3 = vcpu->cr3; + sregs->cr4 = vcpu->cr4; + sregs->cr8 = vcpu->cr8; + sregs->efer = vcpu->shadow_efer; + sregs->apic_base = vcpu->apic_base; + + memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, + sizeof sregs->interrupt_bitmap); + + vcpu_put(vcpu); + + return 0; +} + +static void set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + return kvm_arch_ops->set_segment(vcpu, var, seg); +} + +static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs) +{ + struct kvm_vcpu *vcpu; + int mmu_reset_needed = 0; + int i; + struct descriptor_table dt; + + if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + vcpu = vcpu_load(kvm, sregs->vcpu); + if (!vcpu) + return -ENOENT; + + set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + dt.limit = sregs->idt.limit; + dt.base = sregs->idt.base; + kvm_arch_ops->set_idt(vcpu, &dt); + dt.limit = sregs->gdt.limit; + dt.base = sregs->gdt.base; + kvm_arch_ops->set_gdt(vcpu, &dt); + + vcpu->cr2 = sregs->cr2; + mmu_reset_needed |= vcpu->cr3 != sregs->cr3; + vcpu->cr3 = sregs->cr3; + + vcpu->cr8 = sregs->cr8; + + mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; +#ifdef __x86_64__ + kvm_arch_ops->set_efer(vcpu, sregs->efer); +#endif + vcpu->apic_base = sregs->apic_base; + + mmu_reset_needed |= vcpu->cr0 != sregs->cr0; + kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0); + + mmu_reset_needed |= vcpu->cr4 != sregs->cr4; + kvm_arch_ops->set_cr4(vcpu, sregs->cr4); + + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + + memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, + sizeof vcpu->irq_pending); + vcpu->irq_summary = 0; + for (i = 0; i < NR_IRQ_WORDS; ++i) + if (vcpu->irq_pending[i]) + __set_bit(i, &vcpu->irq_summary); + + vcpu_put(vcpu); + + return 0; +} + +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + */ +static u32 msrs_to_save[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_K6_STAR, +#ifdef __x86_64__ + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TIME_STAMP_COUNTER, +}; + + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return set_msr(vcpu, index, *data); +} + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data)) +{ + struct kvm_vcpu *vcpu; + int i; + + if (msrs->vcpu < 0 || msrs->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + + vcpu = vcpu_load(kvm, msrs->vcpu); + if (!vcpu) + return -ENOENT; + + for (i = 0; i < msrs->nmsrs; ++i) + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; + + vcpu_put(vcpu); + + return i; +} + +/* + * Read or write a bunch of msrs. Parameters are user addresses. + * + * @return number of msrs set successfully. + */ +static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data), + int writeback) +{ + struct kvm_msrs msrs; + struct kvm_msr_entry *entries; + int r, n; + unsigned size; + + r = -EFAULT; + if (copy_from_user(&msrs, user_msrs, sizeof msrs)) + goto out; + + r = -E2BIG; + if (msrs.nmsrs >= MAX_IO_MSRS) + goto out; + + r = -ENOMEM; + size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; + entries = vmalloc(size); + if (!entries) + goto out; + + r = -EFAULT; + if (copy_from_user(entries, user_msrs->entries, size)) + goto out_free; + + r = n = __msr_io(kvm, &msrs, entries, do_msr); + if (r < 0) + goto out_free; + + r = -EFAULT; + if (writeback && copy_to_user(user_msrs->entries, entries, size)) + goto out_free; + + r = n; + +out_free: + vfree(entries); +out: + return r; +} + +/* + * Translate a guest virtual address to a guest physical address. + */ +static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr) +{ + unsigned long vaddr = tr->linear_address; + struct kvm_vcpu *vcpu; + gpa_t gpa; + + vcpu = vcpu_load(kvm, tr->vcpu); + if (!vcpu) + return -ENOENT; + spin_lock(&kvm->lock); + gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); + tr->physical_address = gpa; + tr->valid = gpa != UNMAPPED_GVA; + tr->writeable = 1; + tr->usermode = 0; + spin_unlock(&kvm->lock); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq) +{ + struct kvm_vcpu *vcpu; + + if (irq->vcpu < 0 || irq->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + vcpu = vcpu_load(kvm, irq->vcpu); + if (!vcpu) + return -ENOENT; + + set_bit(irq->irq, vcpu->irq_pending); + set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_dev_ioctl_debug_guest(struct kvm *kvm, + struct kvm_debug_guest *dbg) +{ + struct kvm_vcpu *vcpu; + int r; + + if (dbg->vcpu < 0 || dbg->vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + vcpu = vcpu_load(kvm, dbg->vcpu); + if (!vcpu) + return -ENOENT; + + r = kvm_arch_ops->set_guest_debug(vcpu, dbg); + + vcpu_put(vcpu); + + return r; +} + +static long kvm_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + int r = -EINVAL; + + switch (ioctl) { + case KVM_CREATE_VCPU: { + r = kvm_dev_ioctl_create_vcpu(kvm, arg); + if (r) + goto out; + break; + } + case KVM_RUN: { + struct kvm_run kvm_run; + + r = -EFAULT; + if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run)) + goto out; + r = kvm_dev_ioctl_run(kvm, &kvm_run); + if (r < 0) + goto out; + r = -EFAULT; + if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) + goto out; + r = 0; + break; + } + case KVM_GET_REGS: { + struct kvm_regs kvm_regs; + + r = -EFAULT; + if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs)) + goto out; + r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user((void *)arg, &kvm_regs, sizeof kvm_regs)) + goto out; + r = 0; + break; + } + case KVM_SET_REGS: { + struct kvm_regs kvm_regs; + + r = -EFAULT; + if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs)) + goto out; + r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_SREGS: { + struct kvm_sregs kvm_sregs; + + r = -EFAULT; + if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs)) + goto out; + r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user((void *)arg, &kvm_sregs, sizeof kvm_sregs)) + goto out; + r = 0; + break; + } + case KVM_SET_SREGS: { + struct kvm_sregs kvm_sregs; + + r = -EFAULT; + if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs)) + goto out; + r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs); + if (r) + goto out; + r = 0; + break; + } + case KVM_TRANSLATE: { + struct kvm_translation tr; + + r = -EFAULT; + if (copy_from_user(&tr, (void *)arg, sizeof tr)) + goto out; + r = kvm_dev_ioctl_translate(kvm, &tr); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user((void *)arg, &tr, sizeof tr)) + goto out; + r = 0; + break; + } + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + + r = -EFAULT; + if (copy_from_user(&irq, (void *)arg, sizeof irq)) + goto out; + r = kvm_dev_ioctl_interrupt(kvm, &irq); + if (r) + goto out; + r = 0; + break; + } + case KVM_DEBUG_GUEST: { + struct kvm_debug_guest dbg; + + r = -EFAULT; + if (copy_from_user(&dbg, (void *)arg, sizeof dbg)) + goto out; + r = kvm_dev_ioctl_debug_guest(kvm, &dbg); + if (r) + goto out; + r = 0; + break; + } + case KVM_SET_MEMORY_REGION: { + struct kvm_memory_region kvm_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_mem, (void *)arg, sizeof kvm_mem)) + goto out; + r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem); + if (r) + goto out; + break; + } + case KVM_GET_DIRTY_LOG: { + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, (void *)arg, sizeof log)) + goto out; + r = kvm_dev_ioctl_get_dirty_log(kvm, &log); + if (r) + goto out; + break; + } + case KVM_GET_MSRS: + r = msr_io(kvm, (void __user *)arg, get_msr, 1); + break; + case KVM_SET_MSRS: + r = msr_io(kvm, (void __user *)arg, do_set_msr, 0); + break; + case KVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list __user *user_msr_list = (void __user *)arg; + struct kvm_msr_list msr_list; + unsigned n; + + r = -EFAULT; + if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) + goto out; + n = msr_list.nmsrs; + msr_list.nmsrs = ARRAY_SIZE(msrs_to_save); + if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) + goto out; + r = -E2BIG; + if (n < ARRAY_SIZE(msrs_to_save)) + goto out; + r = -EFAULT; + if (copy_to_user(user_msr_list->indices, &msrs_to_save, + sizeof msrs_to_save)) + goto out; + r = 0; + } + default: + ; + } +out: + return r; +} + +static struct page *kvm_dev_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct kvm *kvm = vma->vm_file->private_data; + unsigned long pgoff; + struct kvm_memory_slot *slot; + struct page *page; + + *type = VM_FAULT_MINOR; + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + slot = gfn_to_memslot(kvm, pgoff); + if (!slot) + return NOPAGE_SIGBUS; + page = gfn_to_page(slot, pgoff); + if (!page) + return NOPAGE_SIGBUS; + get_page(page); + return page; +} + +static struct vm_operations_struct kvm_dev_vm_ops = { + .nopage = kvm_dev_nopage, +}; + +static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_dev_vm_ops; + return 0; +} + +static struct file_operations kvm_chardev_ops = { + .open = kvm_dev_open, + .release = kvm_dev_release, + .unlocked_ioctl = kvm_dev_ioctl, + .compat_ioctl = kvm_dev_ioctl, + .mmap = kvm_dev_mmap, +}; + +static struct miscdevice kvm_dev = { + MISC_DYNAMIC_MINOR, + "kvm", + &kvm_chardev_ops, +}; + +static int kvm_reboot(struct notifier_block *notifier, unsigned long val, + void *v) +{ + if (val == SYS_RESTART) { + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. + */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1); + } + return NOTIFY_OK; +} + +static struct notifier_block kvm_reboot_notifier = { + .notifier_call = kvm_reboot, + .priority = 0, +}; + +static __init void kvm_init_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + debugfs_dir = debugfs_create_dir("kvm", 0); + for (p = debugfs_entries; p->name; ++p) + p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir, + p->data); +} + +static void kvm_exit_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + for (p = debugfs_entries; p->name; ++p) + debugfs_remove(p->dentry); + debugfs_remove(debugfs_dir); +} + +hpa_t bad_page_address; + +int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) +{ + int r; + + kvm_arch_ops = ops; + + if (!kvm_arch_ops->cpu_has_kvm_support()) { + printk(KERN_ERR "kvm: no hardware support\n"); + return -EOPNOTSUPP; + } + if (kvm_arch_ops->disabled_by_bios()) { + printk(KERN_ERR "kvm: disabled by bios\n"); + return -EOPNOTSUPP; + } + + r = kvm_arch_ops->hardware_setup(); + if (r < 0) + return r; + + on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1); + register_reboot_notifier(&kvm_reboot_notifier); + + kvm_chardev_ops.owner = module; + + r = misc_register(&kvm_dev); + if (r) { + printk (KERN_ERR "kvm: misc device register failed\n"); + goto out_free; + } + + return r; + +out_free: + unregister_reboot_notifier(&kvm_reboot_notifier); + on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1); + kvm_arch_ops->hardware_unsetup(); + return r; +} + +void kvm_exit_arch(void) +{ + misc_deregister(&kvm_dev); + + unregister_reboot_notifier(&kvm_reboot_notifier); + on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1); + kvm_arch_ops->hardware_unsetup(); +} + +static __init int kvm_init(void) +{ + static struct page *bad_page; + int r = 0; + + kvm_init_debug(); + + if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) { + r = -ENOMEM; + goto out; + } + + bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; + memset(__va(bad_page_address), 0, PAGE_SIZE); + + return r; + +out: + kvm_exit_debug(); + return r; +} + +static __exit void kvm_exit(void) +{ + kvm_exit_debug(); + __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); +} + +module_init(kvm_init) +module_exit(kvm_exit) + +EXPORT_SYMBOL_GPL(kvm_init_arch); +EXPORT_SYMBOL_GPL(kvm_exit_arch); diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h new file mode 100644 index 00000000000..7d7f2aa1096 --- /dev/null +++ b/drivers/kvm/kvm_svm.h @@ -0,0 +1,44 @@ +#ifndef __KVM_SVM_H +#define __KVM_SVM_H + +#include <linux/types.h> +#include <linux/list.h> +#include <asm/msr.h> + +#include "svm.h" +#include "kvm.h" + +static const u32 host_save_msrs[] = { +#ifdef __x86_64__ + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, + MSR_FS_BASE, MSR_GS_BASE, +#endif + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_IA32_DEBUGCTLMSR, /*MSR_IA32_LASTBRANCHFROMIP, + MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP,MSR_IA32_LASTINTTOIP,*/ +}; + +#define NR_HOST_SAVE_MSRS (sizeof(host_save_msrs) / sizeof(*host_save_msrs)) +#define NUM_DB_REGS 4 + +struct vcpu_svm { + struct vmcb *vmcb; + unsigned long vmcb_pa; + struct svm_cpu_data *svm_data; + uint64_t asid_generation; + + unsigned long cr0; + unsigned long cr4; + unsigned long db_regs[NUM_DB_REGS]; + + u64 next_rip; + + u64 host_msrs[NR_HOST_SAVE_MSRS]; + unsigned long host_cr2; + unsigned long host_db_regs[NUM_DB_REGS]; + unsigned long host_dr6; + unsigned long host_dr7; +}; + +#endif + diff --git a/drivers/kvm/kvm_vmx.h b/drivers/kvm/kvm_vmx.h new file mode 100644 index 00000000000..87e12d2bfa1 --- /dev/null +++ b/drivers/kvm/kvm_vmx.h @@ -0,0 +1,14 @@ +#ifndef __KVM_VMX_H +#define __KVM_VMX_H + +#ifdef __x86_64__ +/* + * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt + * mechanism (cpu bug AA24) + */ +#define NR_BAD_MSRS 2 +#else +#define NR_BAD_MSRS 0 +#endif + +#endif diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c new file mode 100644 index 00000000000..4e29d9b7211 --- /dev/null +++ b/drivers/kvm/mmu.c @@ -0,0 +1,699 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include <linux/types.h> +#include <linux/string.h> +#include <asm/page.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/module.h> + +#include "vmx.h" +#include "kvm.h" + +#define pgprintk(x...) do { } while (0) + +#define ASSERT(x) \ + if (!(x)) { \ + printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ + __FILE__, __LINE__, #x); \ + } + +#define PT64_ENT_PER_PAGE 512 +#define PT32_ENT_PER_PAGE 1024 + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_MASK (1ULL << 63) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + + +#define PT32_PTE_COPY_MASK \ + (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK | \ + PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_PAT_MASK | \ + PT_GLOBAL_MASK ) + +#define PT32_NON_PTE_COPY_MASK \ + (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK | \ + PT_ACCESSED_MASK | PT_DIRTY_MASK) + + +#define PT64_PTE_COPY_MASK \ + (PT64_NX_MASK | PT32_PTE_COPY_MASK) + +#define PT64_NON_PTE_COPY_MASK \ + (PT64_NX_MASK | PT32_NON_PTE_COPY_MASK) + + + +#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT64_SECOND_AVAIL_BITS_SHIFT 52 + +#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + +#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1) +#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT) + +#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1) +#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT)) + +#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT) + +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS ) + +#define PT64_LEVEL_MASK(level) \ + (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) + + +#define PT32_LEVEL_BITS 10 + +#define PT32_LEVEL_SHIFT(level) \ + ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS ) + +#define PT32_LEVEL_MASK(level) \ + (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) + +#define PT32_INDEX(address, level)\ + (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) + + +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK) +#define PT64_DIR_BASE_ADDR_MASK \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) + +#define PT32_BASE_ADDR_MASK PAGE_MASK +#define PT32_DIR_BASE_ADDR_MASK \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) + + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +static int is_write_protection(struct kvm_vcpu *vcpu) +{ + return vcpu->cr0 & CR0_WP_MASK; +} + +static int is_cpuid_PSE36(void) +{ + return 1; +} + +static int is_present_pte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + +static int is_writeble_pte(unsigned long pte) +{ + return pte & PT_WRITABLE_MASK; +} + +static int is_io_pte(unsigned long pte) +{ + return pte & PT_SHADOW_IO_MARK; +} + +static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) +{ + struct kvm_mmu_page *page_head = page_header(page_hpa); + + list_del(&page_head->link); + page_head->page_hpa = page_hpa; + list_add(&page_head->link, &vcpu->free_pages); +} + +static int is_empty_shadow_page(hpa_t page_hpa) +{ + u32 *pos; + u32 *end; + for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32); + pos != end; pos++) + if (*pos != 0) + return 0; + return 1; +} + +static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte) +{ + struct kvm_mmu_page *page; + + if (list_empty(&vcpu->free_pages)) + return INVALID_PAGE; + + page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); + list_del(&page->link); + list_add(&page->link, &vcpu->kvm->active_mmu_pages); + ASSERT(is_empty_shadow_page(page->page_hpa)); + page->slot_bitmap = 0; + page->global = 1; + page->parent_pte = parent_pte; + return page->page_hpa; +} + +static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) +{ + int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); + struct kvm_mmu_page *page_head = page_header(__pa(pte)); + + __set_bit(slot, &page_head->slot_bitmap); +} + +hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + hpa_t hpa = gpa_to_hpa(vcpu, gpa); + + return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa; +} + +hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + struct kvm_memory_slot *slot; + struct page *page; + + ASSERT((gpa & HPA_ERR_MASK) == 0); + slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!slot) + return gpa | HPA_ERR_MASK; + page = gfn_to_page(slot, gpa >> PAGE_SHIFT); + return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) + | (gpa & (PAGE_SIZE-1)); +} + +hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return UNMAPPED_GVA; + return gpa_to_hpa(vcpu, gpa); +} + + +static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa, + int level) +{ + ASSERT(vcpu); + ASSERT(VALID_PAGE(page_hpa)); + ASSERT(level <= PT64_ROOT_LEVEL && level > 0); + + if (level == 1) + memset(__va(page_hpa), 0, PAGE_SIZE); + else { + u64 *pos; + u64 *end; + + for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE; + pos != end; pos++) { + u64 current_ent = *pos; + + *pos = 0; + if (is_present_pte(current_ent)) + release_pt_page_64(vcpu, + current_ent & + PT64_BASE_ADDR_MASK, + level - 1); + } + } + kvm_mmu_free_page(vcpu, page_hpa); +} + +static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) +{ +} + +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) +{ + int level = PT32E_ROOT_LEVEL; + hpa_t table_addr = vcpu->mmu.root_hpa; + + for (; ; level--) { + u32 index = PT64_INDEX(v, level); + u64 *table; + + ASSERT(VALID_PAGE(table_addr)); + table = __va(table_addr); + + if (level == 1) { + mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); + page_header_update_slot(vcpu->kvm, table, v); + table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | + PT_USER_MASK; + return 0; + } + + if (table[index] == 0) { + hpa_t new_table = kvm_mmu_alloc_page(vcpu, + &table[index]); + + if (!VALID_PAGE(new_table)) { + pgprintk("nonpaging_map: ENOMEM\n"); + return -ENOMEM; + } + + if (level == PT32E_ROOT_LEVEL) + table[index] = new_table | PT_PRESENT_MASK; + else + table[index] = new_table | PT_PRESENT_MASK | + PT_WRITABLE_MASK | PT_USER_MASK; + } + table_addr = table[index] & PT64_BASE_ADDR_MASK; + } +} + +static void nonpaging_flush(struct kvm_vcpu *vcpu) +{ + hpa_t root = vcpu->mmu.root_hpa; + + ++kvm_stat.tlb_flush; + pgprintk("nonpaging_flush\n"); + ASSERT(VALID_PAGE(root)); + release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); + root = kvm_mmu_alloc_page(vcpu, NULL); + ASSERT(VALID_PAGE(root)); + vcpu->mmu.root_hpa = root; + if (is_paging(vcpu)) + root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)); + kvm_arch_ops->set_cr3(vcpu, root); + kvm_arch_ops->tlb_flush(vcpu); +} + +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + return vaddr; +} + +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) +{ + int ret; + gpa_t addr = gva; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); + + for (;;) { + hpa_t paddr; + + paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); + + if (is_error_hpa(paddr)) + return 1; + + ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr); + if (ret) { + nonpaging_flush(vcpu); + continue; + } + break; + } + return ret; +} + +static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) +{ +} + +static void nonpaging_free(struct kvm_vcpu *vcpu) +{ + hpa_t root; + + ASSERT(vcpu); + root = vcpu->mmu.root_hpa; + if (VALID_PAGE(root)) + release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); + vcpu->mmu.root_hpa = INVALID_PAGE; +} + +static int nonpaging_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = nonpaging_page_fault; + context->inval_page = nonpaging_inval_page; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->free = nonpaging_free; + context->root_level = PT32E_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); + ASSERT(VALID_PAGE(context->root_hpa)); + kvm_arch_ops->set_cr3(vcpu, context->root_hpa); + return 0; +} + + +static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *page, *npage; + + list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages, + link) { + if (page->global) + continue; + + if (!page->parent_pte) + continue; + + *page->parent_pte = 0; + release_pt_page_64(vcpu, page->page_hpa, 1); + } + ++kvm_stat.tlb_flush; + kvm_arch_ops->tlb_flush(vcpu); +} + +static void paging_new_cr3(struct kvm_vcpu *vcpu) +{ + kvm_mmu_flush_tlb(vcpu); +} + +static void mark_pagetable_nonglobal(void *shadow_pte) +{ + page_header(__pa(shadow_pte))->global = 0; +} + +static inline void set_pte_common(struct kvm_vcpu *vcpu, + u64 *shadow_pte, + gpa_t gaddr, + int dirty, + u64 access_bits) +{ + hpa_t paddr; + + *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; + if (!dirty) + access_bits &= ~PT_WRITABLE_MASK; + + if (access_bits & PT_WRITABLE_MASK) + mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); + + *shadow_pte |= access_bits; + + paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); + + if (!(*shadow_pte & PT_GLOBAL_MASK)) + mark_pagetable_nonglobal(shadow_pte); + + if (is_error_hpa(paddr)) { + *shadow_pte |= gaddr; + *shadow_pte |= PT_SHADOW_IO_MARK; + *shadow_pte &= ~PT_PRESENT_MASK; + } else { + *shadow_pte |= paddr; + page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); + } +} + +static void inject_page_fault(struct kvm_vcpu *vcpu, + u64 addr, + u32 err_code) +{ + kvm_arch_ops->inject_page_fault(vcpu, addr, err_code); +} + +static inline int fix_read_pf(u64 *shadow_ent) +{ + if ((*shadow_ent & PT_SHADOW_USER_MASK) && + !(*shadow_ent & PT_USER_MASK)) { + /* + * If supervisor write protect is disabled, we shadow kernel + * pages as user pages so we can trap the write access. + */ + *shadow_ent |= PT_USER_MASK; + *shadow_ent &= ~PT_WRITABLE_MASK; + + return 1; + + } + return 0; +} + +static int may_access(u64 pte, int write, int user) +{ + + if (user && !(pte & PT_USER_MASK)) + return 0; + if (write && !(pte & PT_WRITABLE_MASK)) + return 0; + return 1; +} + +/* + * Remove a shadow pte. + */ +static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) +{ + hpa_t page_addr = vcpu->mmu.root_hpa; + int level = vcpu->mmu.shadow_root_level; + + ++kvm_stat.invlpg; + + for (; ; level--) { + u32 index = PT64_INDEX(addr, level); + u64 *table = __va(page_addr); + + if (level == PT_PAGE_TABLE_LEVEL ) { + table[index] = 0; + return; + } + + if (!is_present_pte(table[index])) + return; + + page_addr = table[index] & PT64_BASE_ADDR_MASK; + + if (level == PT_DIRECTORY_LEVEL && + (table[index] & PT_SHADOW_PS_MARK)) { + table[index] = 0; + release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL); + + kvm_arch_ops->tlb_flush(vcpu); + return; + } + } +} + +static void paging_free(struct kvm_vcpu *vcpu) +{ + nonpaging_free(vcpu); +} + +#define PTTYPE 64 +#include "paging_tmpl.h" +#undef PTTYPE + +#define PTTYPE 32 +#include "paging_tmpl.h" +#undef PTTYPE + +static int paging64_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->mmu; + + ASSERT(is_pae(vcpu)); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging64_page_fault; + context->inval_page = paging_inval_page; + context->gva_to_gpa = paging64_gva_to_gpa; + context->free = paging_free; + context->root_level = PT64_ROOT_LEVEL; + context->shadow_root_level = PT64_ROOT_LEVEL; + context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); + ASSERT(VALID_PAGE(context->root_hpa)); + kvm_arch_ops->set_cr3(vcpu, context->root_hpa | + (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + return 0; +} + +static int paging32_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->mmu; + + context->new_cr3 = paging_new_cr3; + context->page_fault = paging32_page_fault; + context->inval_page = paging_inval_page; + context->gva_to_gpa = paging32_gva_to_gpa; + context->free = paging_free; + context->root_level = PT32_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); + ASSERT(VALID_PAGE(context->root_hpa)); + kvm_arch_ops->set_cr3(vcpu, context->root_hpa | + (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + return 0; +} + +static int paging32E_init_context(struct kvm_vcpu *vcpu) +{ + int ret; + + if ((ret = paging64_init_context(vcpu))) + return ret; + + vcpu->mmu.root_level = PT32E_ROOT_LEVEL; + vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL; + return 0; +} + +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + + if (!is_paging(vcpu)) + return nonpaging_init_context(vcpu); + else if (kvm_arch_ops->is_long_mode(vcpu)) + return paging64_init_context(vcpu); + else if (is_pae(vcpu)) + return paging32E_init_context(vcpu); + else + return paging32_init_context(vcpu); +} + +static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + if (VALID_PAGE(vcpu->mmu.root_hpa)) { + vcpu->mmu.free(vcpu); + vcpu->mmu.root_hpa = INVALID_PAGE; + } +} + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} + +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ + while (!list_empty(&vcpu->free_pages)) { + struct kvm_mmu_page *page; + + page = list_entry(vcpu->free_pages.next, + struct kvm_mmu_page, link); + list_del(&page->link); + __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); + page->page_hpa = INVALID_PAGE; + } +} + +static int alloc_mmu_pages(struct kvm_vcpu *vcpu) +{ + int i; + + ASSERT(vcpu); + + for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { + struct page *page; + struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i]; + + INIT_LIST_HEAD(&page_header->link); + if ((page = alloc_page(GFP_KVM_MMU)) == NULL) + goto error_1; + page->private = (unsigned long)page_header; + page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; + memset(__va(page_header->page_hpa), 0, PAGE_SIZE); + list_add(&page_header->link, &vcpu->free_pages); + } + return 0; + +error_1: + free_mmu_pages(vcpu); + return -ENOMEM; +} + +int kvm_mmu_init(struct kvm_vcpu *vcpu) +{ + int r; + + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + ASSERT(list_empty(&vcpu->free_pages)); + + if ((r = alloc_mmu_pages(vcpu))) + return r; + + if ((r = init_kvm_mmu(vcpu))) { + free_mmu_pages(vcpu); + return r; + } + return 0; +} + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + struct kvm_mmu_page *page; + + list_for_each_entry(page, &kvm->active_mmu_pages, link) { + int i; + u64 *pt; + + if (!test_bit(slot, &page->slot_bitmap)) + continue; + + pt = __va(page->page_hpa); + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + /* avoid RMW */ + if (pt[i] & PT_WRITABLE_MASK) + pt[i] &= ~PT_WRITABLE_MASK; + + } +} diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h new file mode 100644 index 00000000000..765c2e1a048 --- /dev/null +++ b/drivers/kvm/paging_tmpl.h @@ -0,0 +1,397 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +/* + * We need the mmu code to access both 32-bit and 64-bit guest ptes, + * so the code in this file is compiled twice, once per pte size. + */ + +#if PTTYPE == 64 + #define pt_element_t u64 + #define guest_walker guest_walker64 + #define FNAME(name) paging##64_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) + #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK + #define PT_NON_PTE_COPY_MASK PT64_NON_PTE_COPY_MASK +#elif PTTYPE == 32 + #define pt_element_t u32 + #define guest_walker guest_walker32 + #define FNAME(name) paging##32_##name + #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT32_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) + #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK + #define PT_NON_PTE_COPY_MASK PT32_NON_PTE_COPY_MASK +#else + #error Invalid PTTYPE value +#endif + +/* + * The guest_walker structure emulates the behavior of the hardware page + * table walker. + */ +struct guest_walker { + int level; + pt_element_t *table; + pt_element_t inherited_ar; +}; + +static void FNAME(init_walker)(struct guest_walker *walker, + struct kvm_vcpu *vcpu) +{ + hpa_t hpa; + struct kvm_memory_slot *slot; + + walker->level = vcpu->mmu.root_level; + slot = gfn_to_memslot(vcpu->kvm, + (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK); + walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); + + ASSERT((!kvm_arch_ops->is_long_mode(vcpu) && is_pae(vcpu)) || + (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0); + + walker->table = (pt_element_t *)( (unsigned long)walker->table | + (unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) ); + walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; +} + +static void FNAME(release_walker)(struct guest_walker *walker) +{ + kunmap_atomic(walker->table, KM_USER0); +} + +static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, + u64 *shadow_pte, u64 access_bits) +{ + ASSERT(*shadow_pte == 0); + access_bits &= guest_pte; + *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); + set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, + guest_pte & PT_DIRTY_MASK, access_bits); +} + +static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, + u64 *shadow_pte, u64 access_bits, + int index) +{ + gpa_t gaddr; + + ASSERT(*shadow_pte == 0); + access_bits &= guest_pde; + gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index; + if (PTTYPE == 32 && is_cpuid_PSE36()) + gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << + (32 - PT32_DIR_PSE36_SHIFT); + *shadow_pte = (guest_pde & (PT_NON_PTE_COPY_MASK | PT_GLOBAL_MASK)) | + ((guest_pde & PT_DIR_PAT_MASK) >> + (PT_DIR_PAT_SHIFT - PT_PAT_SHIFT)); + set_pte_common(vcpu, shadow_pte, gaddr, + guest_pde & PT_DIRTY_MASK, access_bits); +} + +/* + * Fetch a guest pte from a specific level in the paging hierarchy. + */ +static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu, + struct guest_walker *walker, + int level, + gva_t addr) +{ + + ASSERT(level > 0 && level <= walker->level); + + for (;;) { + int index = PT_INDEX(addr, walker->level); + hpa_t paddr; + + ASSERT(((unsigned long)walker->table & PAGE_MASK) == + ((unsigned long)&walker->table[index] & PAGE_MASK)); + if (level == walker->level || + !is_present_pte(walker->table[index]) || + (walker->level == PT_DIRECTORY_LEVEL && + (walker->table[index] & PT_PAGE_SIZE_MASK) && + (PTTYPE == 64 || is_pse(vcpu)))) + return &walker->table[index]; + if (walker->level != 3 || kvm_arch_ops->is_long_mode(vcpu)) + walker->inherited_ar &= walker->table[index]; + paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK); + kunmap_atomic(walker->table, KM_USER0); + walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), + KM_USER0); + --walker->level; + } +} + +/* + * Fetch a shadow pte for a specific level in the paging hierarchy. + */ +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *walker) +{ + hpa_t shadow_addr; + int level; + u64 *prev_shadow_ent = NULL; + + shadow_addr = vcpu->mmu.root_hpa; + level = vcpu->mmu.shadow_root_level; + + for (; ; level--) { + u32 index = SHADOW_PT_INDEX(addr, level); + u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; + pt_element_t *guest_ent; + + if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { + if (level == PT_PAGE_TABLE_LEVEL) + return shadow_ent; + shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; + prev_shadow_ent = shadow_ent; + continue; + } + + if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) { + ASSERT(level == PT32E_ROOT_LEVEL); + guest_ent = FNAME(fetch_guest)(vcpu, walker, + PT32_ROOT_LEVEL, addr); + } else + guest_ent = FNAME(fetch_guest)(vcpu, walker, + level, addr); + + if (!is_present_pte(*guest_ent)) + return NULL; + + /* Don't set accessed bit on PAE PDPTRs */ + if (vcpu->mmu.root_level != 3 || walker->level != 3) + *guest_ent |= PT_ACCESSED_MASK; + + if (level == PT_PAGE_TABLE_LEVEL) { + + if (walker->level == PT_DIRECTORY_LEVEL) { + if (prev_shadow_ent) + *prev_shadow_ent |= PT_SHADOW_PS_MARK; + FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, + walker->inherited_ar, + PT_INDEX(addr, PT_PAGE_TABLE_LEVEL)); + } else { + ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); + FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar); + } + return shadow_ent; + } + + shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent); + if (!VALID_PAGE(shadow_addr)) + return ERR_PTR(-ENOMEM); + if (!kvm_arch_ops->is_long_mode(vcpu) && level == 3) + *shadow_ent = shadow_addr | + (*guest_ent & (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK)); + else { + *shadow_ent = shadow_addr | + (*guest_ent & PT_NON_PTE_COPY_MASK); + *shadow_ent |= (PT_WRITABLE_MASK | PT_USER_MASK); + } + prev_shadow_ent = shadow_ent; + } +} + +/* + * The guest faulted for write. We need to + * + * - check write permissions + * - update the guest pte dirty bit + * - update our own dirty page tracking structures + */ +static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, + u64 *shadow_ent, + struct guest_walker *walker, + gva_t addr, + int user) +{ + pt_element_t *guest_ent; + int writable_shadow; + gfn_t gfn; + + if (is_writeble_pte(*shadow_ent)) + return 0; + + writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK; + if (user) { + /* + * User mode access. Fail if it's a kernel page or a read-only + * page. + */ + if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow) + return 0; + ASSERT(*shadow_ent & PT_USER_MASK); + } else + /* + * Kernel mode access. Fail if it's a read-only page and + * supervisor write protection is enabled. + */ + if (!writable_shadow) { + if (is_write_protection(vcpu)) + return 0; + *shadow_ent &= ~PT_USER_MASK; + } + + guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr); + + if (!is_present_pte(*guest_ent)) { + *shadow_ent = 0; + return 0; + } + + gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + mark_page_dirty(vcpu->kvm, gfn); + *shadow_ent |= PT_WRITABLE_MASK; + *guest_ent |= PT_DIRTY_MASK; + + return 1; +} + +/* + * Page fault handler. There are several causes for a page fault: + * - there is no shadow pte for the guest pte + * - write access through a shadow pte marked read only so that we can set + * the dirty bit + * - write access to a shadow pte marked read only so we can update the page + * dirty bitmap, when userspace requests it + * - mmio access; in this case we will never install a present shadow pte + * - normal guest page fault due to the guest pte marked not present, not + * writable, or not executable + * + * Returns: 1 if we need to emulate the instruction, 0 otherwise + */ +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, + u32 error_code) +{ + int write_fault = error_code & PFERR_WRITE_MASK; + int pte_present = error_code & PFERR_PRESENT_MASK; + int user_fault = error_code & PFERR_USER_MASK; + struct guest_walker walker; + u64 *shadow_pte; + int fixed; + + /* + * Look up the shadow pte for the faulting address. + */ + for (;;) { + FNAME(init_walker)(&walker, vcpu); + shadow_pte = FNAME(fetch)(vcpu, addr, &walker); + if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */ + nonpaging_flush(vcpu); + FNAME(release_walker)(&walker); + continue; + } + break; + } + + /* + * The page is not mapped by the guest. Let the guest handle it. + */ + if (!shadow_pte) { + inject_page_fault(vcpu, addr, error_code); + FNAME(release_walker)(&walker); + return 0; + } + + /* + * Update the shadow pte. + */ + if (write_fault) + fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, + user_fault); + else + fixed = fix_read_pf(shadow_pte); + + FNAME(release_walker)(&walker); + + /* + * mmio: emulate if accessible, otherwise its a guest fault. + */ + if (is_io_pte(*shadow_pte)) { + if (may_access(*shadow_pte, write_fault, user_fault)) + return 1; + pgprintk("%s: io work, no access\n", __FUNCTION__); + inject_page_fault(vcpu, addr, + error_code | PFERR_PRESENT_MASK); + return 0; + } + + /* + * pte not present, guest page fault. + */ + if (pte_present && !fixed) { + inject_page_fault(vcpu, addr, error_code); + return 0; + } + + ++kvm_stat.pf_fixed; + + return 0; +} + +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + struct guest_walker walker; + pt_element_t guest_pte; + gpa_t gpa; + + FNAME(init_walker)(&walker, vcpu); + guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL, + vaddr); + FNAME(release_walker)(&walker); + + if (!is_present_pte(guest_pte)) + return UNMAPPED_GVA; + + if (walker.level == PT_DIRECTORY_LEVEL) { + ASSERT((guest_pte & PT_PAGE_SIZE_MASK)); + ASSERT(PTTYPE == 64 || is_pse(vcpu)); + + gpa = (guest_pte & PT_DIR_BASE_ADDR_MASK) | (vaddr & + (PT_LEVEL_MASK(PT_PAGE_TABLE_LEVEL) | ~PAGE_MASK)); + + if (PTTYPE == 32 && is_cpuid_PSE36()) + gpa |= (guest_pte & PT32_DIR_PSE36_MASK) << + (32 - PT32_DIR_PSE36_SHIFT); + } else { + gpa = (guest_pte & PT_BASE_ADDR_MASK); + gpa |= (vaddr & ~PAGE_MASK); + } + + return gpa; +} + +#undef pt_element_t +#undef guest_walker +#undef FNAME +#undef PT_BASE_ADDR_MASK +#undef PT_INDEX +#undef SHADOW_PT_INDEX +#undef PT_LEVEL_MASK +#undef PT_PTE_COPY_MASK +#undef PT_NON_PTE_COPY_MASK +#undef PT_DIR_BASE_ADDR_MASK diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h new file mode 100644 index 00000000000..71fdf458619 --- /dev/null +++ b/drivers/kvm/segment_descriptor.h @@ -0,0 +1,17 @@ +struct segment_descriptor { + u16 limit_low; + u16 base_low; + u8 base_mid; + u8 type : 4; + u8 system : 1; + u8 dpl : 2; + u8 present : 1; + u8 limit_high : 4; + u8 avl : 1; + u8 long_mode : 1; + u8 default_op : 1; + u8 granularity : 1; + u8 base_high; +} __attribute__((packed)); + + diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c new file mode 100644 index 00000000000..a33a89c6813 --- /dev/null +++ b/drivers/kvm/svm.c @@ -0,0 +1,1677 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * AMD SVM support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <asm/desc.h> + +#include "kvm_svm.h" +#include "x86_emulate.h" + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +#define IOPM_ALLOC_ORDER 2 +#define MSRPM_ALLOC_ORDER 1 + +#define DB_VECTOR 1 +#define UD_VECTOR 6 +#define GP_VECTOR 13 + +#define DR7_GD_MASK (1 << 13) +#define DR6_BD_MASK (1 << 13) +#define CR4_DE_MASK (1UL << 3) + +#define SEG_TYPE_LDT 2 +#define SEG_TYPE_BUSY_TSS16 3 + +#define KVM_EFER_LMA (1 << 10) +#define KVM_EFER_LME (1 << 8) + +unsigned long iopm_base; +unsigned long msrpm_base; + +struct kvm_ldttss_desc { + u16 limit0; + u16 base0; + unsigned base1 : 8, type : 5, dpl : 2, p : 1; + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; + u32 base3; + u32 zero1; +} __attribute__((packed)); + +struct svm_cpu_data { + int cpu; + + uint64_t asid_generation; + uint32_t max_asid; + uint32_t next_asid; + struct kvm_ldttss_desc *tss_desc; + + struct page *save_area; +}; + +static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); + +struct svm_init_data { + int cpu; + int r; +}; + +static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; + +#define NUM_MSR_MAPS (sizeof(msrpm_ranges) / sizeof(*msrpm_ranges)) +#define MSRS_RANGE_SIZE 2048 +#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) + +#define MAX_INST_SIZE 15 + +static unsigned get_addr_size(struct kvm_vcpu *vcpu) +{ + struct vmcb_save_area *sa = &vcpu->svm->vmcb->save; + u16 cs_attrib; + + if (!(sa->cr0 & CR0_PE_MASK) || (sa->rflags & X86_EFLAGS_VM)) + return 2; + + cs_attrib = sa->cs.attrib; + + return (cs_attrib & SVM_SELECTOR_L_MASK) ? 8 : + (cs_attrib & SVM_SELECTOR_DB_MASK) ? 4 : 2; +} + +static inline u8 pop_irq(struct kvm_vcpu *vcpu) +{ + int word_index = __ffs(vcpu->irq_summary); + int bit_index = __ffs(vcpu->irq_pending[word_index]); + int irq = word_index * BITS_PER_LONG + bit_index; + + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); + return irq; +} + +static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) +{ + set_bit(irq, vcpu->irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); +} + +static inline void clgi(void) +{ + asm volatile (SVM_CLGI); +} + +static inline void stgi(void) +{ + asm volatile (SVM_STGI); +} + +static inline void invlpga(unsigned long addr, u32 asid) +{ + asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); +} + +static inline unsigned long kvm_read_cr2(void) +{ + unsigned long cr2; + + asm volatile ("mov %%cr2, %0" : "=r" (cr2)); + return cr2; +} + +static inline void kvm_write_cr2(unsigned long val) +{ + asm volatile ("mov %0, %%cr2" :: "r" (val)); +} + +static inline unsigned long read_dr6(void) +{ + unsigned long dr6; + + asm volatile ("mov %%dr6, %0" : "=r" (dr6)); + return dr6; +} + +static inline void write_dr6(unsigned long val) +{ + asm volatile ("mov %0, %%dr6" :: "r" (val)); +} + +static inline unsigned long read_dr7(void) +{ + unsigned long dr7; + + asm volatile ("mov %%dr7, %0" : "=r" (dr7)); + return dr7; +} + +static inline void write_dr7(unsigned long val) +{ + asm volatile ("mov %0, %%dr7" :: "r" (val)); +} + +static inline int svm_is_long_mode(struct kvm_vcpu *vcpu) +{ + return vcpu->svm->vmcb->save.efer & KVM_EFER_LMA; +} + +static inline void force_new_asid(struct kvm_vcpu *vcpu) +{ + vcpu->svm->asid_generation--; +} + +static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) +{ + force_new_asid(vcpu); +} + +static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (!(efer & KVM_EFER_LMA)) + efer &= ~KVM_EFER_LME; + + vcpu->svm->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; + vcpu->shadow_efer = efer; +} + +static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) +{ + vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + GP_VECTOR; + vcpu->svm->vmcb->control.event_inj_err = error_code; +} + +static void inject_ud(struct kvm_vcpu *vcpu) +{ + vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_TYPE_EXEPT | + UD_VECTOR; +} + +static void inject_db(struct kvm_vcpu *vcpu) +{ + vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_TYPE_EXEPT | + DB_VECTOR; +} + +static int is_page_fault(uint32_t info) +{ + info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT); +} + +static int is_external_interrupt(u32 info) +{ + info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); +} + +static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + if (!vcpu->svm->next_rip) { + printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); + return; + } + if (vcpu->svm->next_rip - vcpu->svm->vmcb->save.rip > 15) { + printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", + __FUNCTION__, + vcpu->svm->vmcb->save.rip, + vcpu->svm->next_rip); + } + + vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip; + vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; +} + +static int has_svm(void) +{ + uint32_t eax, ebx, ecx, edx; + + if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) { + printk(KERN_INFO "has_svm: not amd\n"); + return 0; + } + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + if (eax < SVM_CPUID_FUNC) { + printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); + return 0; + } + + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { + printk(KERN_DEBUG "has_svm: svm not available\n"); + return 0; + } + return 1; +} + +static void svm_hardware_disable(void *garbage) +{ + struct svm_cpu_data *svm_data + = per_cpu(svm_data, raw_smp_processor_id()); + + if (svm_data) { + uint64_t efer; + + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); + per_cpu(svm_data, raw_smp_processor_id()) = 0; + __free_page(svm_data->save_area); + kfree(svm_data); + } +} + +static void svm_hardware_enable(void *garbage) +{ + + struct svm_cpu_data *svm_data; + uint64_t efer; +#ifdef __x86_64__ + struct desc_ptr gdt_descr; +#else + struct Xgt_desc_struct gdt_descr; +#endif + struct desc_struct *gdt; + int me = raw_smp_processor_id(); + + if (!has_svm()) { + printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); + return; + } + svm_data = per_cpu(svm_data, me); + + if (!svm_data) { + printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", + me); + return; + } + + svm_data->asid_generation = 1; + svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; + svm_data->next_asid = svm_data->max_asid + 1; + + asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); + gdt = (struct desc_struct *)gdt_descr.address; + svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); + + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); + + wrmsrl(MSR_VM_HSAVE_PA, + page_to_pfn(svm_data->save_area) << PAGE_SHIFT); +} + +static int svm_cpu_init(int cpu) +{ + struct svm_cpu_data *svm_data; + int r; + + svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); + if (!svm_data) + return -ENOMEM; + svm_data->cpu = cpu; + svm_data->save_area = alloc_page(GFP_KERNEL); + r = -ENOMEM; + if (!svm_data->save_area) + goto err_1; + + per_cpu(svm_data, cpu) = svm_data; + + return 0; + +err_1: + kfree(svm_data); + return r; + +} + +static int set_msr_interception(u32 *msrpm, unsigned msr, + int read, int write) +{ + int i; + + for (i = 0; i < NUM_MSR_MAPS; i++) { + if (msr >= msrpm_ranges[i] && + msr < msrpm_ranges[i] + MSRS_IN_RANGE) { + u32 msr_offset = (i * MSRS_IN_RANGE + msr - + msrpm_ranges[i]) * 2; + + u32 *base = msrpm + (msr_offset / 32); + u32 msr_shift = msr_offset % 32; + u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); + *base = (*base & ~(0x3 << msr_shift)) | + (mask << msr_shift); + return 1; + } + } + printk(KERN_DEBUG "%s: not found 0x%x\n", __FUNCTION__, msr); + return 0; +} + +static __init int svm_hardware_setup(void) +{ + int cpu; + struct page *iopm_pages; + struct page *msrpm_pages; + void *msrpm_va; + int r; + + + iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); + + if (!iopm_pages) + return -ENOMEM; + memset(page_address(iopm_pages), 0xff, + PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + + + msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + + r = -ENOMEM; + if (!msrpm_pages) + goto err_1; + + msrpm_va = page_address(msrpm_pages); + memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; + +#ifdef __x86_64__ + set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); + set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); + set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); + set_msr_interception(msrpm_va, MSR_STAR, 1, 1); + set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); + set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); + set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); +#endif + set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); + set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); + set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); + + for_each_online_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err_2; + } + return 0; + +err_2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); + msrpm_base = 0; +err_1: + __free_pages(iopm_pages, IOPM_ALLOC_ORDER); + iopm_base = 0; + return r; +} + +static __exit void svm_hardware_unsetup(void) +{ + __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); + __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); + iopm_base = msrpm_base = 0; +} + +static void init_seg(struct vmcb_seg *seg) +{ + seg->selector = 0; + seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | + SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ + seg->limit = 0xffff; + seg->base = 0; +} + +static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) +{ + seg->selector = 0; + seg->attrib = SVM_SELECTOR_P_MASK | type; + seg->limit = 0xffff; + seg->base = 0; +} + +static int svm_vcpu_setup(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static void init_vmcb(struct vmcb *vmcb) +{ + struct vmcb_control_area *control = &vmcb->control; + struct vmcb_save_area *save = &vmcb->save; + u64 tsc; + + control->intercept_cr_read = INTERCEPT_CR0_MASK | + INTERCEPT_CR3_MASK | + INTERCEPT_CR4_MASK; + + control->intercept_cr_write = INTERCEPT_CR0_MASK | + INTERCEPT_CR3_MASK | + INTERCEPT_CR4_MASK; + + control->intercept_dr_read = INTERCEPT_DR0_MASK | + INTERCEPT_DR1_MASK | + INTERCEPT_DR2_MASK | + INTERCEPT_DR3_MASK; + + control->intercept_dr_write = INTERCEPT_DR0_MASK | + INTERCEPT_DR1_MASK | + INTERCEPT_DR2_MASK | + INTERCEPT_DR3_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR7_MASK; + + control->intercept_exceptions = 1 << PF_VECTOR; + + + control->intercept = (1ULL << INTERCEPT_INTR) | + (1ULL << INTERCEPT_NMI) | + /* + * selective cr0 intercept bug? + * 0: 0f 22 d8 mov %eax,%cr3 + * 3: 0f 20 c0 mov %cr0,%eax + * 6: 0d 00 00 00 80 or $0x80000000,%eax + * b: 0f 22 c0 mov %eax,%cr0 + * set cr3 ->interception + * get cr0 ->interception + * set cr0 -> no interception + */ + /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ + (1ULL << INTERCEPT_CPUID) | + (1ULL << INTERCEPT_HLT) | + (1ULL << INTERCEPT_INVLPG) | + (1ULL << INTERCEPT_INVLPGA) | + (1ULL << INTERCEPT_IOIO_PROT) | + (1ULL << INTERCEPT_MSR_PROT) | + (1ULL << INTERCEPT_TASK_SWITCH) | + (1ULL << INTERCEPT_VMRUN) | + (1ULL << INTERCEPT_VMMCALL) | + (1ULL << INTERCEPT_VMLOAD) | + (1ULL << INTERCEPT_VMSAVE) | + (1ULL << INTERCEPT_STGI) | + (1ULL << INTERCEPT_CLGI) | + (1ULL << INTERCEPT_SKINIT); + + control->iopm_base_pa = iopm_base; + control->msrpm_base_pa = msrpm_base; + rdtscll(tsc); + control->tsc_offset = -tsc; + control->int_ctl = V_INTR_MASKING_MASK; + + init_seg(&save->es); + init_seg(&save->ss); + init_seg(&save->ds); + init_seg(&save->fs); + init_seg(&save->gs); + + save->cs.selector = 0xf000; + /* Executable/Readable Code Segment */ + save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | + SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; + save->cs.limit = 0xffff; + save->cs.base = 0xffff0000; + + save->gdtr.limit = 0xffff; + save->idtr.limit = 0xffff; + + init_sys_seg(&save->ldtr, SEG_TYPE_LDT); + init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); + + save->efer = MSR_EFER_SVME_MASK; + + save->dr6 = 0xffff0ff0; + save->dr7 = 0x400; + save->rflags = 2; + save->rip = 0x0000fff0; + + /* + * cr0 val on cpu init should be 0x60000010, we enable cpu + * cache by default. the orderly way is to enable cache in bios. + */ + save->cr0 = 0x00000010 | CR0_PG_MASK; + save->cr4 = CR4_PAE_MASK; + /* rdx = ?? */ +} + +static int svm_create_vcpu(struct kvm_vcpu *vcpu) +{ + struct page *page; + int r; + + r = -ENOMEM; + vcpu->svm = kzalloc(sizeof *vcpu->svm, GFP_KERNEL); + if (!vcpu->svm) + goto out1; + page = alloc_page(GFP_KERNEL); + if (!page) + goto out2; + + vcpu->svm->vmcb = page_address(page); + memset(vcpu->svm->vmcb, 0, PAGE_SIZE); + vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; + vcpu->svm->cr0 = 0x00000010; + vcpu->svm->asid_generation = 0; + memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); + init_vmcb(vcpu->svm->vmcb); + + return 0; + +out2: + kfree(vcpu->svm); +out1: + return r; +} + +static void svm_free_vcpu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->svm) + return; + if (vcpu->svm->vmcb) + __free_page(pfn_to_page(vcpu->svm->vmcb_pa >> PAGE_SHIFT)); + kfree(vcpu->svm); +} + +static struct kvm_vcpu *svm_vcpu_load(struct kvm_vcpu *vcpu) +{ + get_cpu(); + return vcpu; +} + +static void svm_vcpu_put(struct kvm_vcpu *vcpu) +{ + put_cpu(); +} + +static void svm_cache_regs(struct kvm_vcpu *vcpu) +{ + vcpu->regs[VCPU_REGS_RAX] = vcpu->svm->vmcb->save.rax; + vcpu->regs[VCPU_REGS_RSP] = vcpu->svm->vmcb->save.rsp; + vcpu->rip = vcpu->svm->vmcb->save.rip; +} + +static void svm_decache_regs(struct kvm_vcpu *vcpu) +{ + vcpu->svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; + vcpu->svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; + vcpu->svm->vmcb->save.rip = vcpu->rip; +} + +static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) +{ + return vcpu->svm->vmcb->save.rflags; +} + +static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + vcpu->svm->vmcb->save.rflags = rflags; +} + +static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) +{ + struct vmcb_save_area *save = &vcpu->svm->vmcb->save; + + switch (seg) { + case VCPU_SREG_CS: return &save->cs; + case VCPU_SREG_DS: return &save->ds; + case VCPU_SREG_ES: return &save->es; + case VCPU_SREG_FS: return &save->fs; + case VCPU_SREG_GS: return &save->gs; + case VCPU_SREG_SS: return &save->ss; + case VCPU_SREG_TR: return &save->tr; + case VCPU_SREG_LDTR: return &save->ldtr; + } + BUG(); + return 0; +} + +static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct vmcb_seg *s = svm_seg(vcpu, seg); + + return s->base; +} + +static void svm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vmcb_seg *s = svm_seg(vcpu, seg); + + var->base = s->base; + var->limit = s->limit; + var->selector = s->selector; + var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; + var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; + var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; + var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; + var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; + var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; + var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; + var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + var->unusable = !var->present; +} + +static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct vmcb_seg *s = svm_seg(vcpu, VCPU_SREG_CS); + + *db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; + *l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; +} + +static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vcpu->svm->vmcb->save.ldtr.limit; + dt->base = vcpu->svm->vmcb->save.ldtr.base; +} + +static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vcpu->svm->vmcb->save.ldtr.limit = dt->limit; + vcpu->svm->vmcb->save.ldtr.base = dt->base ; +} + +static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vcpu->svm->vmcb->save.gdtr.limit; + dt->base = vcpu->svm->vmcb->save.gdtr.base; +} + +static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vcpu->svm->vmcb->save.gdtr.limit = dt->limit; + vcpu->svm->vmcb->save.gdtr.base = dt->base ; +} + +static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ +#ifdef __x86_64__ + if (vcpu->shadow_efer & KVM_EFER_LME) { + if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { + vcpu->shadow_efer |= KVM_EFER_LMA; + vcpu->svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; + } + + if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK) ) { + vcpu->shadow_efer &= ~KVM_EFER_LMA; + vcpu->svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); + } + } +#endif + vcpu->svm->cr0 = cr0; + vcpu->svm->vmcb->save.cr0 = cr0 | CR0_PG_MASK; + vcpu->cr0 = cr0; +} + +static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + vcpu->cr4 = cr4; + vcpu->svm->vmcb->save.cr4 = cr4 | CR4_PAE_MASK; +} + +static void svm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vmcb_seg *s = svm_seg(vcpu, seg); + + s->base = var->base; + s->limit = var->limit; + s->selector = var->selector; + if (var->unusable) + s->attrib = 0; + else { + s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); + s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; + s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; + s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; + s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; + s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; + s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; + s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; + } + if (seg == VCPU_SREG_CS) + vcpu->svm->vmcb->save.cpl + = (vcpu->svm->vmcb->save.cs.attrib + >> SVM_SELECTOR_DPL_SHIFT) & 3; + +} + +/* FIXME: + + vcpu->svm->vmcb->control.int_ctl &= ~V_TPR_MASK; + vcpu->svm->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); + +*/ + +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +{ + return -EOPNOTSUPP; +} + +static void load_host_msrs(struct kvm_vcpu *vcpu) +{ + int i; + + for ( i = 0; i < NR_HOST_SAVE_MSRS; i++) + wrmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]); +} + +static void save_host_msrs(struct kvm_vcpu *vcpu) +{ + int i; + + for ( i = 0; i < NR_HOST_SAVE_MSRS; i++) + rdmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]); +} + +static void new_asid(struct kvm_vcpu *vcpu, struct svm_cpu_data *svm_data) +{ + if (svm_data->next_asid > svm_data->max_asid) { + ++svm_data->asid_generation; + svm_data->next_asid = 1; + vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; + } + + vcpu->cpu = svm_data->cpu; + vcpu->svm->asid_generation = svm_data->asid_generation; + vcpu->svm->vmcb->control.asid = svm_data->next_asid++; +} + +static void svm_invlpg(struct kvm_vcpu *vcpu, gva_t address) +{ + invlpga(address, vcpu->svm->vmcb->control.asid); // is needed? +} + +static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +{ + return vcpu->svm->db_regs[dr]; +} + +static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, + int *exception) +{ + *exception = 0; + + if (vcpu->svm->vmcb->save.dr7 & DR7_GD_MASK) { + vcpu->svm->vmcb->save.dr7 &= ~DR7_GD_MASK; + vcpu->svm->vmcb->save.dr6 |= DR6_BD_MASK; + *exception = DB_VECTOR; + return; + } + + switch (dr) { + case 0 ... 3: + vcpu->svm->db_regs[dr] = value; + return; + case 4 ... 5: + if (vcpu->cr4 & CR4_DE_MASK) { + *exception = UD_VECTOR; + return; + } + case 7: { + if (value & ~((1ULL << 32) - 1)) { + *exception = GP_VECTOR; + return; + } + vcpu->svm->vmcb->save.dr7 = value; + return; + } + default: + printk(KERN_DEBUG "%s: unexpected dr %u\n", + __FUNCTION__, dr); + *exception = UD_VECTOR; + return; + } +} + +static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 exit_int_info = vcpu->svm->vmcb->control.exit_int_info; + u64 fault_address; + u32 error_code; + enum emulation_result er; + + if (is_external_interrupt(exit_int_info)) + push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); + + spin_lock(&vcpu->kvm->lock); + + fault_address = vcpu->svm->vmcb->control.exit_info_2; + error_code = vcpu->svm->vmcb->control.exit_info_1; + if (!vcpu->mmu.page_fault(vcpu, fault_address, error_code)) { + spin_unlock(&vcpu->kvm->lock); + return 1; + } + er = emulate_instruction(vcpu, kvm_run, fault_address, error_code); + spin_unlock(&vcpu->kvm->lock); + + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++kvm_stat.mmio_exits; + kvm_run->exit_reason = KVM_EXIT_MMIO; + return 0; + case EMULATE_FAIL: + vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); + break; + default: + BUG(); + } + + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + return 0; +} + +static int io_get_override(struct kvm_vcpu *vcpu, + struct vmcb_seg **seg, + int *addr_override) +{ + u8 inst[MAX_INST_SIZE]; + unsigned ins_length; + gva_t rip; + int i; + + rip = vcpu->svm->vmcb->save.rip; + ins_length = vcpu->svm->next_rip - rip; + rip += vcpu->svm->vmcb->save.cs.base; + + if (ins_length > MAX_INST_SIZE) + printk(KERN_DEBUG + "%s: inst length err, cs base 0x%llx rip 0x%llx " + "next rip 0x%llx ins_length %u\n", + __FUNCTION__, + vcpu->svm->vmcb->save.cs.base, + vcpu->svm->vmcb->save.rip, + vcpu->svm->vmcb->control.exit_info_2, + ins_length); + + if (kvm_read_guest(vcpu, rip, ins_length, inst) != ins_length) + /* #PF */ + return 0; + + *addr_override = 0; + *seg = 0; + for (i = 0; i < ins_length; i++) + switch (inst[i]) { + case 0xf0: + case 0xf2: + case 0xf3: + case 0x66: + continue; + case 0x67: + *addr_override = 1; + continue; + case 0x2e: + *seg = &vcpu->svm->vmcb->save.cs; + continue; + case 0x36: + *seg = &vcpu->svm->vmcb->save.ss; + continue; + case 0x3e: + *seg = &vcpu->svm->vmcb->save.ds; + continue; + case 0x26: + *seg = &vcpu->svm->vmcb->save.es; + continue; + case 0x64: + *seg = &vcpu->svm->vmcb->save.fs; + continue; + case 0x65: + *seg = &vcpu->svm->vmcb->save.gs; + continue; + default: + return 1; + } + printk(KERN_DEBUG "%s: unexpected\n", __FUNCTION__); + return 0; +} + +static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, u64 *address) +{ + unsigned long addr_mask; + unsigned long *reg; + struct vmcb_seg *seg; + int addr_override; + struct vmcb_save_area *save_area = &vcpu->svm->vmcb->save; + u16 cs_attrib = save_area->cs.attrib; + unsigned addr_size = get_addr_size(vcpu); + + if (!io_get_override(vcpu, &seg, &addr_override)) + return 0; + + if (addr_override) + addr_size = (addr_size == 2) ? 4: (addr_size >> 1); + + if (ins) { + reg = &vcpu->regs[VCPU_REGS_RDI]; + seg = &vcpu->svm->vmcb->save.es; + } else { + reg = &vcpu->regs[VCPU_REGS_RSI]; + seg = (seg) ? seg : &vcpu->svm->vmcb->save.ds; + } + + addr_mask = ~0ULL >> (64 - (addr_size * 8)); + + if ((cs_attrib & SVM_SELECTOR_L_MASK) && + !(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_VM)) { + *address = (*reg & addr_mask); + return addr_mask; + } + + if (!(seg->attrib & SVM_SELECTOR_P_SHIFT)) { + svm_inject_gp(vcpu, 0); + return 0; + } + + *address = (*reg & addr_mask) + seg->base; + return addr_mask; +} + +static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 io_info = vcpu->svm->vmcb->control.exit_info_1; //address size bug? + int _in = io_info & SVM_IOIO_TYPE_MASK; + + ++kvm_stat.io_exits; + + vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2; + + kvm_run->exit_reason = KVM_EXIT_IO; + kvm_run->io.port = io_info >> 16; + kvm_run->io.direction = (_in) ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + kvm_run->io.size = ((io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT); + kvm_run->io.string = (io_info & SVM_IOIO_STR_MASK) != 0; + kvm_run->io.rep = (io_info & SVM_IOIO_REP_MASK) != 0; + + if (kvm_run->io.string) { + unsigned addr_mask; + + addr_mask = io_adress(vcpu, _in, &kvm_run->io.address); + if (!addr_mask) { + printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__); + return 1; + } + + if (kvm_run->io.rep) { + kvm_run->io.count = vcpu->regs[VCPU_REGS_RCX] & addr_mask; + kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags + & X86_EFLAGS_DF) != 0; + } + } else { + kvm_run->io.value = vcpu->svm->vmcb->save.rax; + } + return 0; +} + + +static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + return 1; +} + +static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; + skip_emulated_instruction(vcpu); + if (vcpu->irq_summary && (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)) + return 1; + + kvm_run->exit_reason = KVM_EXIT_HLT; + return 0; +} + +static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + inject_ud(vcpu); + return 1; +} + +static int task_switch_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + printk(KERN_DEBUG "%s: task swiche is unsupported\n", __FUNCTION__); + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + return 0; +} + +static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; + kvm_run->exit_reason = KVM_EXIT_CPUID; + return 0; +} + +static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + if (emulate_instruction(vcpu, 0, 0, 0) != EMULATE_DONE) + printk(KERN_ERR "%s: failed\n", __FUNCTION__); + return 1; +} + +static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) +{ + switch (ecx) { + case MSR_IA32_MC0_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MC0_MISC: + case MSR_IA32_MC0_MISC+4: + case MSR_IA32_MC0_MISC+8: + case MSR_IA32_MC0_MISC+12: + case MSR_IA32_MC0_MISC+16: + case MSR_IA32_UCODE_REV: + /* MTRR registers */ + case 0xfe: + case 0x200 ... 0x2ff: + *data = 0; + break; + case MSR_IA32_TIME_STAMP_COUNTER: { + u64 tsc; + + rdtscll(tsc); + *data = vcpu->svm->vmcb->control.tsc_offset + tsc; + break; + } + case MSR_EFER: + *data = vcpu->shadow_efer; + break; + case MSR_IA32_APICBASE: + *data = vcpu->apic_base; + break; +#ifdef __x86_64__ + case MSR_STAR: + *data = vcpu->svm->vmcb->save.star; + break; + case MSR_LSTAR: + *data = vcpu->svm->vmcb->save.lstar; + break; + case MSR_CSTAR: + *data = vcpu->svm->vmcb->save.cstar; + break; + case MSR_KERNEL_GS_BASE: + *data = vcpu->svm->vmcb->save.kernel_gs_base; + break; + case MSR_SYSCALL_MASK: + *data = vcpu->svm->vmcb->save.sfmask; + break; +#endif + case MSR_IA32_SYSENTER_CS: + *data = vcpu->svm->vmcb->save.sysenter_cs; + break; + case MSR_IA32_SYSENTER_EIP: + *data = vcpu->svm->vmcb->save.sysenter_eip; + break; + case MSR_IA32_SYSENTER_ESP: + *data = vcpu->svm->vmcb->save.sysenter_esp; + break; + default: + printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", ecx); + return 1; + } + return 0; +} + +static int rdmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u64 data; + + if (svm_get_msr(vcpu, ecx, &data)) + svm_inject_gp(vcpu, 0); + else { + vcpu->svm->vmcb->save.rax = data & 0xffffffff; + vcpu->regs[VCPU_REGS_RDX] = data >> 32; + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; + skip_emulated_instruction(vcpu); + } + return 1; +} + +static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +{ + switch (ecx) { +#ifdef __x86_64__ + case MSR_EFER: + set_efer(vcpu, data); + break; +#endif + case MSR_IA32_MC0_STATUS: + printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n" + , __FUNCTION__, data); + break; + case MSR_IA32_TIME_STAMP_COUNTER: { + u64 tsc; + + rdtscll(tsc); + vcpu->svm->vmcb->control.tsc_offset = data - tsc; + break; + } + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case 0x200 ... 0x2ff: /* MTRRs */ + break; + case MSR_IA32_APICBASE: + vcpu->apic_base = data; + break; +#ifdef __x86_64___ + case MSR_STAR: + vcpu->svm->vmcb->save.star = data; + break; + case MSR_LSTAR: + vcpu->svm->vmcb->save.lstar = data; + break; + case MSR_CSTAR: + vcpu->svm->vmcb->save.cstar = data; + break; + case MSR_KERNEL_GS_BASE: + vcpu->svm->vmcb->save.kernel_gs_base = data; + break; + case MSR_SYSCALL_MASK: + vcpu->svm->vmcb->save.sfmask = data; + break; +#endif + case MSR_IA32_SYSENTER_CS: + vcpu->svm->vmcb->save.sysenter_cs = data; + break; + case MSR_IA32_SYSENTER_EIP: + vcpu->svm->vmcb->save.sysenter_eip = data; + break; + case MSR_IA32_SYSENTER_ESP: + vcpu->svm->vmcb->save.sysenter_esp = data; + break; + default: + printk(KERN_ERR "kvm: unhandled wrmsr: %x\n", ecx); + return 1; + } + return 0; +} + +static int wrmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u64 data = (vcpu->svm->vmcb->save.rax & -1u) + | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; + if (svm_set_msr(vcpu, ecx, data)) + svm_inject_gp(vcpu, 0); + else + skip_emulated_instruction(vcpu); + return 1; +} + +static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + if (vcpu->svm->vmcb->control.exit_info_1) + return wrmsr_interception(vcpu, kvm_run); + else + return rdmsr_interception(vcpu, kvm_run); +} + +static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) = { + [SVM_EXIT_READ_CR0] = emulate_on_interception, + [SVM_EXIT_READ_CR3] = emulate_on_interception, + [SVM_EXIT_READ_CR4] = emulate_on_interception, + /* for now: */ + [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR3] = emulate_on_interception, + [SVM_EXIT_WRITE_CR4] = emulate_on_interception, + [SVM_EXIT_READ_DR0] = emulate_on_interception, + [SVM_EXIT_READ_DR1] = emulate_on_interception, + [SVM_EXIT_READ_DR2] = emulate_on_interception, + [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR0] = emulate_on_interception, + [SVM_EXIT_WRITE_DR1] = emulate_on_interception, + [SVM_EXIT_WRITE_DR2] = emulate_on_interception, + [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR7] = emulate_on_interception, + [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, + [SVM_EXIT_INTR] = nop_on_interception, + [SVM_EXIT_NMI] = nop_on_interception, + [SVM_EXIT_SMI] = nop_on_interception, + [SVM_EXIT_INIT] = nop_on_interception, + /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ + [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_HLT] = halt_interception, + [SVM_EXIT_INVLPG] = emulate_on_interception, + [SVM_EXIT_INVLPGA] = invalid_op_interception, + [SVM_EXIT_IOIO] = io_interception, + [SVM_EXIT_MSR] = msr_interception, + [SVM_EXIT_TASK_SWITCH] = task_switch_interception, + [SVM_EXIT_VMRUN] = invalid_op_interception, + [SVM_EXIT_VMMCALL] = invalid_op_interception, + [SVM_EXIT_VMLOAD] = invalid_op_interception, + [SVM_EXIT_VMSAVE] = invalid_op_interception, + [SVM_EXIT_STGI] = invalid_op_interception, + [SVM_EXIT_CLGI] = invalid_op_interception, + [SVM_EXIT_SKINIT] = invalid_op_interception, +}; + + +static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 exit_code = vcpu->svm->vmcb->control.exit_code; + + kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; + + if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) && + exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) + printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " + "exit_code 0x%x\n", + __FUNCTION__, vcpu->svm->vmcb->control.exit_int_info, + exit_code); + + if (exit_code >= sizeof(svm_exit_handlers) / sizeof(*svm_exit_handlers) + || svm_exit_handlers[exit_code] == 0) { + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + printk(KERN_ERR "%s: 0x%x @ 0x%llx cr0 0x%lx rflags 0x%llx\n", + __FUNCTION__, + exit_code, + vcpu->svm->vmcb->save.rip, + vcpu->cr0, + vcpu->svm->vmcb->save.rflags); + return 0; + } + + return svm_exit_handlers[exit_code](vcpu, kvm_run); +} + +static void reload_tss(struct kvm_vcpu *vcpu) +{ + int cpu = raw_smp_processor_id(); + + struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); + svm_data->tss_desc->type = 9; //available 32/64-bit TSS + load_TR_desc(); +} + +static void pre_svm_run(struct kvm_vcpu *vcpu) +{ + int cpu = raw_smp_processor_id(); + + struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); + + vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + if (vcpu->cpu != cpu || + vcpu->svm->asid_generation != svm_data->asid_generation) + new_asid(vcpu, svm_data); +} + + +static inline void kvm_try_inject_irq(struct kvm_vcpu *vcpu) +{ + struct vmcb_control_area *control; + + if (!vcpu->irq_summary) + return; + + control = &vcpu->svm->vmcb->control; + + control->int_vector = pop_irq(vcpu); + control->int_ctl &= ~V_INTR_PRIO_MASK; + control->int_ctl |= V_IRQ_MASK | + ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); +} + +static void kvm_reput_irq(struct kvm_vcpu *vcpu) +{ + struct vmcb_control_area *control = &vcpu->svm->vmcb->control; + + if (control->int_ctl & V_IRQ_MASK) { + control->int_ctl &= ~V_IRQ_MASK; + push_irq(vcpu, control->int_vector); + } +} + +static void save_db_regs(unsigned long *db_regs) +{ +#ifdef __x86_64__ + asm ("mov %%dr0, %%rax \n\t" + "mov %%rax, %[dr0] \n\t" + "mov %%dr1, %%rax \n\t" + "mov %%rax, %[dr1] \n\t" + "mov %%dr2, %%rax \n\t" + "mov %%rax, %[dr2] \n\t" + "mov %%dr3, %%rax \n\t" + "mov %%rax, %[dr3] \n\t" + : [dr0] "=m"(db_regs[0]), + [dr1] "=m"(db_regs[1]), + [dr2] "=m"(db_regs[2]), + [dr3] "=m"(db_regs[3]) + : : "rax"); +#else + asm ("mov %%dr0, %%eax \n\t" + "mov %%eax, %[dr0] \n\t" + "mov %%dr1, %%eax \n\t" + "mov %%eax, %[dr1] \n\t" + "mov %%dr2, %%eax \n\t" + "mov %%eax, %[dr2] \n\t" + "mov %%dr3, %%eax \n\t" + "mov %%eax, %[dr3] \n\t" + : [dr0] "=m"(db_regs[0]), + [dr1] "=m"(db_regs[1]), + [dr2] "=m"(db_regs[2]), + [dr3] "=m"(db_regs[3]) + : : "eax"); +#endif +} + +static void load_db_regs(unsigned long *db_regs) +{ + asm volatile ("mov %[dr0], %%dr0 \n\t" + "mov %[dr1], %%dr1 \n\t" + "mov %[dr2], %%dr2 \n\t" + "mov %[dr3], %%dr3 \n\t" + : + : [dr0] "r"(db_regs[0]), + [dr1] "r"(db_regs[1]), + [dr2] "r"(db_regs[2]), + [dr3] "r"(db_regs[3]) +#ifdef __x86_64__ + : "rax"); +#else + : "eax"); +#endif +} + +static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u16 fs_selector; + u16 gs_selector; + u16 ldt_selector; + +again: + kvm_try_inject_irq(vcpu); + + clgi(); + + pre_svm_run(vcpu); + + save_host_msrs(vcpu); + fs_selector = read_fs(); + gs_selector = read_gs(); + ldt_selector = read_ldt(); + vcpu->svm->host_cr2 = kvm_read_cr2(); + vcpu->svm->host_dr6 = read_dr6(); + vcpu->svm->host_dr7 = read_dr7(); + vcpu->svm->vmcb->save.cr2 = vcpu->cr2; + + if (vcpu->svm->vmcb->save.dr7 & 0xff) { + write_dr7(0); + save_db_regs(vcpu->svm->host_db_regs); + load_db_regs(vcpu->svm->db_regs); + } + asm volatile ( +#ifdef __x86_64__ + "push %%rbx; push %%rcx; push %%rdx;" + "push %%rsi; push %%rdi; push %%rbp;" + "push %%r8; push %%r9; push %%r10; push %%r11;" + "push %%r12; push %%r13; push %%r14; push %%r15;" +#else + "push %%ebx; push %%ecx; push %%edx;" + "push %%esi; push %%edi; push %%ebp;" +#endif + +#ifdef __x86_64__ + "mov %c[rbx](%[vcpu]), %%rbx \n\t" + "mov %c[rcx](%[vcpu]), %%rcx \n\t" + "mov %c[rdx](%[vcpu]), %%rdx \n\t" + "mov %c[rsi](%[vcpu]), %%rsi \n\t" + "mov %c[rdi](%[vcpu]), %%rdi \n\t" + "mov %c[rbp](%[vcpu]), %%rbp \n\t" + "mov %c[r8](%[vcpu]), %%r8 \n\t" + "mov %c[r9](%[vcpu]), %%r9 \n\t" + "mov %c[r10](%[vcpu]), %%r10 \n\t" + "mov %c[r11](%[vcpu]), %%r11 \n\t" + "mov %c[r12](%[vcpu]), %%r12 \n\t" + "mov %c[r13](%[vcpu]), %%r13 \n\t" + "mov %c[r14](%[vcpu]), %%r14 \n\t" + "mov %c[r15](%[vcpu]), %%r15 \n\t" +#else + "mov %c[rbx](%[vcpu]), %%ebx \n\t" + "mov %c[rcx](%[vcpu]), %%ecx \n\t" + "mov %c[rdx](%[vcpu]), %%edx \n\t" + "mov %c[rsi](%[vcpu]), %%esi \n\t" + "mov %c[rdi](%[vcpu]), %%edi \n\t" + "mov %c[rbp](%[vcpu]), %%ebp \n\t" +#endif + +#ifdef __x86_64__ + /* Enter guest mode */ + "push %%rax \n\t" + "mov %c[svm](%[vcpu]), %%rax \n\t" + "mov %c[vmcb](%%rax), %%rax \n\t" + SVM_VMLOAD "\n\t" + SVM_VMRUN "\n\t" + SVM_VMSAVE "\n\t" + "pop %%rax \n\t" +#else + /* Enter guest mode */ + "push %%eax \n\t" + "mov %c[svm](%[vcpu]), %%eax \n\t" + "mov %c[vmcb](%%eax), %%eax \n\t" + SVM_VMLOAD "\n\t" + SVM_VMRUN "\n\t" + SVM_VMSAVE "\n\t" + "pop %%eax \n\t" +#endif + + /* Save guest registers, load host registers */ +#ifdef __x86_64__ + "mov %%rbx, %c[rbx](%[vcpu]) \n\t" + "mov %%rcx, %c[rcx](%[vcpu]) \n\t" + "mov %%rdx, %c[rdx](%[vcpu]) \n\t" + "mov %%rsi, %c[rsi](%[vcpu]) \n\t" + "mov %%rdi, %c[rdi](%[vcpu]) \n\t" + "mov %%rbp, %c[rbp](%[vcpu]) \n\t" + "mov %%r8, %c[r8](%[vcpu]) \n\t" + "mov %%r9, %c[r9](%[vcpu]) \n\t" + "mov %%r10, %c[r10](%[vcpu]) \n\t" + "mov %%r11, %c[r11](%[vcpu]) \n\t" + "mov %%r12, %c[r12](%[vcpu]) \n\t" + "mov %%r13, %c[r13](%[vcpu]) \n\t" + "mov %%r14, %c[r14](%[vcpu]) \n\t" + "mov %%r15, %c[r15](%[vcpu]) \n\t" + + "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" + "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" + "pop %%rbp; pop %%rdi; pop %%rsi;" + "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" +#else + "mov %%ebx, %c[rbx](%[vcpu]) \n\t" + "mov %%ecx, %c[rcx](%[vcpu]) \n\t" + "mov %%edx, %c[rdx](%[vcpu]) \n\t" + "mov %%esi, %c[rsi](%[vcpu]) \n\t" + "mov %%edi, %c[rdi](%[vcpu]) \n\t" + "mov %%ebp, %c[rbp](%[vcpu]) \n\t" + + "pop %%ebp; pop %%edi; pop %%esi;" + "pop %%edx; pop %%ecx; pop %%ebx; \n\t" +#endif + : + : [vcpu]"a"(vcpu), + [svm]"i"(offsetof(struct kvm_vcpu, svm)), + [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), + [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])) +#ifdef __x86_64__ + ,[r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), + [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), + [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])) +#endif + : "cc", "memory" ); + + if ((vcpu->svm->vmcb->save.dr7 & 0xff)) + load_db_regs(vcpu->svm->host_db_regs); + + vcpu->cr2 = vcpu->svm->vmcb->save.cr2; + + write_dr6(vcpu->svm->host_dr6); + write_dr7(vcpu->svm->host_dr7); + kvm_write_cr2(vcpu->svm->host_cr2); + + load_fs(fs_selector); + load_gs(gs_selector); + load_ldt(ldt_selector); + load_host_msrs(vcpu); + + reload_tss(vcpu); + + stgi(); + + kvm_reput_irq(vcpu); + + vcpu->svm->next_rip = 0; + + if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { + kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; + kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; + return 0; + } + + if (handle_exit(vcpu, kvm_run)) { + if (signal_pending(current)) { + ++kvm_stat.signal_exits; + return -EINTR; + } + kvm_resched(vcpu); + goto again; + } + return 0; +} + +static void svm_flush_tlb(struct kvm_vcpu *vcpu) +{ + force_new_asid(vcpu); +} + +static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) +{ + vcpu->svm->vmcb->save.cr3 = root; + force_new_asid(vcpu); +} + +static void svm_inject_page_fault(struct kvm_vcpu *vcpu, + unsigned long addr, + uint32_t err_code) +{ + uint32_t exit_int_info = vcpu->svm->vmcb->control.exit_int_info; + + ++kvm_stat.pf_guest; + + if (is_page_fault(exit_int_info)) { + + vcpu->svm->vmcb->control.event_inj_err = 0; + vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + DF_VECTOR; + return; + } + vcpu->cr2 = addr; + vcpu->svm->vmcb->save.cr2 = addr; + vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + PF_VECTOR; + vcpu->svm->vmcb->control.event_inj_err = err_code; +} + + +static int is_disabled(void) +{ + return 0; +} + +static struct kvm_arch_ops svm_arch_ops = { + .cpu_has_kvm_support = has_svm, + .disabled_by_bios = is_disabled, + .hardware_setup = svm_hardware_setup, + .hardware_unsetup = svm_hardware_unsetup, + .hardware_enable = svm_hardware_enable, + .hardware_disable = svm_hardware_disable, + + .vcpu_create = svm_create_vcpu, + .vcpu_free = svm_free_vcpu, + + .vcpu_load = svm_vcpu_load, + .vcpu_put = svm_vcpu_put, + + .set_guest_debug = svm_guest_debug, + .get_msr = svm_get_msr, + .set_msr = svm_set_msr, + .get_segment_base = svm_get_segment_base, + .get_segment = svm_get_segment, + .set_segment = svm_set_segment, + .is_long_mode = svm_is_long_mode, + .get_cs_db_l_bits = svm_get_cs_db_l_bits, + .set_cr0 = svm_set_cr0, + .set_cr0_no_modeswitch = svm_set_cr0, + .set_cr3 = svm_set_cr3, + .set_cr4 = svm_set_cr4, + .set_efer = svm_set_efer, + .get_idt = svm_get_idt, + .set_idt = svm_set_idt, + .get_gdt = svm_get_gdt, + .set_gdt = svm_set_gdt, + .get_dr = svm_get_dr, + .set_dr = svm_set_dr, + .cache_regs = svm_cache_regs, + .decache_regs = svm_decache_regs, + .get_rflags = svm_get_rflags, + .set_rflags = svm_set_rflags, + + .invlpg = svm_invlpg, + .tlb_flush = svm_flush_tlb, + .inject_page_fault = svm_inject_page_fault, + + .inject_gp = svm_inject_gp, + + .run = svm_vcpu_run, + .skip_emulated_instruction = skip_emulated_instruction, + .vcpu_setup = svm_vcpu_setup, +}; + +static int __init svm_init(void) +{ + kvm_emulator_want_group7_invlpg(); + kvm_init_arch(&svm_arch_ops, THIS_MODULE); + return 0; +} + +static void __exit svm_exit(void) +{ + kvm_exit_arch(); +} + +module_init(svm_init) +module_exit(svm_exit) diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h new file mode 100644 index 00000000000..df731c3fb58 --- /dev/null +++ b/drivers/kvm/svm.h @@ -0,0 +1,315 @@ +#ifndef __SVM_H +#define __SVM_H + +enum { + INTERCEPT_INTR, + INTERCEPT_NMI, + INTERCEPT_SMI, + INTERCEPT_INIT, + INTERCEPT_VINTR, + INTERCEPT_SELECTIVE_CR0, + INTERCEPT_STORE_IDTR, + INTERCEPT_STORE_GDTR, + INTERCEPT_STORE_LDTR, + INTERCEPT_STORE_TR, + INTERCEPT_LOAD_IDTR, + INTERCEPT_LOAD_GDTR, + INTERCEPT_LOAD_LDTR, + INTERCEPT_LOAD_TR, + INTERCEPT_RDTSC, + INTERCEPT_RDPMC, + INTERCEPT_PUSHF, + INTERCEPT_POPF, + INTERCEPT_CPUID, + INTERCEPT_RSM, + INTERCEPT_IRET, + INTERCEPT_INTn, + INTERCEPT_INVD, + INTERCEPT_PAUSE, + INTERCEPT_HLT, + INTERCEPT_INVLPG, + INTERCEPT_INVLPGA, + INTERCEPT_IOIO_PROT, + INTERCEPT_MSR_PROT, + INTERCEPT_TASK_SWITCH, + INTERCEPT_FERR_FREEZE, + INTERCEPT_SHUTDOWN, + INTERCEPT_VMRUN, + INTERCEPT_VMMCALL, + INTERCEPT_VMLOAD, + INTERCEPT_VMSAVE, + INTERCEPT_STGI, + INTERCEPT_CLGI, + INTERCEPT_SKINIT, + INTERCEPT_RDTSCP, + INTERCEPT_ICEBP, + INTERCEPT_WBINVD, +}; + + +struct __attribute__ ((__packed__)) vmcb_control_area { + u16 intercept_cr_read; + u16 intercept_cr_write; + u16 intercept_dr_read; + u16 intercept_dr_write; + u32 intercept_exceptions; + u64 intercept; + u8 reserved_1[44]; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u8 reserved_2[3]; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u8 reserved_3[4]; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u8 reserved_4[16]; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 lbr_ctl; + u8 reserved_5[832]; +}; + + +#define TLB_CONTROL_DO_NOTHING 0 +#define TLB_CONTROL_FLUSH_ALL_ASID 1 + +#define V_TPR_MASK 0x0f + +#define V_IRQ_SHIFT 8 +#define V_IRQ_MASK (1 << V_IRQ_SHIFT) + +#define V_INTR_PRIO_SHIFT 16 +#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) + +#define V_IGN_TPR_SHIFT 20 +#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) + +#define V_INTR_MASKING_SHIFT 24 +#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) + +#define SVM_INTERRUPT_SHADOW_MASK 1 + +#define SVM_IOIO_STR_SHIFT 2 +#define SVM_IOIO_REP_SHIFT 3 +#define SVM_IOIO_SIZE_SHIFT 4 +#define SVM_IOIO_ASIZE_SHIFT 7 + +#define SVM_IOIO_TYPE_MASK 1 +#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) +#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) +#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) +#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) + +struct __attribute__ ((__packed__)) vmcb_seg { + u16 selector; + u16 attrib; + u32 limit; + u64 base; +}; + +struct __attribute__ ((__packed__)) vmcb_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + u8 reserved_1[43]; + u8 cpl; + u8 reserved_2[4]; + u64 efer; + u8 reserved_3[112]; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u8 reserved_4[88]; + u64 rsp; + u8 reserved_5[24]; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_6[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; +}; + +struct __attribute__ ((__packed__)) vmcb { + struct vmcb_control_area control; + struct vmcb_save_area save; +}; + +#define SVM_CPUID_FEATURE_SHIFT 2 +#define SVM_CPUID_FUNC 0x8000000a + +#define MSR_EFER_SVME_MASK (1ULL << 12) +#define MSR_VM_HSAVE_PA 0xc0010117ULL + +#define SVM_SELECTOR_S_SHIFT 4 +#define SVM_SELECTOR_DPL_SHIFT 5 +#define SVM_SELECTOR_P_SHIFT 7 +#define SVM_SELECTOR_AVL_SHIFT 8 +#define SVM_SELECTOR_L_SHIFT 9 +#define SVM_SELECTOR_DB_SHIFT 10 +#define SVM_SELECTOR_G_SHIFT 11 + +#define SVM_SELECTOR_TYPE_MASK (0xf) +#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) +#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) +#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) +#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) +#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) +#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) +#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) + +#define SVM_SELECTOR_WRITE_MASK (1 << 1) +#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK +#define SVM_SELECTOR_CODE_MASK (1 << 3) + +#define INTERCEPT_CR0_MASK 1 +#define INTERCEPT_CR3_MASK (1 << 3) +#define INTERCEPT_CR4_MASK (1 << 4) + +#define INTERCEPT_DR0_MASK 1 +#define INTERCEPT_DR1_MASK (1 << 1) +#define INTERCEPT_DR2_MASK (1 << 2) +#define INTERCEPT_DR3_MASK (1 << 3) +#define INTERCEPT_DR4_MASK (1 << 4) +#define INTERCEPT_DR5_MASK (1 << 5) +#define INTERCEPT_DR6_MASK (1 << 6) +#define INTERCEPT_DR7_MASK (1 << 7) + +#define SVM_EVTINJ_VEC_MASK 0xff + +#define SVM_EVTINJ_TYPE_SHIFT 8 +#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_VALID (1 << 31) +#define SVM_EVTINJ_VALID_ERR (1 << 11) + +#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK + +#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR +#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI +#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT +#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT + +#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID +#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR + +#define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR3 0x003 +#define SVM_EXIT_READ_CR4 0x004 +#define SVM_EXIT_READ_CR8 0x008 +#define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR3 0x013 +#define SVM_EXIT_WRITE_CR4 0x014 +#define SVM_EXIT_WRITE_CR8 0x018 +#define SVM_EXIT_READ_DR0 0x020 +#define SVM_EXIT_READ_DR1 0x021 +#define SVM_EXIT_READ_DR2 0x022 +#define SVM_EXIT_READ_DR3 0x023 +#define SVM_EXIT_READ_DR4 0x024 +#define SVM_EXIT_READ_DR5 0x025 +#define SVM_EXIT_READ_DR6 0x026 +#define SVM_EXIT_READ_DR7 0x027 +#define SVM_EXIT_WRITE_DR0 0x030 +#define SVM_EXIT_WRITE_DR1 0x031 +#define SVM_EXIT_WRITE_DR2 0x032 +#define SVM_EXIT_WRITE_DR3 0x033 +#define SVM_EXIT_WRITE_DR4 0x034 +#define SVM_EXIT_WRITE_DR5 0x035 +#define SVM_EXIT_WRITE_DR6 0x036 +#define SVM_EXIT_WRITE_DR7 0x037 +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_INTR 0x060 +#define SVM_EXIT_NMI 0x061 +#define SVM_EXIT_SMI 0x062 +#define SVM_EXIT_INIT 0x063 +#define SVM_EXIT_VINTR 0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ 0x066 +#define SVM_EXIT_GDTR_READ 0x067 +#define SVM_EXIT_LDTR_READ 0x068 +#define SVM_EXIT_TR_READ 0x069 +#define SVM_EXIT_IDTR_WRITE 0x06a +#define SVM_EXIT_GDTR_WRITE 0x06b +#define SVM_EXIT_LDTR_WRITE 0x06c +#define SVM_EXIT_TR_WRITE 0x06d +#define SVM_EXIT_RDTSC 0x06e +#define SVM_EXIT_RDPMC 0x06f +#define SVM_EXIT_PUSHF 0x070 +#define SVM_EXIT_POPF 0x071 +#define SVM_EXIT_CPUID 0x072 +#define SVM_EXIT_RSM 0x073 +#define SVM_EXIT_IRET 0x074 +#define SVM_EXIT_SWINT 0x075 +#define SVM_EXIT_INVD 0x076 +#define SVM_EXIT_PAUSE 0x077 +#define SVM_EXIT_HLT 0x078 +#define SVM_EXIT_INVLPG 0x079 +#define SVM_EXIT_INVLPGA 0x07a +#define SVM_EXIT_IOIO 0x07b +#define SVM_EXIT_MSR 0x07c +#define SVM_EXIT_TASK_SWITCH 0x07d +#define SVM_EXIT_FERR_FREEZE 0x07e +#define SVM_EXIT_SHUTDOWN 0x07f +#define SVM_EXIT_VMRUN 0x080 +#define SVM_EXIT_VMMCALL 0x081 +#define SVM_EXIT_VMLOAD 0x082 +#define SVM_EXIT_VMSAVE 0x083 +#define SVM_EXIT_STGI 0x084 +#define SVM_EXIT_CLGI 0x085 +#define SVM_EXIT_SKINIT 0x086 +#define SVM_EXIT_RDTSCP 0x087 +#define SVM_EXIT_ICEBP 0x088 +#define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_NPF 0x400 + +#define SVM_EXIT_ERR -1 + +#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP + +#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" +#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" +#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" +#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" +#define SVM_STGI ".byte 0x0f, 0x01, 0xdc" +#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" + +#endif + diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c new file mode 100644 index 00000000000..bda7a7ae216 --- /dev/null +++ b/drivers/kvm/vmx.c @@ -0,0 +1,2002 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "kvm.h" +#include "vmx.h" +#include "kvm_vmx.h" +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <asm/io.h> + +#include "segment_descriptor.h" + +#define MSR_IA32_FEATURE_CONTROL 0x03a + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +static DEFINE_PER_CPU(struct vmcs *, vmxarea); +static DEFINE_PER_CPU(struct vmcs *, current_vmcs); + +#ifdef __x86_64__ +#define HOST_IS_64 1 +#else +#define HOST_IS_64 0 +#endif + +static struct vmcs_descriptor { + int size; + int order; + u32 revision_id; +} vmcs_descriptor; + +#define VMX_SEGMENT_FIELD(seg) \ + [VCPU_SREG_##seg] = { \ + .selector = GUEST_##seg##_SELECTOR, \ + .base = GUEST_##seg##_BASE, \ + .limit = GUEST_##seg##_LIMIT, \ + .ar_bytes = GUEST_##seg##_AR_BYTES, \ + } + +static struct kvm_vmx_segment_field { + unsigned selector; + unsigned base; + unsigned limit; + unsigned ar_bytes; +} kvm_vmx_segment_fields[] = { + VMX_SEGMENT_FIELD(CS), + VMX_SEGMENT_FIELD(DS), + VMX_SEGMENT_FIELD(ES), + VMX_SEGMENT_FIELD(FS), + VMX_SEGMENT_FIELD(GS), + VMX_SEGMENT_FIELD(SS), + VMX_SEGMENT_FIELD(TR), + VMX_SEGMENT_FIELD(LDTR), +}; + +static const u32 vmx_msr_index[] = { +#ifdef __x86_64__ + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, +#endif + MSR_EFER, MSR_K6_STAR, +}; +#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index)) + +struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr); + +static inline int is_page_fault(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline int is_external_interrupt(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); +} + +static void vmcs_clear(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", + vmcs, phys_addr); +} + +static void __vcpu_clear(void *arg) +{ + struct kvm_vcpu *vcpu = arg; + int cpu = smp_processor_id(); + + if (vcpu->cpu == cpu) + vmcs_clear(vcpu->vmcs); + if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) + per_cpu(current_vmcs, cpu) = NULL; +} + +static unsigned long vmcs_readl(unsigned long field) +{ + unsigned long value; + + asm volatile (ASM_VMX_VMREAD_RDX_RAX + : "=a"(value) : "d"(field) : "cc"); + return value; +} + +static u16 vmcs_read16(unsigned long field) +{ + return vmcs_readl(field); +} + +static u32 vmcs_read32(unsigned long field) +{ + return vmcs_readl(field); +} + +static u64 vmcs_read64(unsigned long field) +{ +#ifdef __x86_64__ + return vmcs_readl(field); +#else + return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); +#endif +} + +static void vmcs_writel(unsigned long field, unsigned long value) +{ + u8 error; + + asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" + : "=q"(error) : "a"(value), "d"(field) : "cc" ); + if (error) + printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); +} + +static void vmcs_write16(unsigned long field, u16 value) +{ + vmcs_writel(field, value); +} + +static void vmcs_write32(unsigned long field, u32 value) +{ + vmcs_writel(field, value); +} + +static void vmcs_write64(unsigned long field, u64 value) +{ +#ifdef __x86_64__ + vmcs_writel(field, value); +#else + vmcs_writel(field, value); + asm volatile (""); + vmcs_writel(field+1, value >> 32); +#endif +} + +/* + * Switches to specified vcpu, until a matching vcpu_put(), but assumes + * vcpu mutex is already taken. + */ +static struct kvm_vcpu *vmx_vcpu_load(struct kvm_vcpu *vcpu) +{ + u64 phys_addr = __pa(vcpu->vmcs); + int cpu; + + cpu = get_cpu(); + + if (vcpu->cpu != cpu) { + smp_call_function(__vcpu_clear, vcpu, 0, 1); + vcpu->launched = 0; + } + + if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) { + u8 error; + + per_cpu(current_vmcs, cpu) = vcpu->vmcs; + asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vcpu->vmcs, phys_addr); + } + + if (vcpu->cpu != cpu) { + struct descriptor_table dt; + unsigned long sysenter_esp; + + vcpu->cpu = cpu; + /* + * Linux uses per-cpu TSS and GDT, so set these when switching + * processors. + */ + vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ + get_gdt(&dt); + vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ + + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); + vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + } + return vcpu; +} + +static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +{ + put_cpu(); +} + +static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) +{ + return vmcs_readl(GUEST_RFLAGS); +} + +static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + vmcs_writel(GUEST_RFLAGS, rflags); +} + +static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rip; + u32 interruptibility; + + rip = vmcs_readl(GUEST_RIP); + rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_writel(GUEST_RIP, rip); + + /* + * We emulated an instruction, so temporary interrupt blocking + * should be removed, if set. + */ + interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + if (interruptibility & 3) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + interruptibility & ~3); +} + +static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) +{ + printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n", + vmcs_readl(GUEST_RIP)); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + GP_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); +} + +/* + * reads and returns guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset -- 21.3 + */ +static u64 guest_read_tsc(void) +{ + u64 host_tsc, tsc_offset; + + rdtscll(host_tsc); + tsc_offset = vmcs_read64(TSC_OFFSET); + return host_tsc + tsc_offset; +} + +/* + * writes 'guest_tsc' into guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + */ +static void guest_write_tsc(u64 guest_tsc) +{ + u64 host_tsc; + + rdtscll(host_tsc); + vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); +} + +static void reload_tss(void) +{ +#ifndef __x86_64__ + + /* + * VT restores TR but not its size. Useless. + */ + struct descriptor_table gdt; + struct segment_descriptor *descs; + + get_gdt(&gdt); + descs = (void *)gdt.base; + descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ + load_TR_desc(); +#endif +} + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + u64 data; + struct vmx_msr_entry *msr; + + if (!pdata) { + printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); + return -EINVAL; + } + + switch (msr_index) { +#ifdef __x86_64__ + case MSR_FS_BASE: + data = vmcs_readl(GUEST_FS_BASE); + break; + case MSR_GS_BASE: + data = vmcs_readl(GUEST_GS_BASE); + break; + case MSR_EFER: + data = vcpu->shadow_efer; + break; +#endif + case MSR_IA32_TIME_STAMP_COUNTER: + data = guest_read_tsc(); + break; + case MSR_IA32_SYSENTER_CS: + data = vmcs_read32(GUEST_SYSENTER_CS); + break; + case MSR_IA32_SYSENTER_EIP: + data = vmcs_read32(GUEST_SYSENTER_EIP); + break; + case MSR_IA32_SYSENTER_ESP: + data = vmcs_read32(GUEST_SYSENTER_ESP); + break; + case MSR_IA32_MC0_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MC0_MISC: + case MSR_IA32_MC0_MISC+4: + case MSR_IA32_MC0_MISC+8: + case MSR_IA32_MC0_MISC+12: + case MSR_IA32_MC0_MISC+16: + case MSR_IA32_UCODE_REV: + /* MTRR registers */ + case 0xfe: + case 0x200 ... 0x2ff: + data = 0; + break; + case MSR_IA32_APICBASE: + data = vcpu->apic_base; + break; + default: + msr = find_msr_entry(vcpu, msr_index); + if (!msr) { + printk(KERN_ERR "kvm: unhandled rdmsr: %x\n", msr_index); + return 1; + } + data = msr->data; + break; + } + + *pdata = data; + return 0; +} + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + struct vmx_msr_entry *msr; + switch (msr_index) { +#ifdef __x86_64__ + case MSR_FS_BASE: + vmcs_writel(GUEST_FS_BASE, data); + break; + case MSR_GS_BASE: + vmcs_writel(GUEST_GS_BASE, data); + break; +#endif + case MSR_IA32_SYSENTER_CS: + vmcs_write32(GUEST_SYSENTER_CS, data); + break; + case MSR_IA32_SYSENTER_EIP: + vmcs_write32(GUEST_SYSENTER_EIP, data); + break; + case MSR_IA32_SYSENTER_ESP: + vmcs_write32(GUEST_SYSENTER_ESP, data); + break; +#ifdef __x86_64 + case MSR_EFER: + set_efer(vcpu, data); + break; + case MSR_IA32_MC0_STATUS: + printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n" + , __FUNCTION__, data); + break; +#endif + case MSR_IA32_TIME_STAMP_COUNTER: { + guest_write_tsc(data); + break; + } + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case 0x200 ... 0x2ff: /* MTRRs */ + break; + case MSR_IA32_APICBASE: + vcpu->apic_base = data; + break; + default: + msr = find_msr_entry(vcpu, msr_index); + if (!msr) { + printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr_index); + return 1; + } + msr->data = data; + break; + } + + return 0; +} + +/* + * Sync the rsp and rip registers into the vcpu structure. This allows + * registers to be accessed by indexing vcpu->regs. + */ +static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) +{ + vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + vcpu->rip = vmcs_readl(GUEST_RIP); +} + +/* + * Syncs rsp and rip back into the vmcs. Should be called after possible + * modification. + */ +static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) +{ + vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); + vmcs_writel(GUEST_RIP, vcpu->rip); +} + +static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +{ + unsigned long dr7 = 0x400; + u32 exception_bitmap; + int old_singlestep; + + exception_bitmap = vmcs_read32(EXCEPTION_BITMAP); + old_singlestep = vcpu->guest_debug.singlestep; + + vcpu->guest_debug.enabled = dbg->enabled; + if (vcpu->guest_debug.enabled) { + int i; + + dr7 |= 0x200; /* exact */ + for (i = 0; i < 4; ++i) { + if (!dbg->breakpoints[i].enabled) + continue; + vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; + dr7 |= 2 << (i*2); /* global enable */ + dr7 |= 0 << (i*4+16); /* execution breakpoint */ + } + + exception_bitmap |= (1u << 1); /* Trap debug exceptions */ + + vcpu->guest_debug.singlestep = dbg->singlestep; + } else { + exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */ + vcpu->guest_debug.singlestep = 0; + } + + if (old_singlestep && !vcpu->guest_debug.singlestep) { + unsigned long flags; + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + vmcs_writel(GUEST_RFLAGS, flags); + } + + vmcs_write32(EXCEPTION_BITMAP, exception_bitmap); + vmcs_writel(GUEST_DR7, dr7); + + return 0; +} + +static __init int cpu_has_kvm_support(void) +{ + unsigned long ecx = cpuid_ecx(1); + return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ +} + +static __init int vmx_disabled_by_bios(void) +{ + u64 msr; + + rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + return (msr & 5) == 1; /* locked but not enabled */ +} + +static __init void hardware_enable(void *garbage) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + u64 old; + + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); + if ((old & 5) == 0) + /* enable and lock */ + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5); + write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */ + asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) + : "memory", "cc"); +} + +static void hardware_disable(void *garbage) +{ + asm volatile (ASM_VMX_VMXOFF : : : "cc"); +} + +static __init void setup_vmcs_descriptor(void) +{ + u32 vmx_msr_low, vmx_msr_high; + + rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high); + vmcs_descriptor.size = vmx_msr_high & 0x1fff; + vmcs_descriptor.order = get_order(vmcs_descriptor.size); + vmcs_descriptor.revision_id = vmx_msr_low; +}; + +static struct vmcs *alloc_vmcs_cpu(int cpu) +{ + int node = cpu_to_node(cpu); + struct page *pages; + struct vmcs *vmcs; + + pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order); + if (!pages) + return NULL; + vmcs = page_address(pages); + memset(vmcs, 0, vmcs_descriptor.size); + vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */ + return vmcs; +} + +static struct vmcs *alloc_vmcs(void) +{ + return alloc_vmcs_cpu(smp_processor_id()); +} + +static void free_vmcs(struct vmcs *vmcs) +{ + free_pages((unsigned long)vmcs, vmcs_descriptor.order); +} + +static __exit void free_kvm_area(void) +{ + int cpu; + + for_each_online_cpu(cpu) + free_vmcs(per_cpu(vmxarea, cpu)); +} + +extern struct vmcs *alloc_vmcs_cpu(int cpu); + +static __init int alloc_kvm_area(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct vmcs *vmcs; + + vmcs = alloc_vmcs_cpu(cpu); + if (!vmcs) { + free_kvm_area(); + return -ENOMEM; + } + + per_cpu(vmxarea, cpu) = vmcs; + } + return 0; +} + +static __init int hardware_setup(void) +{ + setup_vmcs_descriptor(); + return alloc_kvm_area(); +} + +static __exit void hardware_unsetup(void) +{ + free_kvm_area(); +} + +static void update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + if (vcpu->rmode.active) + vmcs_write32(EXCEPTION_BITMAP, ~0); + else + vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); +} + +static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + if (vmcs_readl(sf->base) == save->base) { + vmcs_write16(sf->selector, save->selector); + vmcs_writel(sf->base, save->base); + vmcs_write32(sf->limit, save->limit); + vmcs_write32(sf->ar_bytes, save->ar); + } else { + u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) + << AR_DPL_SHIFT; + vmcs_write32(sf->ar_bytes, 0x93 | dpl); + } +} + +static void enter_pmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + vcpu->rmode.active = 0; + + vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= ~(IOPL_MASK | X86_EFLAGS_VM); + flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); + vmcs_writel(GUEST_RFLAGS, flags); + + vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) | + (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK)); + + update_exception_bitmap(vcpu); + + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); + + vmcs_write16(GUEST_SS_SELECTOR, 0); + vmcs_write32(GUEST_SS_AR_BYTES, 0x93); + + vmcs_write16(GUEST_CS_SELECTOR, + vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); +} + +static int rmode_tss_base(struct kvm* kvm) +{ + gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; + return base_gfn << PAGE_SHIFT; +} + +static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + save->selector = vmcs_read16(sf->selector); + save->base = vmcs_readl(sf->base); + save->limit = vmcs_read32(sf->limit); + save->ar = vmcs_read32(sf->ar_bytes); + vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); + vmcs_write32(sf->limit, 0xffff); + vmcs_write32(sf->ar_bytes, 0xf3); +} + +static void enter_rmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + vcpu->rmode.active = 1; + + vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); + + vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); + + vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + flags = vmcs_readl(GUEST_RFLAGS); + vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT; + + flags |= IOPL_MASK | X86_EFLAGS_VM; + + vmcs_writel(GUEST_RFLAGS, flags); + vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK); + update_exception_bitmap(vcpu); + + vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); + vmcs_write32(GUEST_SS_LIMIT, 0xffff); + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); + vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); + + fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); +} + +#ifdef __x86_64__ + +static void enter_lmode(struct kvm_vcpu *vcpu) +{ + u32 guest_tr_ar; + + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); + if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { + printk(KERN_DEBUG "%s: tss fixup for long mode. \n", + __FUNCTION__); + vmcs_write32(GUEST_TR_AR_BYTES, + (guest_tr_ar & ~AR_TYPE_MASK) + | AR_TYPE_BUSY_64_TSS); + } + + vcpu->shadow_efer |= EFER_LMA; + + find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME; + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) + | VM_ENTRY_CONTROLS_IA32E_MASK); +} + +static void exit_lmode(struct kvm_vcpu *vcpu) +{ + vcpu->shadow_efer &= ~EFER_LMA; + + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) + & ~VM_ENTRY_CONTROLS_IA32E_MASK); +} + +#endif + +static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) + enter_pmode(vcpu); + + if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) + enter_rmode(vcpu); + +#ifdef __x86_64__ + if (vcpu->shadow_efer & EFER_LME) { + if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) + enter_lmode(vcpu); + if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK)) + exit_lmode(vcpu); + } +#endif + + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, + (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); + vcpu->cr0 = cr0; +} + +/* + * Used when restoring the VM to avoid corrupting segment registers + */ +static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0); + update_exception_bitmap(vcpu); + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, + (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); + vcpu->cr0 = cr0; +} + +static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + vmcs_writel(GUEST_CR3, cr3); +} + +static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + vmcs_writel(CR4_READ_SHADOW, cr4); + vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); + vcpu->cr4 = cr4; +} + +#ifdef __x86_64__ + +static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER); + + vcpu->shadow_efer = efer; + if (efer & EFER_LMA) { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) | + VM_ENTRY_CONTROLS_IA32E_MASK); + msr->data = efer; + + } else { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) & + ~VM_ENTRY_CONTROLS_IA32E_MASK); + + msr->data = efer & ~EFER_LME; + } +} + +#endif + +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + return vmcs_readl(sf->base); +} + +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + var->base = vmcs_readl(sf->base); + var->limit = vmcs_read32(sf->limit); + var->selector = vmcs_read16(sf->selector); + ar = vmcs_read32(sf->ar_bytes); + if (ar & AR_UNUSABLE_MASK) + ar = 0; + var->type = ar & 15; + var->s = (ar >> 4) & 1; + var->dpl = (ar >> 5) & 3; + var->present = (ar >> 7) & 1; + var->avl = (ar >> 12) & 1; + var->l = (ar >> 13) & 1; + var->db = (ar >> 14) & 1; + var->g = (ar >> 15) & 1; + var->unusable = (ar >> 16) & 1; +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + if (var->unusable) + ar = 1 << 16; + else { + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + } + vmcs_write32(sf->ar_bytes, ar); +} + +static int vmx_is_long_mode(struct kvm_vcpu *vcpu) +{ + return vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_CONTROLS_IA32E_MASK; +} + +static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); + + *db = (ar >> 14) & 1; + *l = (ar >> 13) & 1; +} + +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); + dt->base = vmcs_readl(GUEST_IDTR_BASE); +} + +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_IDTR_BASE, dt->base); +} + +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); + dt->base = vmcs_readl(GUEST_GDTR_BASE); +} + +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_GDTR_BASE, dt->base); +} + +static int init_rmode_tss(struct kvm* kvm) +{ + struct page *p1, *p2, *p3; + gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + char *page; + + p1 = _gfn_to_page(kvm, fn++); + p2 = _gfn_to_page(kvm, fn++); + p3 = _gfn_to_page(kvm, fn); + + if (!p1 || !p2 || !p3) { + kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); + return 0; + } + + page = kmap_atomic(p1, KM_USER0); + memset(page, 0, PAGE_SIZE); + *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + kunmap_atomic(page, KM_USER0); + + page = kmap_atomic(p2, KM_USER0); + memset(page, 0, PAGE_SIZE); + kunmap_atomic(page, KM_USER0); + + page = kmap_atomic(p3, KM_USER0); + memset(page, 0, PAGE_SIZE); + *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; + kunmap_atomic(page, KM_USER0); + + return 1; +} + +static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val) +{ + u32 msr_high, msr_low; + + rdmsr(msr, msr_low, msr_high); + + val &= msr_high; + val |= msr_low; + vmcs_write32(vmcs_field, val); +} + +static void seg_setup(int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + vmcs_write16(sf->selector, 0); + vmcs_writel(sf->base, 0); + vmcs_write32(sf->limit, 0xffff); + vmcs_write32(sf->ar_bytes, 0x93); +} + +/* + * Sets up the vmcs for emulated real mode. + */ +static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) +{ + u32 host_sysenter_cs; + u32 junk; + unsigned long a; + struct descriptor_table dt; + int i; + int ret = 0; + int nr_good_msrs; + extern asmlinkage void kvm_vmx_return(void); + + if (!init_rmode_tss(vcpu->kvm)) { + ret = -ENOMEM; + goto out; + } + + memset(vcpu->regs, 0, sizeof(vcpu->regs)); + vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); + vcpu->cr8 = 0; + vcpu->apic_base = 0xfee00000 | + /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | + MSR_IA32_APICBASE_ENABLE; + + fx_init(vcpu); + + /* + * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode + * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. + */ + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(GUEST_CS_BASE, 0x000f0000); + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); + + seg_setup(VCPU_SREG_DS); + seg_setup(VCPU_SREG_ES); + seg_setup(VCPU_SREG_FS); + seg_setup(VCPU_SREG_GS); + seg_setup(VCPU_SREG_SS); + + vmcs_write16(GUEST_TR_SELECTOR, 0); + vmcs_writel(GUEST_TR_BASE, 0); + vmcs_write32(GUEST_TR_LIMIT, 0xffff); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + vmcs_write16(GUEST_LDTR_SELECTOR, 0); + vmcs_writel(GUEST_LDTR_BASE, 0); + vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + + vmcs_writel(GUEST_RFLAGS, 0x02); + vmcs_writel(GUEST_RIP, 0xfff0); + vmcs_writel(GUEST_RSP, 0); + + vmcs_writel(GUEST_CR3, 0); + + //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 + vmcs_writel(GUEST_DR7, 0x400); + + vmcs_writel(GUEST_GDTR_BASE, 0); + vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + + vmcs_writel(GUEST_IDTR_BASE, 0); + vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + + vmcs_write32(GUEST_ACTIVITY_STATE, 0); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); + + /* I/O */ + vmcs_write64(IO_BITMAP_A, 0); + vmcs_write64(IO_BITMAP_B, 0); + + guest_write_tsc(0); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + + /* Special registers */ + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + /* Control */ + vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR, + PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_EXT_INTR_MASK /* 20.6.1 */ + | PIN_BASED_NMI_EXITING /* 20.6.1 */ + ); + vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR, + CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_HLT_EXITING /* 20.6.2 */ + | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ + | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ + | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ + | CPU_BASED_INVDPG_EXITING + | CPU_BASED_MOV_DR_EXITING + | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ + ); + + vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); + vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + + vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ + vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ + vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ + vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +#ifdef __x86_64__ + rdmsrl(MSR_FS_BASE, a); + vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ + rdmsrl(MSR_GS_BASE, a); + vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ +#else + vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ + vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ +#endif + + vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ + + get_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ + + + vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */ + + rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); + vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); + rdmsrl(MSR_IA32_SYSENTER_ESP, a); + vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ + rdmsrl(MSR_IA32_SYSENTER_EIP, a); + vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ + + ret = -ENOMEM; + vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vcpu->guest_msrs) + goto out; + vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vcpu->host_msrs) + goto out_free_guest_msrs; + + for (i = 0; i < NR_VMX_MSR; ++i) { + u32 index = vmx_msr_index[i]; + u32 data_low, data_high; + u64 data; + int j = vcpu->nmsrs; + + if (rdmsr_safe(index, &data_low, &data_high) < 0) + continue; + data = data_low | ((u64)data_high << 32); + vcpu->host_msrs[j].index = index; + vcpu->host_msrs[j].reserved = 0; + vcpu->host_msrs[j].data = data; + vcpu->guest_msrs[j] = vcpu->host_msrs[j]; + ++vcpu->nmsrs; + } + printk(KERN_DEBUG "kvm: msrs: %d\n", vcpu->nmsrs); + + nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS; + vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, + virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS)); + vmcs_writel(VM_EXIT_MSR_STORE_ADDR, + virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS)); + vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, + virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS)); + vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS, + (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ + + + /* 22.2.1, 20.8.1 */ + vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, + VM_ENTRY_CONTROLS, 0); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + + vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0); + vmcs_writel(TPR_THRESHOLD, 0); + + vmcs_writel(CR0_GUEST_HOST_MASK, KVM_GUEST_CR0_MASK); + vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + + vcpu->cr0 = 0x60000010; + vmx_set_cr0(vcpu, vcpu->cr0); // enter rmode + vmx_set_cr4(vcpu, 0); +#ifdef __x86_64__ + vmx_set_efer(vcpu, 0); +#endif + + return 0; + +out_free_guest_msrs: + kfree(vcpu->guest_msrs); +out: + return ret; +} + +static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) +{ + u16 ent[2]; + u16 cs; + u16 ip; + unsigned long flags; + unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); + u16 sp = vmcs_readl(GUEST_RSP); + u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); + + if (sp > ss_limit || sp - 6 > sp) { + vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", + __FUNCTION__, + vmcs_readl(GUEST_RSP), + vmcs_readl(GUEST_SS_BASE), + vmcs_read32(GUEST_SS_LIMIT)); + return; + } + + if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) != + sizeof(ent)) { + vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); + return; + } + + flags = vmcs_readl(GUEST_RFLAGS); + cs = vmcs_readl(GUEST_CS_BASE) >> 4; + ip = vmcs_readl(GUEST_RIP); + + + if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 || + kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 || + kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) { + vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); + return; + } + + vmcs_writel(GUEST_RFLAGS, flags & + ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); + vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; + vmcs_writel(GUEST_CS_BASE, ent[1] << 4); + vmcs_writel(GUEST_RIP, ent[0]); + vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); +} + +static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) +{ + int word_index = __ffs(vcpu->irq_summary); + int bit_index = __ffs(vcpu->irq_pending[word_index]); + int irq = word_index * BITS_PER_LONG + bit_index; + + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); + + if (vcpu->rmode.active) { + inject_rmode_irq(vcpu, irq); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); +} + +static void kvm_try_inject_irq(struct kvm_vcpu *vcpu) +{ + if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) + && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0) + /* + * Interrupts enabled, and not blocked by sti or mov ss. Good. + */ + kvm_do_inject_irq(vcpu); + else + /* + * Interrupts blocked. Wait for unblock. + */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) + | CPU_BASED_VIRTUAL_INTR_PENDING); +} + +static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) +{ + struct kvm_guest_debug *dbg = &vcpu->guest_debug; + + set_debugreg(dbg->bp[0], 0); + set_debugreg(dbg->bp[1], 1); + set_debugreg(dbg->bp[2], 2); + set_debugreg(dbg->bp[3], 3); + + if (dbg->singlestep) { + unsigned long flags; + + flags = vmcs_readl(GUEST_RFLAGS); + flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + vmcs_writel(GUEST_RFLAGS, flags); + } +} + +static int handle_rmode_exception(struct kvm_vcpu *vcpu, + int vec, u32 err_code) +{ + if (!vcpu->rmode.active) + return 0; + + if (vec == GP_VECTOR && err_code == 0) + if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) + return 1; + return 0; +} + +static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 intr_info, error_code; + unsigned long cr2, rip; + u32 vect_info; + enum emulation_result er; + + vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !is_page_fault(intr_info)) { + printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " + "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); + } + + if (is_external_interrupt(vect_info)) { + int irq = vect_info & VECTORING_INFO_VECTOR_MASK; + set_bit(irq, vcpu->irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); + } + + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ + asm ("int $2"); + return 1; + } + error_code = 0; + rip = vmcs_readl(GUEST_RIP); + if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) + error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + if (is_page_fault(intr_info)) { + cr2 = vmcs_readl(EXIT_QUALIFICATION); + + spin_lock(&vcpu->kvm->lock); + if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) { + spin_unlock(&vcpu->kvm->lock); + return 1; + } + + er = emulate_instruction(vcpu, kvm_run, cr2, error_code); + spin_unlock(&vcpu->kvm->lock); + + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++kvm_stat.mmio_exits; + kvm_run->exit_reason = KVM_EXIT_MMIO; + return 0; + case EMULATE_FAIL: + vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); + break; + default: + BUG(); + } + } + + if (vcpu->rmode.active && + handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, + error_code)) + return 1; + + if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { + kvm_run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; + kvm_run->ex.error_code = error_code; + return 0; +} + +static int handle_external_interrupt(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + ++kvm_stat.irq_exits; + return 1; +} + + +static int get_io_count(struct kvm_vcpu *vcpu, u64 *count) +{ + u64 inst; + gva_t rip; + int countr_size; + int i, n; + + if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) { + countr_size = 2; + } else { + u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES); + + countr_size = (cs_ar & AR_L_MASK) ? 8: + (cs_ar & AR_DB_MASK) ? 4: 2; + } + + rip = vmcs_readl(GUEST_RIP); + if (countr_size != 8) + rip += vmcs_readl(GUEST_CS_BASE); + + n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst); + + for (i = 0; i < n; i++) { + switch (((u8*)&inst)[i]) { + case 0xf0: + case 0xf2: + case 0xf3: + case 0x2e: + case 0x36: + case 0x3e: + case 0x26: + case 0x64: + case 0x65: + case 0x66: + break; + case 0x67: + countr_size = (countr_size == 2) ? 4: (countr_size >> 1); + default: + goto done; + } + } + return 0; +done: + countr_size *= 8; + *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); + return 1; +} + +static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + + ++kvm_stat.io_exits; + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + kvm_run->exit_reason = KVM_EXIT_IO; + if (exit_qualification & 8) + kvm_run->io.direction = KVM_EXIT_IO_IN; + else + kvm_run->io.direction = KVM_EXIT_IO_OUT; + kvm_run->io.size = (exit_qualification & 7) + 1; + kvm_run->io.string = (exit_qualification & 16) != 0; + kvm_run->io.string_down + = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; + kvm_run->io.rep = (exit_qualification & 32) != 0; + kvm_run->io.port = exit_qualification >> 16; + if (kvm_run->io.string) { + if (!get_io_count(vcpu, &kvm_run->io.count)) + return 1; + kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS); + } else + kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */ + return 0; +} + +static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 address = vmcs_read64(EXIT_QUALIFICATION); + int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + spin_lock(&vcpu->kvm->lock); + vcpu->mmu.inval_page(vcpu, address); + spin_unlock(&vcpu->kvm->lock); + vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length); + return 1; +} + +static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + int cr; + int reg; + + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + cr = exit_qualification & 15; + reg = (exit_qualification >> 8) & 15; + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + switch (cr) { + case 0: + vcpu_load_rsp_rip(vcpu); + set_cr0(vcpu, vcpu->regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + case 3: + vcpu_load_rsp_rip(vcpu); + set_cr3(vcpu, vcpu->regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + case 4: + vcpu_load_rsp_rip(vcpu); + set_cr4(vcpu, vcpu->regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + case 8: + vcpu_load_rsp_rip(vcpu); + set_cr8(vcpu, vcpu->regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + }; + break; + case 1: /*mov from cr*/ + switch (cr) { + case 3: + vcpu_load_rsp_rip(vcpu); + vcpu->regs[reg] = vcpu->cr3; + vcpu_put_rsp_rip(vcpu); + skip_emulated_instruction(vcpu); + return 1; + case 8: + printk(KERN_DEBUG "handle_cr: read CR8 " + "cpu erratum AA15\n"); + vcpu_load_rsp_rip(vcpu); + vcpu->regs[reg] = vcpu->cr8; + vcpu_put_rsp_rip(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + break; + case 3: /* lmsw */ + lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + + skip_emulated_instruction(vcpu); + return 1; + default: + break; + } + kvm_run->exit_reason = 0; + printk(KERN_ERR "kvm: unhandled control register: op %d cr %d\n", + (int)(exit_qualification >> 4) & 3, cr); + return 0; +} + +static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + unsigned long val; + int dr, reg; + + /* + * FIXME: this code assumes the host is debugging the guest. + * need to deal with guest debugging itself too. + */ + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + dr = exit_qualification & 7; + reg = (exit_qualification >> 8) & 15; + vcpu_load_rsp_rip(vcpu); + if (exit_qualification & 16) { + /* mov from dr */ + switch (dr) { + case 6: + val = 0xffff0ff0; + break; + case 7: + val = 0x400; + break; + default: + val = 0; + } + vcpu->regs[reg] = val; + } else { + /* mov to dr */ + } + vcpu_put_rsp_rip(vcpu); + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + kvm_run->exit_reason = KVM_EXIT_CPUID; + return 0; +} + +static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u64 data; + + if (vmx_get_msr(vcpu, ecx, &data)) { + vmx_inject_gp(vcpu, 0); + return 1; + } + + /* FIXME: handling of bits 32:63 of rax, rdx */ + vcpu->regs[VCPU_REGS_RAX] = data & -1u; + vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) + | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); + + if (vmx_set_msr(vcpu, ecx, data) != 0) { + vmx_inject_gp(vcpu, 0); + return 1; + } + + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_interrupt_window(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + /* Turn off interrupt window reporting. */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) + & ~CPU_BASED_VIRTUAL_INTR_PENDING); + return 1; +} + +static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + skip_emulated_instruction(vcpu); + if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) + return 1; + + kvm_run->exit_reason = KVM_EXIT_HLT; + return 0; +} + +/* + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. Otherwise they set the kvm_run parameter to indicate what needs + * to be done to userspace and return 0. + */ +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) = { + [EXIT_REASON_EXCEPTION_NMI] = handle_exception, + [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, + [EXIT_REASON_IO_INSTRUCTION] = handle_io, + [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_CR_ACCESS] = handle_cr, + [EXIT_REASON_DR_ACCESS] = handle_dr, + [EXIT_REASON_CPUID] = handle_cpuid, + [EXIT_REASON_MSR_READ] = handle_rdmsr, + [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_HLT] = handle_halt, +}; + +static const int kvm_vmx_max_exit_handlers = + sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers); + +/* + * The guest has exited. See if we can fix it or if we need userspace + * assistance. + */ +static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + u32 exit_reason = vmcs_read32(VM_EXIT_REASON); + + if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && + exit_reason != EXIT_REASON_EXCEPTION_NMI ) + printk(KERN_WARNING "%s: unexpected, valid vectoring info and " + "exit reason is 0x%x\n", __FUNCTION__, exit_reason); + kvm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + if (exit_reason < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_reason]) + return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); + else { + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = exit_reason; + } + return 0; +} + +static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u8 fail; + u16 fs_sel, gs_sel, ldt_sel; + int fs_gs_ldt_reload_needed; + +again: + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. + */ + fs_sel = read_fs(); + gs_sel = read_gs(); + ldt_sel = read_ldt(); + fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel; + if (!fs_gs_ldt_reload_needed) { + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + } else { + vmcs_write16(HOST_FS_SELECTOR, 0); + vmcs_write16(HOST_GS_SELECTOR, 0); + } + +#ifdef __x86_64__ + vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); +#else + vmcs_writel(HOST_FS_BASE, segment_base(fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); +#endif + + if (vcpu->irq_summary && + !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) + kvm_try_inject_irq(vcpu); + + if (vcpu->guest_debug.enabled) + kvm_guest_debug_pre(vcpu); + + fx_save(vcpu->host_fx_image); + fx_restore(vcpu->guest_fx_image); + + save_msrs(vcpu->host_msrs, vcpu->nmsrs); + load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + + asm ( + /* Store host registers */ + "pushf \n\t" +#ifdef __x86_64__ + "push %%rax; push %%rbx; push %%rdx;" + "push %%rsi; push %%rdi; push %%rbp;" + "push %%r8; push %%r9; push %%r10; push %%r11;" + "push %%r12; push %%r13; push %%r14; push %%r15;" + "push %%rcx \n\t" + ASM_VMX_VMWRITE_RSP_RDX "\n\t" +#else + "pusha; push %%ecx \n\t" + ASM_VMX_VMWRITE_RSP_RDX "\n\t" +#endif + /* Check if vmlaunch of vmresume is needed */ + "cmp $0, %1 \n\t" + /* Load guest registers. Don't clobber flags. */ +#ifdef __x86_64__ + "mov %c[cr2](%3), %%rax \n\t" + "mov %%rax, %%cr2 \n\t" + "mov %c[rax](%3), %%rax \n\t" + "mov %c[rbx](%3), %%rbx \n\t" + "mov %c[rdx](%3), %%rdx \n\t" + "mov %c[rsi](%3), %%rsi \n\t" + "mov %c[rdi](%3), %%rdi \n\t" + "mov %c[rbp](%3), %%rbp \n\t" + "mov %c[r8](%3), %%r8 \n\t" + "mov %c[r9](%3), %%r9 \n\t" + "mov %c[r10](%3), %%r10 \n\t" + "mov %c[r11](%3), %%r11 \n\t" + "mov %c[r12](%3), %%r12 \n\t" + "mov %c[r13](%3), %%r13 \n\t" + "mov %c[r14](%3), %%r14 \n\t" + "mov %c[r15](%3), %%r15 \n\t" + "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ +#else + "mov %c[cr2](%3), %%eax \n\t" + "mov %%eax, %%cr2 \n\t" + "mov %c[rax](%3), %%eax \n\t" + "mov %c[rbx](%3), %%ebx \n\t" + "mov %c[rdx](%3), %%edx \n\t" + "mov %c[rsi](%3), %%esi \n\t" + "mov %c[rdi](%3), %%edi \n\t" + "mov %c[rbp](%3), %%ebp \n\t" + "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ +#endif + /* Enter guest mode */ + "jne launched \n\t" + ASM_VMX_VMLAUNCH "\n\t" + "jmp kvm_vmx_return \n\t" + "launched: " ASM_VMX_VMRESUME "\n\t" + ".globl kvm_vmx_return \n\t" + "kvm_vmx_return: " + /* Save guest registers, load host registers, keep flags */ +#ifdef __x86_64__ + "xchg %3, 0(%%rsp) \n\t" + "mov %%rax, %c[rax](%3) \n\t" + "mov %%rbx, %c[rbx](%3) \n\t" + "pushq 0(%%rsp); popq %c[rcx](%3) \n\t" + "mov %%rdx, %c[rdx](%3) \n\t" + "mov %%rsi, %c[rsi](%3) \n\t" + "mov %%rdi, %c[rdi](%3) \n\t" + "mov %%rbp, %c[rbp](%3) \n\t" + "mov %%r8, %c[r8](%3) \n\t" + "mov %%r9, %c[r9](%3) \n\t" + "mov %%r10, %c[r10](%3) \n\t" + "mov %%r11, %c[r11](%3) \n\t" + "mov %%r12, %c[r12](%3) \n\t" + "mov %%r13, %c[r13](%3) \n\t" + "mov %%r14, %c[r14](%3) \n\t" + "mov %%r15, %c[r15](%3) \n\t" + "mov %%cr2, %%rax \n\t" + "mov %%rax, %c[cr2](%3) \n\t" + "mov 0(%%rsp), %3 \n\t" + + "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" + "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" + "pop %%rbp; pop %%rdi; pop %%rsi;" + "pop %%rdx; pop %%rbx; pop %%rax \n\t" +#else + "xchg %3, 0(%%esp) \n\t" + "mov %%eax, %c[rax](%3) \n\t" + "mov %%ebx, %c[rbx](%3) \n\t" + "pushl 0(%%esp); popl %c[rcx](%3) \n\t" + "mov %%edx, %c[rdx](%3) \n\t" + "mov %%esi, %c[rsi](%3) \n\t" + "mov %%edi, %c[rdi](%3) \n\t" + "mov %%ebp, %c[rbp](%3) \n\t" + "mov %%cr2, %%eax \n\t" + "mov %%eax, %c[cr2](%3) \n\t" + "mov 0(%%esp), %3 \n\t" + + "pop %%ecx; popa \n\t" +#endif + "setbe %0 \n\t" + "popf \n\t" + : "=g" (fail) + : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), + "c"(vcpu), + [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])), +#ifdef __x86_64__ + [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), + [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), + [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])), +#endif + [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) + : "cc", "memory" ); + + ++kvm_stat.exits; + + save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + load_msrs(vcpu->host_msrs, NR_BAD_MSRS); + + fx_save(vcpu->guest_fx_image); + fx_restore(vcpu->host_fx_image); + +#ifndef __x86_64__ + asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); +#endif + + kvm_run->exit_type = 0; + if (fail) { + kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; + kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + } else { + if (fs_gs_ldt_reload_needed) { + load_ldt(ldt_sel); + load_fs(fs_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_disable(); + load_gs(gs_sel); +#ifdef __x86_64__ + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_enable(); + + reload_tss(); + } + vcpu->launched = 1; + kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; + if (kvm_handle_exit(kvm_run, vcpu)) { + /* Give scheduler a change to reschedule. */ + if (signal_pending(current)) { + ++kvm_stat.signal_exits; + return -EINTR; + } + kvm_resched(vcpu); + goto again; + } + } + return 0; +} + +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); +} + +static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, + unsigned long addr, + u32 err_code) +{ + u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + + ++kvm_stat.pf_guest; + + if (is_page_fault(vect_info)) { + printk(KERN_DEBUG "inject_page_fault: " + "double fault 0x%lx @ 0x%lx\n", + addr, vmcs_readl(GUEST_RIP)); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + DF_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); + return; + } + vcpu->cr2 = addr; + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + PF_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); + +} + +static void vmx_free_vmcs(struct kvm_vcpu *vcpu) +{ + if (vcpu->vmcs) { + on_each_cpu(__vcpu_clear, vcpu, 0, 1); + free_vmcs(vcpu->vmcs); + vcpu->vmcs = NULL; + } +} + +static void vmx_free_vcpu(struct kvm_vcpu *vcpu) +{ + vmx_free_vmcs(vcpu); +} + +static int vmx_create_vcpu(struct kvm_vcpu *vcpu) +{ + struct vmcs *vmcs; + + vmcs = alloc_vmcs(); + if (!vmcs) + return -ENOMEM; + vmcs_clear(vmcs); + vcpu->vmcs = vmcs; + vcpu->launched = 0; + return 0; +} + +static struct kvm_arch_ops vmx_arch_ops = { + .cpu_has_kvm_support = cpu_has_kvm_support, + .disabled_by_bios = vmx_disabled_by_bios, + .hardware_setup = hardware_setup, + .hardware_unsetup = hardware_unsetup, + .hardware_enable = hardware_enable, + .hardware_disable = hardware_disable, + + .vcpu_create = vmx_create_vcpu, + .vcpu_free = vmx_free_vcpu, + + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + + .set_guest_debug = set_guest_debug, + .get_msr = vmx_get_msr, + .set_msr = vmx_set_msr, + .get_segment_base = vmx_get_segment_base, + .get_segment = vmx_get_segment, + .set_segment = vmx_set_segment, + .is_long_mode = vmx_is_long_mode, + .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .set_cr0 = vmx_set_cr0, + .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, + .set_cr3 = vmx_set_cr3, + .set_cr4 = vmx_set_cr4, +#ifdef __x86_64__ + .set_efer = vmx_set_efer, +#endif + .get_idt = vmx_get_idt, + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, + .cache_regs = vcpu_load_rsp_rip, + .decache_regs = vcpu_put_rsp_rip, + .get_rflags = vmx_get_rflags, + .set_rflags = vmx_set_rflags, + + .tlb_flush = vmx_flush_tlb, + .inject_page_fault = vmx_inject_page_fault, + + .inject_gp = vmx_inject_gp, + + .run = vmx_vcpu_run, + .skip_emulated_instruction = skip_emulated_instruction, + .vcpu_setup = vmx_vcpu_setup, +}; + +static int __init vmx_init(void) +{ + kvm_init_arch(&vmx_arch_ops, THIS_MODULE); + return 0; +} + +static void __exit vmx_exit(void) +{ + kvm_exit_arch(); +} + +module_init(vmx_init) +module_exit(vmx_exit) diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h new file mode 100644 index 00000000000..79727834158 --- /dev/null +++ b/drivers/kvm/vmx.h @@ -0,0 +1,296 @@ +#ifndef VMX_H +#define VMX_H + +/* + * vmx.h: VMX Architecture related definitions + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * A few random additions are: + * Copyright (C) 2006 Qumranet + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + */ + +#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_HLT_EXITING 0x00000080 +#define CPU_BASED_INVDPG_EXITING 0x00000200 +#define CPU_BASED_MWAIT_EXITING 0x00000400 +#define CPU_BASED_RDPMC_EXITING 0x00000800 +#define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 +#define CPU_BASED_CR8_STORE_EXITING 0x00100000 +#define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_MOV_DR_EXITING 0x00800000 +#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 +#define CPU_BASED_ACTIVATE_IO_BITMAP 0x02000000 +#define CPU_BASED_MSR_BITMAPS 0x10000000 +#define CPU_BASED_MONITOR_EXITING 0x20000000 +#define CPU_BASED_PAUSE_EXITING 0x40000000 + +#define PIN_BASED_EXT_INTR_MASK 0x1 +#define PIN_BASED_NMI_EXITING 0x8 + +#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_EXIT_HOST_ADD_SPACE_SIZE 0x00000200 + + +/* VMCS Encodings */ +enum vmcs_field { + GUEST_ES_SELECTOR = 0x00000800, + GUEST_CS_SELECTOR = 0x00000802, + GUEST_SS_SELECTOR = 0x00000804, + GUEST_DS_SELECTOR = 0x00000806, + GUEST_FS_SELECTOR = 0x00000808, + GUEST_GS_SELECTOR = 0x0000080a, + GUEST_LDTR_SELECTOR = 0x0000080c, + GUEST_TR_SELECTOR = 0x0000080e, + HOST_ES_SELECTOR = 0x00000c00, + HOST_CS_SELECTOR = 0x00000c02, + HOST_SS_SELECTOR = 0x00000c04, + HOST_DS_SELECTOR = 0x00000c06, + HOST_FS_SELECTOR = 0x00000c08, + HOST_GS_SELECTOR = 0x00000c0a, + HOST_TR_SELECTOR = 0x00000c0c, + IO_BITMAP_A = 0x00002000, + IO_BITMAP_A_HIGH = 0x00002001, + IO_BITMAP_B = 0x00002002, + IO_BITMAP_B_HIGH = 0x00002003, + MSR_BITMAP = 0x00002004, + MSR_BITMAP_HIGH = 0x00002005, + VM_EXIT_MSR_STORE_ADDR = 0x00002006, + VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, + VM_EXIT_MSR_LOAD_ADDR = 0x00002008, + VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, + VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, + VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + TSC_OFFSET = 0x00002010, + TSC_OFFSET_HIGH = 0x00002011, + VIRTUAL_APIC_PAGE_ADDR = 0x00002012, + VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, + VMCS_LINK_POINTER = 0x00002800, + VMCS_LINK_POINTER_HIGH = 0x00002801, + GUEST_IA32_DEBUGCTL = 0x00002802, + GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + PIN_BASED_VM_EXEC_CONTROL = 0x00004000, + CPU_BASED_VM_EXEC_CONTROL = 0x00004002, + EXCEPTION_BITMAP = 0x00004004, + PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, + PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, + CR3_TARGET_COUNT = 0x0000400a, + VM_EXIT_CONTROLS = 0x0000400c, + VM_EXIT_MSR_STORE_COUNT = 0x0000400e, + VM_EXIT_MSR_LOAD_COUNT = 0x00004010, + VM_ENTRY_CONTROLS = 0x00004012, + VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, + VM_ENTRY_INTR_INFO_FIELD = 0x00004016, + VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, + VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, + TPR_THRESHOLD = 0x0000401c, + SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + VM_INSTRUCTION_ERROR = 0x00004400, + VM_EXIT_REASON = 0x00004402, + VM_EXIT_INTR_INFO = 0x00004404, + VM_EXIT_INTR_ERROR_CODE = 0x00004406, + IDT_VECTORING_INFO_FIELD = 0x00004408, + IDT_VECTORING_ERROR_CODE = 0x0000440a, + VM_EXIT_INSTRUCTION_LEN = 0x0000440c, + VMX_INSTRUCTION_INFO = 0x0000440e, + GUEST_ES_LIMIT = 0x00004800, + GUEST_CS_LIMIT = 0x00004802, + GUEST_SS_LIMIT = 0x00004804, + GUEST_DS_LIMIT = 0x00004806, + GUEST_FS_LIMIT = 0x00004808, + GUEST_GS_LIMIT = 0x0000480a, + GUEST_LDTR_LIMIT = 0x0000480c, + GUEST_TR_LIMIT = 0x0000480e, + GUEST_GDTR_LIMIT = 0x00004810, + GUEST_IDTR_LIMIT = 0x00004812, + GUEST_ES_AR_BYTES = 0x00004814, + GUEST_CS_AR_BYTES = 0x00004816, + GUEST_SS_AR_BYTES = 0x00004818, + GUEST_DS_AR_BYTES = 0x0000481a, + GUEST_FS_AR_BYTES = 0x0000481c, + GUEST_GS_AR_BYTES = 0x0000481e, + GUEST_LDTR_AR_BYTES = 0x00004820, + GUEST_TR_AR_BYTES = 0x00004822, + GUEST_INTERRUPTIBILITY_INFO = 0x00004824, + GUEST_ACTIVITY_STATE = 0X00004826, + GUEST_SYSENTER_CS = 0x0000482A, + HOST_IA32_SYSENTER_CS = 0x00004c00, + CR0_GUEST_HOST_MASK = 0x00006000, + CR4_GUEST_HOST_MASK = 0x00006002, + CR0_READ_SHADOW = 0x00006004, + CR4_READ_SHADOW = 0x00006006, + CR3_TARGET_VALUE0 = 0x00006008, + CR3_TARGET_VALUE1 = 0x0000600a, + CR3_TARGET_VALUE2 = 0x0000600c, + CR3_TARGET_VALUE3 = 0x0000600e, + EXIT_QUALIFICATION = 0x00006400, + GUEST_LINEAR_ADDRESS = 0x0000640a, + GUEST_CR0 = 0x00006800, + GUEST_CR3 = 0x00006802, + GUEST_CR4 = 0x00006804, + GUEST_ES_BASE = 0x00006806, + GUEST_CS_BASE = 0x00006808, + GUEST_SS_BASE = 0x0000680a, + GUEST_DS_BASE = 0x0000680c, + GUEST_FS_BASE = 0x0000680e, + GUEST_GS_BASE = 0x00006810, + GUEST_LDTR_BASE = 0x00006812, + GUEST_TR_BASE = 0x00006814, + GUEST_GDTR_BASE = 0x00006816, + GUEST_IDTR_BASE = 0x00006818, + GUEST_DR7 = 0x0000681a, + GUEST_RSP = 0x0000681c, + GUEST_RIP = 0x0000681e, + GUEST_RFLAGS = 0x00006820, + GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, + GUEST_SYSENTER_ESP = 0x00006824, + GUEST_SYSENTER_EIP = 0x00006826, + HOST_CR0 = 0x00006c00, + HOST_CR3 = 0x00006c02, + HOST_CR4 = 0x00006c04, + HOST_FS_BASE = 0x00006c06, + HOST_GS_BASE = 0x00006c08, + HOST_TR_BASE = 0x00006c0a, + HOST_GDTR_BASE = 0x00006c0c, + HOST_IDTR_BASE = 0x00006c0e, + HOST_IA32_SYSENTER_ESP = 0x00006c10, + HOST_IA32_SYSENTER_EIP = 0x00006c12, + HOST_RSP = 0x00006c14, + HOST_RIP = 0x00006c16, +}; + +#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 + +#define EXIT_REASON_PENDING_INTERRUPT 7 + +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 + +/* + * Interruption-information format + */ +#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ +#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ +#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ + +#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK +#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK +#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK +#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK + +#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ +#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ + +/* + * Exit Qualifications for MOV for Control Register Access + */ +#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ +#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ +#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ +#define LMSW_SOURCE_DATA_SHIFT 16 +#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ +#define REG_EAX (0 << 8) +#define REG_ECX (1 << 8) +#define REG_EDX (2 << 8) +#define REG_EBX (3 << 8) +#define REG_ESP (4 << 8) +#define REG_EBP (5 << 8) +#define REG_ESI (6 << 8) +#define REG_EDI (7 << 8) +#define REG_R8 (8 << 8) +#define REG_R9 (9 << 8) +#define REG_R10 (10 << 8) +#define REG_R11 (11 << 8) +#define REG_R12 (12 << 8) +#define REG_R13 (13 << 8) +#define REG_R14 (14 << 8) +#define REG_R15 (15 << 8) + +/* + * Exit Qualifications for MOV for Debug Register Access + */ +#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ +#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ +#define TYPE_MOV_TO_DR (0 << 4) +#define TYPE_MOV_FROM_DR (1 << 4) +#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ + + +/* segment AR */ +#define SEGMENT_AR_L_MASK (1 << 13) + +/* entry controls */ +#define VM_ENTRY_CONTROLS_IA32E_MASK (1 << 9) + +#define AR_TYPE_ACCESSES_MASK 1 +#define AR_TYPE_READABLE_MASK (1 << 1) +#define AR_TYPE_WRITEABLE_MASK (1 << 2) +#define AR_TYPE_CODE_MASK (1 << 3) +#define AR_TYPE_MASK 0x0f +#define AR_TYPE_BUSY_64_TSS 11 +#define AR_TYPE_BUSY_32_TSS 11 +#define AR_TYPE_BUSY_16_TSS 3 +#define AR_TYPE_LDT 2 + +#define AR_UNUSABLE_MASK (1 << 16) +#define AR_S_MASK (1 << 4) +#define AR_P_MASK (1 << 7) +#define AR_L_MASK (1 << 13) +#define AR_DB_MASK (1 << 14) +#define AR_G_MASK (1 << 15) +#define AR_DPL_SHIFT 5 +#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) + +#define AR_RESERVD_MASK 0xfffe0f00 + +#define CR4_VMXE 0x2000 + +#define MSR_IA32_VMX_BASIC_MSR 0x480 +#define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_IA32_VMX_PINBASED_CTLS_MSR 0x481 +#define MSR_IA32_VMX_PROCBASED_CTLS_MSR 0x482 +#define MSR_IA32_VMX_EXIT_CTLS_MSR 0x483 +#define MSR_IA32_VMX_ENTRY_CTLS_MSR 0x484 + +#endif diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c new file mode 100644 index 00000000000..7e838bf0592 --- /dev/null +++ b/drivers/kvm/x86_emulate.c @@ -0,0 +1,1409 @@ +/****************************************************************************** + * x86_emulate.c + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * Linux coding style, mod r/m decoder, segment base fixes, real-mode + * privieged instructions: + * + * Copyright (C) 2006 Qumranet + * + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef __KERNEL__ +#include <stdio.h> +#include <stdint.h> +#include <public/xen.h> +#define DPRINTF(_f, _a ...) printf( _f , ## _a ) +#else +#include "kvm.h" +#define DPRINTF(x...) do {} while (0) +#endif +#include "x86_emulate.h" +#include <linux/module.h> + +/* + * Opcode effective-address decode tables. + * Note that we only emulate instructions that have at least one memory + * operand (excluding implicit stack references). We assume that stack + * references and instruction fetches will never occur in special memory + * areas that require emulation. So, for example, 'mov <imm>,<reg>' need + * not be handled. + */ + +/* Operand sizes: 8-bit operands or specified/overridden size. */ +#define ByteOp (1<<0) /* 8-bit operands. */ +/* Destination operand type. */ +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstMask (3<<1) +/* Source operand type. */ +#define SrcNone (0<<3) /* No source operand. */ +#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<3) /* Register operand. */ +#define SrcMem (2<<3) /* Memory operand. */ +#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ +#define SrcImm (5<<3) /* Immediate operand. */ +#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<3) +/* Generic ModRM decode. */ +#define ModRM (1<<6) +/* Destination is only written; never read. */ +#define Mov (1<<7) + +static u8 opcode_table[256] = { + /* 0x00 - 0x07 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x08 - 0x0F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x10 - 0x17 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x18 - 0x1F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x20 - 0x27 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x28 - 0x2F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x30 - 0x37 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x38 - 0x3F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x40 - 0x4F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x87 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + /* 0x88 - 0x8F */ + ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, + ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + 0, 0, 0, DstMem | SrcNone | ModRM | Mov, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, + ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, + ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps, ImplicitOps, + /* 0xA8 - 0xAF */ + 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps, ImplicitOps, + /* 0xB0 - 0xBF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xC0 - 0xC7 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, 0, + 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, + DstMem | SrcImm | ModRM | Mov, + /* 0xC8 - 0xCF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xD7 */ + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + 0, 0, 0, 0, + /* 0xD8 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xF7 */ + 0, 0, 0, 0, + 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + /* 0xF8 - 0xFF */ + 0, 0, 0, 0, + 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM +}; + +static u8 twobyte_table[256] = { + /* 0x00 - 0x0F */ + 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, + 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + /* 0x10 - 0x1F */ + 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + /* 0x20 - 0x2F */ + ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 - 0x3F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x48 - 0x4F */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x8F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + 0, 0, 0, DstMem | SrcReg | ModRM, 0, 0, 0, 0, + /* 0xA8 - 0xAF */ + 0, 0, 0, DstMem | SrcReg | ModRM, 0, 0, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, + DstMem | SrcReg | ModRM, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xB8 - 0xBF */ + 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xC0 - 0xCF */ + 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xFF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* + * Tell the emulator that of the Group 7 instructions (sgdt, lidt, etc.) we + * are interested only in invlpg and not in any of the rest. + * + * invlpg is a special instruction in that the data it references may not + * be mapped. + */ +void kvm_emulator_want_group7_invlpg(void) +{ + twobyte_table[1] &= ~SrcMem; +} +EXPORT_SYMBOL_GPL(kvm_emulator_want_group7_invlpg); + +/* Type, address-of, and value of an instruction's operand. */ +struct operand { + enum { OP_REG, OP_MEM, OP_IMM } type; + unsigned int bytes; + unsigned long val, orig_val, *ptr; +}; + +/* EFLAGS bit definitions. */ +#define EFLG_OF (1<<11) +#define EFLG_DF (1<<10) +#define EFLG_SF (1<<7) +#define EFLG_ZF (1<<6) +#define EFLG_AF (1<<4) +#define EFLG_PF (1<<2) +#define EFLG_CF (1<<0) + +/* + * Instruction emulation: + * Most instructions are emulated directly via a fragment of inline assembly + * code. This allows us to save/restore EFLAGS and thus very easily pick up + * any modified flags. + */ + +#if defined(__x86_64__) +#define _LO32 "k" /* force 32-bit operand */ +#define _STK "%%rsp" /* stack pointer */ +#elif defined(__i386__) +#define _LO32 "" /* force 32-bit operand */ +#define _STK "%%esp" /* stack pointer */ +#endif + +/* + * These EFLAGS bits are restored from saved value during emulation, and + * any changes are written back to the saved value after emulation. + */ +#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) + +/* Before executing instruction: restore necessary bits in EFLAGS. */ +#define _PRE_EFLAGS(_sav, _msk, _tmp) \ + /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \ + "push %"_sav"; " \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pushf; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pop %"_tmp"; " \ + "orl %"_LO32 _tmp",("_STK"); " \ + "popf; " \ + /* _sav &= ~msk; */ \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",%"_sav"; " + +/* After executing instruction: write-back necessary bits in EFLAGS. */ +#define _POST_EFLAGS(_sav, _msk, _tmp) \ + /* _sav |= EFLAGS & _msk; */ \ + "pushf; " \ + "pop %"_tmp"; " \ + "andl %"_msk",%"_LO32 _tmp"; " \ + "orl %"_LO32 _tmp",%"_sav"; " + +/* Raw emulation: instruction has two explicit operands. */ +#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"w %"_wx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _wy ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"l %"_lx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _ly ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + case 8: \ + __emulate_2op_8byte(_op, _src, _dst, \ + _eflags, _qx, _qy); \ + break; \ + } \ + } while (0) + +#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + switch ( (_dst).bytes ) \ + { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"b %"_bx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _by ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + default: \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + _wx, _wy, _lx, _ly, _qx, _qy); \ + break; \ + } \ + } while (0) + +/* Source operand is byte-sized and may be restricted to just %cl. */ +#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "c", "b", "c", "b", "c", "b", "c") + +/* Source operand is byte, word, long or quad sized. */ +#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "q", "w", "r", _LO32, "r", "", "r") + +/* Source operand is word, long or quad sized. */ +#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + "w", "r", _LO32, "r", "", "r") + +/* Instruction has only one explicit operand (no source operand). */ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + unsigned long _tmp; \ + \ + switch ( (_dst).bytes ) \ + { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"b %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"w %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"l %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 8: \ + __emulate_1op_8byte(_op, _dst, _eflags); \ + break; \ + } \ + } while (0) + +/* Emulate an instruction with quadword operands (x86/64 only). */ +#if defined(__x86_64__) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"q %"_qx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : _qy ((_src).val), "i" (EFLAGS_MASK) ); \ + } while (0) + +#define __emulate_1op_8byte(_op, _dst, _eflags) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"q %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + } while (0) + +#elif defined(__i386__) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) +#define __emulate_1op_8byte(_op, _dst, _eflags) +#endif /* __i386__ */ + +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch(_type, _size, _eip) \ +({ unsigned long _x; \ + rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ + (_size), ctxt); \ + if ( rc != 0 ) \ + goto done; \ + (_eip) += (_size); \ + (_type)_x; \ +}) + +/* Access/update address held in a register, based on addressing mode. */ +#define register_address(base, reg) \ + ((base) + ((ad_bytes == sizeof(unsigned long)) ? (reg) : \ + ((reg) & ((1UL << (ad_bytes << 3)) - 1)))) + +#define register_address_increment(reg, inc) \ + do { \ + /* signed type ensures sign extension to long */ \ + int _inc = (inc); \ + if ( ad_bytes == sizeof(unsigned long) ) \ + (reg) += _inc; \ + else \ + (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \ + (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ + } while (0) + +void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) +{ + void *p; + + p = ®s[modrm_reg]; + if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) + p = (unsigned char *)®s[modrm_reg & 3] + 1; + return p; +} + +static int read_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *ptr, + u16 *size, unsigned long *address, int op_bytes) +{ + int rc; + + if (op_bytes == 2) + op_bytes = 3; + *address = 0; + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, ctxt); + if (rc) + return rc; + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, ctxt); + return rc; +} + +int +x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + u8 b, d, sib, twobyte = 0, rex_prefix = 0; + u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; + unsigned long *override_base = NULL; + unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i; + int rc = 0; + struct operand src, dst; + unsigned long cr2 = ctxt->cr2; + int mode = ctxt->mode; + unsigned long modrm_ea; + int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; + + /* Shadow copy of register state. Committed on successful emulation. */ + unsigned long _regs[NR_VCPU_REGS]; + unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags; + unsigned long modrm_val = 0; + + memcpy(_regs, ctxt->vcpu->regs, sizeof _regs); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_PROT16: + op_bytes = ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + op_bytes = ad_bytes = 4; + break; +#ifdef __x86_64__ + case X86EMUL_MODE_PROT64: + op_bytes = 4; + ad_bytes = 8; + break; +#endif + default: + return -1; + } + + /* Legacy prefixes. */ + for (i = 0; i < 8; i++) { + switch (b = insn_fetch(u8, 1, _eip)) { + case 0x66: /* operand-size override */ + op_bytes ^= 6; /* switch between 2/4 bytes */ + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + ad_bytes ^= 12; /* switch between 4/8 bytes */ + else + ad_bytes ^= 6; /* switch between 2/4 bytes */ + break; + case 0x2e: /* CS override */ + override_base = &ctxt->cs_base; + break; + case 0x3e: /* DS override */ + override_base = &ctxt->ds_base; + break; + case 0x26: /* ES override */ + override_base = &ctxt->es_base; + break; + case 0x64: /* FS override */ + override_base = &ctxt->fs_base; + break; + case 0x65: /* GS override */ + override_base = &ctxt->gs_base; + break; + case 0x36: /* SS override */ + override_base = &ctxt->ss_base; + break; + case 0xf0: /* LOCK */ + lock_prefix = 1; + break; + case 0xf3: /* REP/REPE/REPZ */ + rep_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + break; + default: + goto done_prefixes; + } + } + +done_prefixes: + + /* REX prefix. */ + if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) { + rex_prefix = b; + if (b & 8) + op_bytes = 8; /* REX.W */ + modrm_reg = (b & 4) << 1; /* REX.R */ + index_reg = (b & 2) << 2; /* REX.X */ + modrm_rm = base_reg = (b & 1) << 3; /* REG.B */ + b = insn_fetch(u8, 1, _eip); + } + + /* Opcode byte(s). */ + d = opcode_table[b]; + if (d == 0) { + /* Two-byte opcode? */ + if (b == 0x0f) { + twobyte = 1; + b = insn_fetch(u8, 1, _eip); + d = twobyte_table[b]; + } + + /* Unrecognised? */ + if (d == 0) + goto cannot_emulate; + } + + /* ModRM and SIB bytes. */ + if (d & ModRM) { + modrm = insn_fetch(u8, 1, _eip); + modrm_mod |= (modrm & 0xc0) >> 6; + modrm_reg |= (modrm & 0x38) >> 3; + modrm_rm |= (modrm & 0x07); + modrm_ea = 0; + use_modrm_ea = 1; + + if (modrm_mod == 3) { + modrm_val = *(unsigned long *) + decode_register(modrm_rm, _regs, d & ByteOp); + goto modrm_done; + } + + if (ad_bytes == 2) { + unsigned bx = _regs[VCPU_REGS_RBX]; + unsigned bp = _regs[VCPU_REGS_RBP]; + unsigned si = _regs[VCPU_REGS_RSI]; + unsigned di = _regs[VCPU_REGS_RDI]; + + /* 16-bit ModR/M decode. */ + switch (modrm_mod) { + case 0: + if (modrm_rm == 6) + modrm_ea += insn_fetch(u16, 2, _eip); + break; + case 1: + modrm_ea += insn_fetch(s8, 1, _eip); + break; + case 2: + modrm_ea += insn_fetch(u16, 2, _eip); + break; + } + switch (modrm_rm) { + case 0: + modrm_ea += bx + si; + break; + case 1: + modrm_ea += bx + di; + break; + case 2: + modrm_ea += bp + si; + break; + case 3: + modrm_ea += bp + di; + break; + case 4: + modrm_ea += si; + break; + case 5: + modrm_ea += di; + break; + case 6: + if (modrm_mod != 0) + modrm_ea += bp; + break; + case 7: + modrm_ea += bx; + break; + } + if (modrm_rm == 2 || modrm_rm == 3 || + (modrm_rm == 6 && modrm_mod != 0)) + if (!override_base) + override_base = &ctxt->ss_base; + modrm_ea = (u16)modrm_ea; + } else { + /* 32/64-bit ModR/M decode. */ + switch (modrm_rm) { + case 4: + case 12: + sib = insn_fetch(u8, 1, _eip); + index_reg |= (sib >> 3) & 7; + base_reg |= sib & 7; + scale = sib >> 6; + + switch (base_reg) { + case 5: + if (modrm_mod != 0) + modrm_ea += _regs[base_reg]; + else + modrm_ea += insn_fetch(s32, 4, _eip); + break; + default: + modrm_ea += _regs[base_reg]; + } + switch (index_reg) { + case 4: + break; + default: + modrm_ea += _regs[index_reg] << scale; + + } + break; + case 5: + if (modrm_mod != 0) + modrm_ea += _regs[modrm_rm]; + else if (mode == X86EMUL_MODE_PROT64) + rip_relative = 1; + break; + default: + modrm_ea += _regs[modrm_rm]; + break; + } + switch (modrm_mod) { + case 0: + if (modrm_rm == 5) + modrm_ea += insn_fetch(s32, 4, _eip); + break; + case 1: + modrm_ea += insn_fetch(s8, 1, _eip); + break; + case 2: + modrm_ea += insn_fetch(s32, 4, _eip); + break; + } + } + if (!override_base) + override_base = &ctxt->ds_base; + if (mode == X86EMUL_MODE_PROT64 && + override_base != &ctxt->fs_base && + override_base != &ctxt->gs_base) + override_base = NULL; + + if (override_base) + modrm_ea += *override_base; + + if (rip_relative) { + modrm_ea += _eip; + switch (d & SrcMask) { + case SrcImmByte: + modrm_ea += 1; + break; + case SrcImm: + if (d & ByteOp) + modrm_ea += 1; + else + if (op_bytes == 8) + modrm_ea += 4; + else + modrm_ea += op_bytes; + } + } + if (ad_bytes != 8) + modrm_ea = (u32)modrm_ea; + cr2 = modrm_ea; + modrm_done: + ; + } + + /* Decode and fetch the destination operand: register or memory. */ + switch (d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + goto special_insn; + case DstReg: + dst.type = OP_REG; + if ((d & ByteOp) + && !(twobyte_table && (b == 0xb6 || b == 0xb7))) { + dst.ptr = decode_register(modrm_reg, _regs, + (rex_prefix == 0)); + dst.val = *(u8 *) dst.ptr; + dst.bytes = 1; + } else { + dst.ptr = decode_register(modrm_reg, _regs, 0); + switch ((dst.bytes = op_bytes)) { + case 2: + dst.val = *(u16 *)dst.ptr; + break; + case 4: + dst.val = *(u32 *)dst.ptr; + break; + case 8: + dst.val = *(u64 *)dst.ptr; + break; + } + } + break; + case DstMem: + dst.type = OP_MEM; + dst.ptr = (unsigned long *)cr2; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + if (!(d & Mov) && /* optimisation - avoid slow emulated read */ + ((rc = ops->read_emulated((unsigned long)dst.ptr, + &dst.val, dst.bytes, ctxt)) != 0)) + goto done; + break; + } + dst.orig_val = dst.val; + + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (d & SrcMask) { + case SrcNone: + break; + case SrcReg: + src.type = OP_REG; + if (d & ByteOp) { + src.ptr = decode_register(modrm_reg, _regs, + (rex_prefix == 0)); + src.val = src.orig_val = *(u8 *) src.ptr; + src.bytes = 1; + } else { + src.ptr = decode_register(modrm_reg, _regs, 0); + switch ((src.bytes = op_bytes)) { + case 2: + src.val = src.orig_val = *(u16 *) src.ptr; + break; + case 4: + src.val = src.orig_val = *(u32 *) src.ptr; + break; + case 8: + src.val = src.orig_val = *(u64 *) src.ptr; + break; + } + } + break; + case SrcMem16: + src.bytes = 2; + goto srcmem_common; + case SrcMem32: + src.bytes = 4; + goto srcmem_common; + case SrcMem: + src.bytes = (d & ByteOp) ? 1 : op_bytes; + srcmem_common: + src.type = OP_MEM; + src.ptr = (unsigned long *)cr2; + if ((rc = ops->read_emulated((unsigned long)src.ptr, + &src.val, src.bytes, ctxt)) != 0) + goto done; + src.orig_val = src.val; + break; + case SrcImm: + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = (d & ByteOp) ? 1 : op_bytes; + if (src.bytes == 8) + src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch (src.bytes) { + case 1: + src.val = insn_fetch(s8, 1, _eip); + break; + case 2: + src.val = insn_fetch(s16, 2, _eip); + break; + case 4: + src.val = insn_fetch(s32, 4, _eip); + break; + } + break; + case SrcImmByte: + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = 1; + src.val = insn_fetch(s8, 1, _eip); + break; + } + + if (twobyte) + goto twobyte_insn; + + switch (b) { + case 0x00 ... 0x05: + add: /* add */ + emulate_2op_SrcV("add", src, dst, _eflags); + break; + case 0x08 ... 0x0d: + or: /* or */ + emulate_2op_SrcV("or", src, dst, _eflags); + break; + case 0x10 ... 0x15: + adc: /* adc */ + emulate_2op_SrcV("adc", src, dst, _eflags); + break; + case 0x18 ... 0x1d: + sbb: /* sbb */ + emulate_2op_SrcV("sbb", src, dst, _eflags); + break; + case 0x20 ... 0x25: + and: /* and */ + emulate_2op_SrcV("and", src, dst, _eflags); + break; + case 0x28 ... 0x2d: + sub: /* sub */ + emulate_2op_SrcV("sub", src, dst, _eflags); + break; + case 0x30 ... 0x35: + xor: /* xor */ + emulate_2op_SrcV("xor", src, dst, _eflags); + break; + case 0x38 ... 0x3d: + cmp: /* cmp */ + emulate_2op_SrcV("cmp", src, dst, _eflags); + break; + case 0x63: /* movsxd */ + if (mode != X86EMUL_MODE_PROT64) + goto cannot_emulate; + dst.val = (s32) src.val; + break; + case 0x80 ... 0x83: /* Grp1 */ + switch (modrm_reg) { + case 0: + goto add; + case 1: + goto or; + case 2: + goto adc; + case 3: + goto sbb; + case 4: + goto and; + case 5: + goto sub; + case 6: + goto xor; + case 7: + goto cmp; + } + break; + case 0x84 ... 0x85: + test: /* test */ + emulate_2op_SrcV("test", src, dst, _eflags); + break; + case 0x86 ... 0x87: /* xchg */ + /* Write back the register source. */ + switch (dst.bytes) { + case 1: + *(u8 *) src.ptr = (u8) dst.val; + break; + case 2: + *(u16 *) src.ptr = (u16) dst.val; + break; + case 4: + *src.ptr = (u32) dst.val; + break; /* 64b reg: zero-extend */ + case 8: + *src.ptr = dst.val; + break; + } + /* + * Write back the memory destination with implicit LOCK + * prefix. + */ + dst.val = src.val; + lock_prefix = 1; + break; + case 0xa0 ... 0xa1: /* mov */ + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + dst.val = src.val; + _eip += ad_bytes; /* skip src displacement */ + break; + case 0xa2 ... 0xa3: /* mov */ + dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; + _eip += ad_bytes; /* skip dst displacement */ + break; + case 0x88 ... 0x8b: /* mov */ + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + dst.val = src.val; + break; + case 0x8f: /* pop (sole member of Grp1a) */ + /* 64-bit mode: POP always pops a 64-bit operand. */ + if (mode == X86EMUL_MODE_PROT64) + dst.bytes = 8; + if ((rc = ops->read_std(register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), + &dst.val, dst.bytes, ctxt)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); + break; + case 0xc0 ... 0xc1: + grp2: /* Grp2 */ + switch (modrm_reg) { + case 0: /* rol */ + emulate_2op_SrcB("rol", src, dst, _eflags); + break; + case 1: /* ror */ + emulate_2op_SrcB("ror", src, dst, _eflags); + break; + case 2: /* rcl */ + emulate_2op_SrcB("rcl", src, dst, _eflags); + break; + case 3: /* rcr */ + emulate_2op_SrcB("rcr", src, dst, _eflags); + break; + case 4: /* sal/shl */ + case 6: /* sal/shl */ + emulate_2op_SrcB("sal", src, dst, _eflags); + break; + case 5: /* shr */ + emulate_2op_SrcB("shr", src, dst, _eflags); + break; + case 7: /* sar */ + emulate_2op_SrcB("sar", src, dst, _eflags); + break; + } + break; + case 0xd0 ... 0xd1: /* Grp2 */ + src.val = 1; + goto grp2; + case 0xd2 ... 0xd3: /* Grp2 */ + src.val = _regs[VCPU_REGS_RCX]; + goto grp2; + case 0xf6 ... 0xf7: /* Grp3 */ + switch (modrm_reg) { + case 0 ... 1: /* test */ + /* + * Special case in Grp3: test has an immediate + * source operand. + */ + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = (d & ByteOp) ? 1 : op_bytes; + if (src.bytes == 8) + src.bytes = 4; + switch (src.bytes) { + case 1: + src.val = insn_fetch(s8, 1, _eip); + break; + case 2: + src.val = insn_fetch(s16, 2, _eip); + break; + case 4: + src.val = insn_fetch(s32, 4, _eip); + break; + } + goto test; + case 2: /* not */ + dst.val = ~dst.val; + break; + case 3: /* neg */ + emulate_1op("neg", dst, _eflags); + break; + default: + goto cannot_emulate; + } + break; + case 0xfe ... 0xff: /* Grp4/Grp5 */ + switch (modrm_reg) { + case 0: /* inc */ + emulate_1op("inc", dst, _eflags); + break; + case 1: /* dec */ + emulate_1op("dec", dst, _eflags); + break; + case 6: /* push */ + /* 64-bit mode: PUSH always pushes a 64-bit operand. */ + if (mode == X86EMUL_MODE_PROT64) { + dst.bytes = 8; + if ((rc = ops->read_std((unsigned long)dst.ptr, + &dst.val, 8, + ctxt)) != 0) + goto done; + } + register_address_increment(_regs[VCPU_REGS_RSP], + -dst.bytes); + if ((rc = ops->write_std( + register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), + dst.val, dst.bytes, ctxt)) != 0) + goto done; + dst.val = dst.orig_val; /* skanky: disable writeback */ + break; + default: + goto cannot_emulate; + } + break; + } + +writeback: + if ((d & Mov) || (dst.orig_val != dst.val)) { + switch (dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (dst.bytes) { + case 1: + *(u8 *)dst.ptr = (u8)dst.val; + break; + case 2: + *(u16 *)dst.ptr = (u16)dst.val; + break; + case 4: + *dst.ptr = (u32)dst.val; + break; /* 64b: zero-ext */ + case 8: + *dst.ptr = dst.val; + break; + } + break; + case OP_MEM: + if (lock_prefix) + rc = ops->cmpxchg_emulated((unsigned long)dst. + ptr, dst.orig_val, + dst.val, dst.bytes, + ctxt); + else + rc = ops->write_emulated((unsigned long)dst.ptr, + dst.val, dst.bytes, + ctxt); + if (rc != 0) + goto done; + default: + break; + } + } + + /* Commit shadow register state. */ + memcpy(ctxt->vcpu->regs, _regs, sizeof _regs); + ctxt->eflags = _eflags; + ctxt->vcpu->rip = _eip; + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + +special_insn: + if (twobyte) + goto twobyte_special_insn; + if (rep_prefix) { + if (_regs[VCPU_REGS_RCX] == 0) { + ctxt->vcpu->rip = _eip; + goto done; + } + _regs[VCPU_REGS_RCX]--; + _eip = ctxt->vcpu->rip; + } + switch (b) { + case 0xa4 ... 0xa5: /* movs */ + dst.type = OP_MEM; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)register_address(ctxt->es_base, + _regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated(register_address( + override_base ? *override_base : ctxt->ds_base, + _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + register_address_increment(_regs[VCPU_REGS_RDI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xa6 ... 0xa7: /* cmps */ + DPRINTF("Urk! I don't handle CMPS.\n"); + goto cannot_emulate; + case 0xaa ... 0xab: /* stos */ + dst.type = OP_MEM; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)cr2; + dst.val = _regs[VCPU_REGS_RAX]; + register_address_increment(_regs[VCPU_REGS_RDI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xac ... 0xad: /* lods */ + dst.type = OP_REG; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, ctxt)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xae ... 0xaf: /* scas */ + DPRINTF("Urk! I don't handle SCAS.\n"); + goto cannot_emulate; + } + goto writeback; + +twobyte_insn: + switch (b) { + case 0x01: /* lgdt, lidt, lmsw */ + switch (modrm_reg) { + u16 size; + unsigned long address; + + case 2: /* lgdt */ + rc = read_descriptor(ctxt, ops, src.ptr, + &size, &address, op_bytes); + if (rc) + goto done; + realmode_lgdt(ctxt->vcpu, size, address); + break; + case 3: /* lidt */ + rc = read_descriptor(ctxt, ops, src.ptr, + &size, &address, op_bytes); + if (rc) + goto done; + realmode_lidt(ctxt->vcpu, size, address); + break; + case 4: /* smsw */ + if (modrm_mod != 3) + goto cannot_emulate; + *(u16 *)&_regs[modrm_rm] + = realmode_get_cr(ctxt->vcpu, 0); + break; + case 6: /* lmsw */ + if (modrm_mod != 3) + goto cannot_emulate; + realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags); + break; + case 7: /* invlpg*/ + emulate_invlpg(ctxt->vcpu, cr2); + break; + default: + goto cannot_emulate; + } + break; + case 0x21: /* mov from dr to reg */ + if (modrm_mod != 3) + goto cannot_emulate; + rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]); + break; + case 0x23: /* mov from reg to dr */ + if (modrm_mod != 3) + goto cannot_emulate; + rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]); + break; + case 0x40 ... 0x4f: /* cmov */ + dst.val = dst.orig_val = src.val; + d &= ~Mov; /* default to no move */ + /* + * First, assume we're decoding an even cmov opcode + * (lsb == 0). + */ + switch ((b & 15) >> 1) { + case 0: /* cmovo */ + d |= (_eflags & EFLG_OF) ? Mov : 0; + break; + case 1: /* cmovb/cmovc/cmovnae */ + d |= (_eflags & EFLG_CF) ? Mov : 0; + break; + case 2: /* cmovz/cmove */ + d |= (_eflags & EFLG_ZF) ? Mov : 0; + break; + case 3: /* cmovbe/cmovna */ + d |= (_eflags & (EFLG_CF | EFLG_ZF)) ? Mov : 0; + break; + case 4: /* cmovs */ + d |= (_eflags & EFLG_SF) ? Mov : 0; + break; + case 5: /* cmovp/cmovpe */ + d |= (_eflags & EFLG_PF) ? Mov : 0; + break; + case 7: /* cmovle/cmovng */ + d |= (_eflags & EFLG_ZF) ? Mov : 0; + /* fall through */ + case 6: /* cmovl/cmovnge */ + d |= (!(_eflags & EFLG_SF) != + !(_eflags & EFLG_OF)) ? Mov : 0; + break; + } + /* Odd cmov opcodes (lsb == 1) have inverted sense. */ + d ^= (b & 1) ? Mov : 0; + break; + case 0xb0 ... 0xb1: /* cmpxchg */ + /* + * Save real source value, then compare EAX against + * destination. + */ + src.orig_val = src.val; + src.val = _regs[VCPU_REGS_RAX]; + emulate_2op_SrcV("cmp", src, dst, _eflags); + /* Always write back. The question is: where to? */ + d |= Mov; + if (_eflags & EFLG_ZF) { + /* Success: write back to memory. */ + dst.val = src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + dst.type = OP_REG; + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + } + break; + case 0xa3: + bt: /* bt */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); + break; + case 0xb3: + btr: /* btr */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); + break; + case 0xab: + bts: /* bts */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); + break; + case 0xb6 ... 0xb7: /* movzx */ + dst.bytes = op_bytes; + dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; + break; + case 0xbb: + btc: /* btc */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); + break; + case 0xba: /* Grp8 */ + switch (modrm_reg & 3) { + case 0: + goto bt; + case 1: + goto bts; + case 2: + goto btr; + case 3: + goto btc; + } + break; + case 0xbe ... 0xbf: /* movsx */ + dst.bytes = op_bytes; + dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val; + break; + } + goto writeback; + +twobyte_special_insn: + /* Disable writeback. */ + dst.orig_val = dst.val; + switch (b) { + case 0x0d: /* GrpP (prefetch) */ + case 0x18: /* Grp16 (prefetch/nop) */ + break; + case 0x06: + emulate_clts(ctxt->vcpu); + break; + case 0x20: /* mov cr, reg */ + if (modrm_mod != 3) + goto cannot_emulate; + _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg); + break; + case 0x22: /* mov reg, cr */ + if (modrm_mod != 3) + goto cannot_emulate; + realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); + break; + case 0xc7: /* Grp9 (cmpxchg8b) */ +#if defined(__i386__) + { + unsigned long old_lo, old_hi; + if (((rc = ops->read_emulated(cr2 + 0, &old_lo, 4, + ctxt)) != 0) + || ((rc = ops->read_emulated(cr2 + 4, &old_hi, 4, + ctxt)) != 0)) + goto done; + if ((old_lo != _regs[VCPU_REGS_RAX]) + || (old_hi != _regs[VCPU_REGS_RDI])) { + _regs[VCPU_REGS_RAX] = old_lo; + _regs[VCPU_REGS_RDX] = old_hi; + _eflags &= ~EFLG_ZF; + } else if (ops->cmpxchg8b_emulated == NULL) { + rc = X86EMUL_UNHANDLEABLE; + goto done; + } else { + if ((rc = ops->cmpxchg8b_emulated(cr2, old_lo, + old_hi, + _regs[VCPU_REGS_RBX], + _regs[VCPU_REGS_RCX], + ctxt)) != 0) + goto done; + _eflags |= EFLG_ZF; + } + break; + } +#elif defined(__x86_64__) + { + unsigned long old, new; + if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0) + goto done; + if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || + ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { + _regs[VCPU_REGS_RAX] = (u32) (old >> 0); + _regs[VCPU_REGS_RDX] = (u32) (old >> 32); + _eflags &= ~EFLG_ZF; + } else { + new = (_regs[VCPU_REGS_RCX] << 32) | (u32) _regs[VCPU_REGS_RBX]; + if ((rc = ops->cmpxchg_emulated(cr2, old, + new, 8, ctxt)) != 0) + goto done; + _eflags |= EFLG_ZF; + } + break; + } +#endif + } + goto writeback; + +cannot_emulate: + DPRINTF("Cannot emulate %02x\n", b); + return -1; +} + +#ifdef __XEN__ + +#include <asm/mm.h> +#include <asm/uaccess.h> + +int +x86_emulate_read_std(unsigned long addr, + unsigned long *val, + unsigned int bytes, struct x86_emulate_ctxt *ctxt) +{ + unsigned int rc; + + *val = 0; + + if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) { + propagate_page_fault(addr + bytes - rc, 0); /* read fault */ + return X86EMUL_PROPAGATE_FAULT; + } + + return X86EMUL_CONTINUE; +} + +int +x86_emulate_write_std(unsigned long addr, + unsigned long val, + unsigned int bytes, struct x86_emulate_ctxt *ctxt) +{ + unsigned int rc; + + if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) { + propagate_page_fault(addr + bytes - rc, PGERR_write_access); + return X86EMUL_PROPAGATE_FAULT; + } + + return X86EMUL_CONTINUE; +} + +#endif diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h new file mode 100644 index 00000000000..658b58de30f --- /dev/null +++ b/drivers/kvm/x86_emulate.h @@ -0,0 +1,185 @@ +/****************************************************************************** + * x86_emulate.h + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef __X86_EMULATE_H__ +#define __X86_EMULATE_H__ + +struct x86_emulate_ctxt; + +/* + * x86_emulate_ops: + * + * These operations represent the instruction emulator's interface to memory. + * There are two categories of operation: those that act on ordinary memory + * regions (*_std), and those that act on memory regions known to require + * special treatment or emulation (*_emulated). + * + * The emulator assumes that an instruction accesses only one 'emulated memory' + * location, that this location is the given linear faulting address (cr2), and + * that this is one of the instruction's data operands. Instruction fetches and + * stack operations are assumed never to access emulated memory. The emulator + * automatically deduces which operand of a string-move operation is accessing + * emulated memory, and assumes that the other operand accesses normal memory. + * + * NOTES: + * 1. The emulator isn't very smart about emulated vs. standard memory. + * 'Emulated memory' access addresses should be checked for sanity. + * 'Normal memory' accesses may fault, and the caller must arrange to + * detect and handle reentrancy into the emulator via recursive faults. + * Accesses may be unaligned and may cross page boundaries. + * 2. If the access fails (cannot emulate, or a standard access faults) then + * it is up to the memop to propagate the fault to the guest VM via + * some out-of-band mechanism, unknown to the emulator. The memop signals + * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will + * then immediately bail. + * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only + * cmpxchg8b_emulated need support 8-byte accesses. + * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. + */ +/* Access completed successfully: continue emulation as normal. */ +#define X86EMUL_CONTINUE 0 +/* Access is unhandleable: bail from emulation and return error to caller. */ +#define X86EMUL_UNHANDLEABLE 1 +/* Terminate emulation but return success to the caller. */ +#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ +#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ +#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ +struct x86_emulate_ops { + /* + * read_std: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch, stack operations, and others. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_std)(unsigned long addr, + unsigned long *val, + unsigned int bytes, struct x86_emulate_ctxt * ctxt); + + /* + * write_std: Write bytes of standard (non-emulated/special) memory. + * Used for stack operations, and others. + * @addr: [IN ] Linear address to which to write. + * @val: [IN ] Value to write to memory (low-order bytes used as + * required). + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_std)(unsigned long addr, + unsigned long val, + unsigned int bytes, struct x86_emulate_ctxt * ctxt); + + /* + * read_emulated: Read bytes from emulated/special memory area. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_emulated) (unsigned long addr, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt * ctxt); + + /* + * write_emulated: Read bytes from emulated/special memory area. + * @addr: [IN ] Linear address to which to write. + * @val: [IN ] Value to write to memory (low-order bytes used as + * required). + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_emulated) (unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt * ctxt); + + /* + * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an + * emulated/special memory area. + * @addr: [IN ] Linear address to access. + * @old: [IN ] Value expected to be current at @addr. + * @new: [IN ] Value to write to @addr. + * @bytes: [IN ] Number of bytes to access using CMPXCHG. + */ + int (*cmpxchg_emulated) (unsigned long addr, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt * ctxt); + + /* + * cmpxchg8b_emulated: Emulate an atomic (LOCKed) CMPXCHG8B operation on an + * emulated/special memory area. + * @addr: [IN ] Linear address to access. + * @old: [IN ] Value expected to be current at @addr. + * @new: [IN ] Value to write to @addr. + * NOTES: + * 1. This function is only ever called when emulating a real CMPXCHG8B. + * 2. This function is *never* called on x86/64 systems. + * 2. Not defining this function (i.e., specifying NULL) is equivalent + * to defining a function that always returns X86EMUL_UNHANDLEABLE. + */ + int (*cmpxchg8b_emulated) (unsigned long addr, + unsigned long old_lo, + unsigned long old_hi, + unsigned long new_lo, + unsigned long new_hi, + struct x86_emulate_ctxt * ctxt); +}; + +struct cpu_user_regs; + +struct x86_emulate_ctxt { + /* Register state before/after emulation. */ + struct kvm_vcpu *vcpu; + + /* Linear faulting address (if emulating a page-faulting instruction). */ + unsigned long eflags; + unsigned long cr2; + + /* Emulated execution mode, represented by an X86EMUL_MODE value. */ + int mode; + + unsigned long cs_base; + unsigned long ds_base; + unsigned long es_base; + unsigned long ss_base; + unsigned long gs_base; + unsigned long fs_base; +}; + +/* Execution mode, passed to the emulator. */ +#define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ +#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ +#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ + +/* Host execution mode. */ +#if defined(__i386__) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 +#elif defined(__x86_64__) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 +#endif + +/* + * x86_emulate_memop: Emulate an instruction that faulted attempting to + * read/write a 'special' memory area. + * Returns -1 on failure, 0 on success. + */ +int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); + +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. + */ +void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs); + +#endif /* __X86_EMULATE_H__ */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 53bd46dba0c..21e2a7b0884 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3314,6 +3314,10 @@ static int do_md_stop(mddev_t * mddev, int mode) module_put(mddev->pers->owner); mddev->pers = NULL; + + set_capacity(disk, 0); + mddev->changed = 1; + if (mddev->ro) mddev->ro = 0; } @@ -3333,7 +3337,7 @@ static int do_md_stop(mddev_t * mddev, int mode) if (mode == 0) { mdk_rdev_t *rdev; struct list_head *tmp; - struct gendisk *disk; + printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); @@ -3358,10 +3362,6 @@ static int do_md_stop(mddev_t * mddev, int mode) mddev->raid_disks = 0; mddev->recovery_cp = 0; - disk = mddev->gendisk; - if (disk) - set_capacity(disk, 0); - mddev->changed = 1; } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", mdname(mddev)); @@ -3371,6 +3371,7 @@ out: return err; } +#ifndef MODULE static void autorun_array(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -3485,6 +3486,7 @@ static void autorun_devices(int part) } printk(KERN_INFO "md: ... autorun DONE.\n"); } +#endif /* !MODULE */ static int get_version(void __user * arg) { @@ -3722,6 +3724,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (err) export_rdev(rdev); + md_update_sb(mddev, 1); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); return err; @@ -5273,7 +5276,6 @@ void md_do_sync(mddev_t *mddev) mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && - test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { @@ -5297,6 +5299,7 @@ void md_do_sync(mddev_t *mddev) rdev->recovery_offset = mddev->curr_resync; } } + set_bit(MD_CHANGE_DEVS, &mddev->flags); skip: mddev->curr_resync = 0; @@ -5593,7 +5596,7 @@ static void autostart_arrays(int part) autorun_devices(part); } -#endif +#endif /* !MODULE */ static __exit void md_exit(void) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 656fae912fe..b3c5e12f081 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1951,6 +1951,7 @@ static int run(mddev_t *mddev) !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + conf->fullsync = 1; } } if (mddev->degraded == conf->raid_disks) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 52914d5cec7..377f8bc9b78 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -134,6 +134,8 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (!test_bit(STRIPE_EXPANDING, &sh->state)) { list_add_tail(&sh->lru, &conf->inactive_list); wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); } } } @@ -542,35 +544,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, } if (uptodate) { -#if 0 - struct bio *bio; - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - /* we can return a buffer if we bypassed the cache or - * if the top buffer is not in highmem. If there are - * multiple buffers, leave the extra work to - * handle_stripe - */ - buffer = sh->bh_read[i]; - if (buffer && - (!PageHighMem(buffer->b_page) - || buffer->b_page == bh->b_page ) - ) { - sh->bh_read[i] = buffer->b_reqnext; - buffer->b_reqnext = NULL; - } else - buffer = NULL; - spin_unlock_irqrestore(&conf->device_lock, flags); - if (sh->bh_page[i]==bh->b_page) - set_buffer_uptodate(bh); - if (buffer) { - if (buffer->b_page != bh->b_page) - memcpy(buffer->b_data, bh->b_data, bh->b_size); - buffer->b_end_io(buffer, 1); - } -#else set_bit(R5_UPTODATE, &sh->dev[i].flags); -#endif if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", @@ -616,14 +590,6 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, } } rdev_dec_pending(conf->disks[i].rdev, conf->mddev); -#if 0 - /* must restore b_page before unlocking buffer... */ - if (sh->bh_page[i] != bh->b_page) { - bh->b_page = sh->bh_page[i]; - bh->b_data = page_address(bh->b_page); - clear_buffer_uptodate(bh); - } -#endif clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -821,7 +787,8 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, static sector_t compute_blocknr(struct stripe_head *sh, int i) { raid5_conf_t *conf = sh->raid_conf; - int raid_disks = sh->disks, data_disks = raid_disks - 1; + int raid_disks = sh->disks; + int data_disks = raid_disks - conf->max_degraded; sector_t new_sector = sh->sector, check; int sectors_per_chunk = conf->chunk_size >> 9; sector_t stripe; @@ -857,7 +824,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) } break; case 6: - data_disks = raid_disks - 2; if (i == raid6_next_disk(sh->pd_idx, raid_disks)) return 0; /* It is the Q disk */ switch (conf->algorithm) { @@ -1353,8 +1319,10 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) int pd_idx, dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); - raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk - + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf); + raid5_compute_sector(stripe * (disks - conf->max_degraded) + *sectors_per_chunk + chunk_offset, + disks, disks - conf->max_degraded, + &dd_idx, &pd_idx, conf); return pd_idx; } @@ -1615,15 +1583,6 @@ static void handle_stripe5(struct stripe_head *sh) } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); -#if 0 - /* if I am just reading this block and we don't have - a failed drive, or any pending writes then sidestep the cache */ - if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && - ! syncing && !failed && !to_write) { - sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; - sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; - } -#endif locked++; PRINTK("Reading block %d (sync=%d)\n", i, syncing); @@ -1641,9 +1600,6 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && (!test_bit(R5_LOCKED, &dev->flags) -#if 0 -|| sh->bh_page[i]!=bh->b_page -#endif ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags) @@ -1655,9 +1611,6 @@ static void handle_stripe5(struct stripe_head *sh) /* Would I have to read this buffer for reconstruct_write */ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && (!test_bit(R5_LOCKED, &dev->flags) -#if 0 -|| sh->bh_page[i] != bh->b_page -#endif ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags)) rcw++; @@ -1865,7 +1818,9 @@ static void handle_stripe5(struct stripe_head *sh) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); + bi->bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 0 : -EIO); } for (i=disks; i-- ;) { int rw; @@ -2193,15 +2148,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); -#if 0 - /* if I am just reading this block and we don't have - a failed drive, or any pending writes then sidestep the cache */ - if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && - ! syncing && !failed && !to_write) { - sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; - sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; - } -#endif locked++; PRINTK("Reading block %d (sync=%d)\n", i, syncing); @@ -2220,9 +2166,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (!test_bit(R5_OVERWRITE, &dev->flags) && i != pd_idx && i != qd_idx && (!test_bit(R5_LOCKED, &dev->flags) -#if 0 - || sh->bh_page[i] != bh->b_page -#endif ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags)) rcw++; @@ -2418,7 +2361,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); + bi->bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 0 : -EIO); } for (i=disks; i-- ;) { int rw; @@ -2611,6 +2556,180 @@ static int raid5_congested(void *data, int bits) return 0; } +/* We want read requests to align with chunks where possible, + * but write requests don't need to. + */ +static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) +{ + mddev_t *mddev = q->queuedata; + sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + int max; + unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int bio_sectors = bio->bi_size >> 9; + + if (bio_data_dir(bio)) + return biovec->bv_len; /* always allow writes to be mergeable */ + + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; + if (max < 0) max = 0; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + else + return max; +} + + +static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) +{ + sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int bio_sectors = bio->bi_size >> 9; + + return chunk_sectors >= + ((sector & (chunk_sectors - 1)) + bio_sectors); +} + +/* + * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) + * later sampled by raid5d. + */ +static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) +{ + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + + bi->bi_next = conf->retry_read_aligned_list; + conf->retry_read_aligned_list = bi; + + spin_unlock_irqrestore(&conf->device_lock, flags); + md_wakeup_thread(conf->mddev->thread); +} + + +static struct bio *remove_bio_from_retry(raid5_conf_t *conf) +{ + struct bio *bi; + + bi = conf->retry_read_aligned; + if (bi) { + conf->retry_read_aligned = NULL; + return bi; + } + bi = conf->retry_read_aligned_list; + if(bi) { + conf->retry_read_aligned = bi->bi_next; + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* biased count of active stripes */ + bi->bi_hw_segments = 0; /* count of processed stripes */ + } + + return bi; +} + + +/* + * The "raid5_align_endio" should check if the read succeeded and if it + * did, call bio_endio on the original bio (having bio_put the new bio + * first). + * If the read failed.. + */ +static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) +{ + struct bio* raid_bi = bi->bi_private; + mddev_t *mddev; + raid5_conf_t *conf; + int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + mdk_rdev_t *rdev; + + if (bi->bi_size) + return 1; + bio_put(bi); + + mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; + conf = mddev_to_conf(mddev); + rdev = (void*)raid_bi->bi_next; + raid_bi->bi_next = NULL; + + rdev_dec_pending(rdev, conf->mddev); + + if (!error && uptodate) { + bio_endio(raid_bi, bytes, 0); + if (atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_stripe); + return 0; + } + + + PRINTK("raid5_align_endio : io error...handing IO for a retry\n"); + + add_bio_to_retry(raid_bi, conf); + return 0; +} + +static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) +{ + mddev_t *mddev = q->queuedata; + raid5_conf_t *conf = mddev_to_conf(mddev); + const unsigned int raid_disks = conf->raid_disks; + const unsigned int data_disks = raid_disks - conf->max_degraded; + unsigned int dd_idx, pd_idx; + struct bio* align_bi; + mdk_rdev_t *rdev; + + if (!in_chunk_boundary(mddev, raid_bio)) { + printk("chunk_aligned_read : non aligned\n"); + return 0; + } + /* + * use bio_clone to make a copy of the bio + */ + align_bi = bio_clone(raid_bio, GFP_NOIO); + if (!align_bi) + return 0; + /* + * set bi_end_io to a new function, and set bi_private to the + * original bio. + */ + align_bi->bi_end_io = raid5_align_endio; + align_bi->bi_private = raid_bio; + /* + * compute position + */ + align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, + raid_disks, + data_disks, + &dd_idx, + &pd_idx, + conf); + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[dd_idx].rdev); + if (rdev && test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + raid_bio->bi_next = (void*)rdev; + align_bi->bi_bdev = rdev->bdev; + align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); + align_bi->bi_sector += rdev->data_offset; + + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0, + conf->device_lock, /* nothing */); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock); + + generic_make_request(align_bi); + return 1; + } else { + rcu_read_unlock(); + bio_put(align_bi); + return 0; + } +} + + static int make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q->queuedata; @@ -2632,6 +2751,11 @@ static int make_request(request_queue_t *q, struct bio * bi) disk_stat_inc(mddev->gendisk, ios[rw]); disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); + if (bio_data_dir(bi) == READ && + mddev->reshape_position == MaxSector && + chunk_aligned_read(q,bi)) + return 0; + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); bi->bi_next = NULL; @@ -2739,7 +2863,9 @@ static int make_request(request_queue_t *q, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); + bi->bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 0 : -EIO); } return 0; } @@ -2950,6 +3076,74 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski return STRIPE_SECTORS; } +static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) +{ + /* We may not be able to submit a whole bio at once as there + * may not be enough stripe_heads available. + * We cannot pre-allocate enough stripe_heads as we may need + * more than exist in the cache (if we allow ever large chunks). + * So we do one stripe head at a time and record in + * ->bi_hw_segments how many have been done. + * + * We *know* that this entire raid_bio is in one chunk, so + * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. + */ + struct stripe_head *sh; + int dd_idx, pd_idx; + sector_t sector, logical_sector, last_sector; + int scnt = 0; + int remaining; + int handled = 0; + + logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + sector = raid5_compute_sector( logical_sector, + conf->raid_disks, + conf->raid_disks - conf->max_degraded, + &dd_idx, + &pd_idx, + conf); + last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); + + for (; logical_sector < last_sector; + logical_sector += STRIPE_SECTORS, scnt++) { + + if (scnt < raid_bio->bi_hw_segments) + /* already done this stripe */ + continue; + + sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); + + if (!sh) { + /* failed to get a stripe - must wait */ + raid_bio->bi_hw_segments = scnt; + conf->retry_read_aligned = raid_bio; + return handled; + } + + set_bit(R5_ReadError, &sh->dev[dd_idx].flags); + add_stripe_bio(sh, raid_bio, dd_idx, 0); + handle_stripe(sh, NULL); + release_stripe(sh); + handled++; + } + spin_lock_irq(&conf->device_lock); + remaining = --raid_bio->bi_phys_segments; + spin_unlock_irq(&conf->device_lock); + if (remaining == 0) { + int bytes = raid_bio->bi_size; + + raid_bio->bi_size = 0; + raid_bio->bi_end_io(raid_bio, bytes, + test_bit(BIO_UPTODATE, &raid_bio->bi_flags) + ? 0 : -EIO); + } + if (atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_stripe); + return handled; +} + + + /* * This is our raid5 kernel thread. * @@ -2971,6 +3165,7 @@ static void raid5d (mddev_t *mddev) spin_lock_irq(&conf->device_lock); while (1) { struct list_head *first; + struct bio *bio; if (conf->seq_flush != conf->seq_write) { int seq = conf->seq_flush; @@ -2987,6 +3182,16 @@ static void raid5d (mddev_t *mddev) !list_empty(&conf->delayed_list)) raid5_activate_delayed(conf); + while ((bio = remove_bio_from_retry(conf))) { + int ok; + spin_unlock_irq(&conf->device_lock); + ok = retry_aligned_read(conf, bio); + spin_lock_irq(&conf->device_lock); + if (!ok) + break; + handled++; + } + if (list_empty(&conf->handle_list)) break; @@ -3174,6 +3379,7 @@ static int run(mddev_t *mddev) INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); + atomic_set(&conf->active_aligned_reads, 0); PRINTK("raid5: run(%s) called.\n", mdname(mddev)); @@ -3320,6 +3526,8 @@ static int run(mddev_t *mddev) mddev->array_size = mddev->size * (conf->previous_raid_disks - conf->max_degraded); + blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + return 0; abort: if (conf) { @@ -3694,7 +3902,8 @@ static void raid5_quiesce(mddev_t *mddev, int state) spin_lock_irq(&conf->device_lock); conf->quiesce = 1; wait_event_lock_irq(conf->wait_for_stripe, - atomic_read(&conf->active_stripes) == 0, + atomic_read(&conf->active_stripes) == 0 && + atomic_read(&conf->active_aligned_reads) == 0, conf->device_lock, /* nothing */); spin_unlock_irq(&conf->device_lock); break; diff --git a/drivers/parport/Kconfig b/drivers/parport/Kconfig index c7fa28a28b9..36c6a1bfe55 100644 --- a/drivers/parport/Kconfig +++ b/drivers/parport/Kconfig @@ -82,9 +82,6 @@ config PARPORT_PC_PCMCIA Say Y here if you need PCMCIA support for your PC-style parallel ports. If unsure, say N. -config PARPORT_NOT_PC - bool - config PARPORT_IP32 tristate "SGI IP32 builtin port (EXPERIMENTAL)" depends on SGI_IP32 && PARPORT && EXPERIMENTAL @@ -158,5 +155,8 @@ config PARPORT_1284 transfer modes. Also say Y if you want device ID information to appear in /proc/sys/dev/parport/*/autoprobe*. It is safe to say N. +config PARPORT_NOT_PC + bool + endmenu diff --git a/drivers/pci/hotplug/Kconfig b/drivers/pci/hotplug/Kconfig index 6e780db9454..adce4204d87 100644 --- a/drivers/pci/hotplug/Kconfig +++ b/drivers/pci/hotplug/Kconfig @@ -76,7 +76,8 @@ config HOTPLUG_PCI_IBM config HOTPLUG_PCI_ACPI tristate "ACPI PCI Hotplug driver" - depends on (!ACPI_DOCK && ACPI && HOTPLUG_PCI) || (ACPI_DOCK && HOTPLUG_PCI) + depends on HOTPLUG_PCI + depends on (!ACPI_DOCK && ACPI) || (ACPI_DOCK) help Say Y here if you have a system that supports PCI Hotplug using ACPI. diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 2a63ab2b47f..09660e2ab05 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -288,7 +288,7 @@ config RTC_DRV_PL031 To compile this driver as a module, choose M here: the module will be called rtc-pl031. -config RTC_DRV_AT91 +config RTC_DRV_AT91RM9200 tristate "AT91RM9200" depends on RTC_CLASS && ARCH_AT91RM9200 help diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index bd4c45d333f..e6beedacc96 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -35,5 +35,5 @@ obj-$(CONFIG_RTC_DRV_VR41XX) += rtc-vr41xx.o obj-$(CONFIG_RTC_DRV_PL031) += rtc-pl031.o obj-$(CONFIG_RTC_DRV_MAX6902) += rtc-max6902.o obj-$(CONFIG_RTC_DRV_V3020) += rtc-v3020.o -obj-$(CONFIG_RTC_DRV_AT91) += rtc-at91.o +obj-$(CONFIG_RTC_DRV_AT91RM9200)+= rtc-at91rm9200.o obj-$(CONFIG_RTC_DRV_SH) += rtc-sh.o diff --git a/drivers/rtc/rtc-at91.c b/drivers/rtc/rtc-at91rm9200.c index 5c8addcaf1f..5c8addcaf1f 100644 --- a/drivers/rtc/rtc-at91.c +++ b/drivers/rtc/rtc-at91rm9200.c diff --git a/drivers/rtc/rtc-ds1672.c b/drivers/rtc/rtc-ds1672.c index dfef1637bfb..205fa28593b 100644 --- a/drivers/rtc/rtc-ds1672.c +++ b/drivers/rtc/rtc-ds1672.c @@ -199,7 +199,7 @@ static int ds1672_probe(struct i2c_adapter *adapter, int address, int kind) struct i2c_client *client; struct rtc_device *rtc; - dev_dbg(&adapter->dev, "%s\n", __FUNCTION__); + dev_dbg(adapter->class_dev.dev, "%s\n", __FUNCTION__); if (!i2c_check_functionality(adapter, I2C_FUNC_I2C)) { err = -ENODEV; diff --git a/drivers/rtc/rtc-lib.c b/drivers/rtc/rtc-lib.c index ba795a4db1e..7bbc26a34bd 100644 --- a/drivers/rtc/rtc-lib.c +++ b/drivers/rtc/rtc-lib.c @@ -117,4 +117,85 @@ int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time) } EXPORT_SYMBOL(rtc_tm_to_time); + +/* Merge the valid (i.e. non-negative) fields of alarm into the current + * time. If the valid alarm fields are earlier than the equivalent + * fields in the time, carry one into the least significant invalid + * field, so that the alarm expiry is in the future. It assumes that the + * least significant invalid field is more significant than the most + * significant valid field, and that the seconds field is valid. + * + * This is used by alarms that take relative (rather than absolute) + * times, and/or have a simple binary second counter instead of + * day/hour/minute/sec registers. + */ +void rtc_merge_alarm(struct rtc_time *now, struct rtc_time *alarm) +{ + int *alarmp = &alarm->tm_sec; + int *timep = &now->tm_sec; + int carry_into, i; + + /* Ignore everything past the 6th element (tm_year). */ + for (i = 5; i > 0; i--) { + if (alarmp[i] < 0) + alarmp[i] = timep[i]; + else + break; + } + + /* No carry needed if all fields are valid. */ + if (i == 5) + return; + + for (carry_into = i + 1; i >= 0; i--) { + if (alarmp[i] < timep[i]) + break; + + if (alarmp[i] > timep[i]) + return; + } + + switch (carry_into) { + case 1: + alarm->tm_min++; + + if (alarm->tm_min < 60) + return; + + alarm->tm_min = 0; + /* fall-through */ + + case 2: + alarm->tm_hour++; + + if (alarm->tm_hour < 60) + return; + + alarm->tm_hour = 0; + /* fall-through */ + + case 3: + alarm->tm_mday++; + + if (alarm->tm_mday <= rtc_days_in_month[alarm->tm_mon]) + return; + + alarm->tm_mday = 1; + /* fall-through */ + + case 4: + alarm->tm_mon++; + + if (alarm->tm_mon <= 12) + return; + + alarm->tm_mon = 1; + /* fall-through */ + + case 5: + alarm->tm_year++; + } +} +EXPORT_SYMBOL(rtc_merge_alarm); + MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c index a760cf69af9..4b72b8ef5d6 100644 --- a/drivers/rtc/rtc-pcf8563.c +++ b/drivers/rtc/rtc-pcf8563.c @@ -192,7 +192,7 @@ static int pcf8563_validate_client(struct i2c_client *client) xfer = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs)); if (xfer != ARRAY_SIZE(msgs)) { - dev_err(&client->adapter->dev, + dev_err(&client->dev, "%s: could not read register 0x%02X\n", __FUNCTION__, pattern[i].reg); @@ -203,7 +203,7 @@ static int pcf8563_validate_client(struct i2c_client *client) if (value > pattern[i].max || value < pattern[i].min) { - dev_dbg(&client->adapter->dev, + dev_dbg(&client->dev, "%s: pattern=%d, reg=%x, mask=0x%02x, min=%d, " "max=%d, value=%d, raw=0x%02X\n", __FUNCTION__, i, pattern[i].reg, pattern[i].mask, @@ -253,7 +253,7 @@ static int pcf8563_probe(struct i2c_adapter *adapter, int address, int kind) int err = 0; - dev_dbg(&adapter->dev, "%s\n", __FUNCTION__); + dev_dbg(adapter->class_dev.dev, "%s\n", __FUNCTION__); if (!i2c_check_functionality(adapter, I2C_FUNC_I2C)) { err = -ENODEV; diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c index e2c7698fdba..1460f6b769f 100644 --- a/drivers/rtc/rtc-rs5c372.c +++ b/drivers/rtc/rtc-rs5c372.c @@ -200,7 +200,7 @@ static int rs5c372_probe(struct i2c_adapter *adapter, int address, int kind) struct i2c_client *client; struct rs5c372 *rs5c372; - dev_dbg(&adapter->dev, "%s\n", __FUNCTION__); + dev_dbg(adapter->class_dev.dev, "%s\n", __FUNCTION__); if (!i2c_check_functionality(adapter, I2C_FUNC_I2C)) { err = -ENODEV; diff --git a/drivers/rtc/rtc-x1205.c b/drivers/rtc/rtc-x1205.c index 9a67487d086..019ae255b0c 100644 --- a/drivers/rtc/rtc-x1205.c +++ b/drivers/rtc/rtc-x1205.c @@ -372,7 +372,7 @@ static int x1205_validate_client(struct i2c_client *client) }; if ((xfer = i2c_transfer(client->adapter, msgs, 2)) != 2) { - dev_err(&client->adapter->dev, + dev_err(&client->dev, "%s: could not read register %x\n", __FUNCTION__, probe_zero_pattern[i]); @@ -380,7 +380,7 @@ static int x1205_validate_client(struct i2c_client *client) } if ((buf & probe_zero_pattern[i+1]) != 0) { - dev_err(&client->adapter->dev, + dev_err(&client->dev, "%s: register=%02x, zero pattern=%d, value=%x\n", __FUNCTION__, probe_zero_pattern[i], i, buf); @@ -400,7 +400,7 @@ static int x1205_validate_client(struct i2c_client *client) }; if ((xfer = i2c_transfer(client->adapter, msgs, 2)) != 2) { - dev_err(&client->adapter->dev, + dev_err(&client->dev, "%s: could not read register %x\n", __FUNCTION__, probe_limits_pattern[i].reg); @@ -411,7 +411,7 @@ static int x1205_validate_client(struct i2c_client *client) if (value > probe_limits_pattern[i].max || value < probe_limits_pattern[i].min) { - dev_dbg(&client->adapter->dev, + dev_dbg(&client->dev, "%s: register=%x, lim pattern=%d, value=%d\n", __FUNCTION__, probe_limits_pattern[i].reg, i, value); @@ -506,7 +506,7 @@ static int x1205_probe(struct i2c_adapter *adapter, int address, int kind) struct i2c_client *client; struct rtc_device *rtc; - dev_dbg(&adapter->dev, "%s\n", __FUNCTION__); + dev_dbg(adapter->class_dev.dev, "%s\n", __FUNCTION__); if (!i2c_check_functionality(adapter, I2C_FUNC_I2C)) { err = -ENODEV; diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig index fc12d5df10e..0b36dd5cdac 100644 --- a/drivers/serial/Kconfig +++ b/drivers/serial/Kconfig @@ -151,32 +151,6 @@ config SERIAL_8250_MANY_PORTS say N here to save some memory. You can also say Y if you have an "intelligent" multiport card such as Cyclades, Digiboards, etc. -config SERIAL_8250_SHARE_IRQ - bool "Support for sharing serial interrupts" - depends on SERIAL_8250_EXTENDED - help - Some serial boards have hardware support which allows multiple dumb - serial ports on the same board to share a single IRQ. To enable - support for this in the serial driver, say Y here. - -config SERIAL_8250_DETECT_IRQ - bool "Autodetect IRQ on standard ports (unsafe)" - depends on SERIAL_8250_EXTENDED - help - Say Y here if you want the kernel to try to guess which IRQ - to use for your serial port. - - This is considered unsafe; it is far better to configure the IRQ in - a boot script using the setserial command. - - If unsure, say N. - -config SERIAL_8250_RSA - bool "Support RSA serial ports" - depends on SERIAL_8250_EXTENDED - help - ::: To be written ::: - # # Multi-port serial cards # @@ -199,7 +173,6 @@ config SERIAL_8250_ACCENT To compile this driver as a module, choose M here: the module will be called 8250_accent. - config SERIAL_8250_BOCA tristate "Support Boca cards" depends on SERIAL_8250 != n && ISA && SERIAL_8250_MANY_PORTS @@ -230,6 +203,32 @@ config SERIAL_8250_HUB6 To compile this driver as a module, choose M here: the module will be called 8250_hub6. +config SERIAL_8250_SHARE_IRQ + bool "Support for sharing serial interrupts" + depends on SERIAL_8250_EXTENDED + help + Some serial boards have hardware support which allows multiple dumb + serial ports on the same board to share a single IRQ. To enable + support for this in the serial driver, say Y here. + +config SERIAL_8250_DETECT_IRQ + bool "Autodetect IRQ on standard ports (unsafe)" + depends on SERIAL_8250_EXTENDED + help + Say Y here if you want the kernel to try to guess which IRQ + to use for your serial port. + + This is considered unsafe; it is far better to configure the IRQ in + a boot script using the setserial command. + + If unsure, say N. + +config SERIAL_8250_RSA + bool "Support RSA serial ports" + depends on SERIAL_8250_EXTENDED + help + ::: To be written ::: + config SERIAL_8250_MCA tristate "Support 8250-type ports on MCA buses" depends on SERIAL_8250 != n && MCA diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c index 494d9b85648..6ed3f1da929 100644 --- a/drivers/spi/pxa2xx_spi.c +++ b/drivers/spi/pxa2xx_spi.c @@ -49,6 +49,14 @@ MODULE_LICENSE("GPL"); #define RESET_DMA_CHANNEL (DCSR_NODESC | DMA_INT_MASK) #define IS_DMA_ALIGNED(x) (((u32)(x)&0x07)==0) +/* for testing SSCR1 changes that require SSP restart, basically + * everything except the service and interrupt enables */ +#define SSCR1_CHANGE_MASK (SSCR1_TTELP | SSCR1_TTE | SSCR1_EBCEI | SSCR1_SCFR \ + | SSCR1_ECRA | SSCR1_ECRB | SSCR1_SCLKDIR \ + | SSCR1_RWOT | SSCR1_TRAIL | SSCR1_PINTE \ + | SSCR1_STRF | SSCR1_EFWR |SSCR1_RFT \ + | SSCR1_TFT | SSCR1_SPH | SSCR1_SPO | SSCR1_LBM) + #define DEFINE_SSP_REG(reg, off) \ static inline u32 read_##reg(void *p) { return __raw_readl(p + (off)); } \ static inline void write_##reg(u32 v, void *p) { __raw_writel(v, p + (off)); } @@ -123,8 +131,8 @@ struct driver_data { u8 n_bytes; u32 dma_width; int cs_change; - void (*write)(struct driver_data *drv_data); - void (*read)(struct driver_data *drv_data); + int (*write)(struct driver_data *drv_data); + int (*read)(struct driver_data *drv_data); irqreturn_t (*transfer_handler)(struct driver_data *drv_data); void (*cs_control)(u32 command); }; @@ -132,7 +140,6 @@ struct driver_data { struct chip_data { u32 cr0; u32 cr1; - u32 to; u32 psp; u32 timeout; u8 n_bytes; @@ -143,8 +150,8 @@ struct chip_data { u8 enable_dma; u8 bits_per_word; u32 speed_hz; - void (*write)(struct driver_data *drv_data); - void (*read)(struct driver_data *drv_data); + int (*write)(struct driver_data *drv_data); + int (*read)(struct driver_data *drv_data); void (*cs_control)(u32 command); }; @@ -166,114 +173,118 @@ static int flush(struct driver_data *drv_data) return limit; } -static void restore_state(struct driver_data *drv_data) -{ - void *reg = drv_data->ioaddr; - - /* Clear status and disable clock */ - write_SSSR(drv_data->clear_sr, reg); - write_SSCR0(drv_data->cur_chip->cr0 & ~SSCR0_SSE, reg); - - /* Load the registers */ - write_SSCR1(drv_data->cur_chip->cr1, reg); - write_SSCR0(drv_data->cur_chip->cr0, reg); - if (drv_data->ssp_type != PXA25x_SSP) { - write_SSTO(0, reg); - write_SSPSP(drv_data->cur_chip->psp, reg); - } -} - static void null_cs_control(u32 command) { } -static void null_writer(struct driver_data *drv_data) +static int null_writer(struct driver_data *drv_data) { void *reg = drv_data->ioaddr; u8 n_bytes = drv_data->n_bytes; - while ((read_SSSR(reg) & SSSR_TNF) - && (drv_data->tx < drv_data->tx_end)) { - write_SSDR(0, reg); - drv_data->tx += n_bytes; - } + if (((read_SSSR(reg) & 0x00000f00) == 0x00000f00) + || (drv_data->tx == drv_data->tx_end)) + return 0; + + write_SSDR(0, reg); + drv_data->tx += n_bytes; + + return 1; } -static void null_reader(struct driver_data *drv_data) +static int null_reader(struct driver_data *drv_data) { void *reg = drv_data->ioaddr; u8 n_bytes = drv_data->n_bytes; while ((read_SSSR(reg) & SSSR_RNE) - && (drv_data->rx < drv_data->rx_end)) { + && (drv_data->rx < drv_data->rx_end)) { read_SSDR(reg); drv_data->rx += n_bytes; } + + return drv_data->rx == drv_data->rx_end; } -static void u8_writer(struct driver_data *drv_data) +static int u8_writer(struct driver_data *drv_data) { void *reg = drv_data->ioaddr; - while ((read_SSSR(reg) & SSSR_TNF) - && (drv_data->tx < drv_data->tx_end)) { - write_SSDR(*(u8 *)(drv_data->tx), reg); - ++drv_data->tx; - } + if (((read_SSSR(reg) & 0x00000f00) == 0x00000f00) + || (drv_data->tx == drv_data->tx_end)) + return 0; + + write_SSDR(*(u8 *)(drv_data->tx), reg); + ++drv_data->tx; + + return 1; } -static void u8_reader(struct driver_data *drv_data) +static int u8_reader(struct driver_data *drv_data) { void *reg = drv_data->ioaddr; while ((read_SSSR(reg) & SSSR_RNE) - && (drv_data->rx < drv_data->rx_end)) { + && (drv_data->rx < drv_data->rx_end)) { *(u8 *)(drv_data->rx) = read_SSDR(reg); ++drv_data->rx; } + + return drv_data->rx == drv_data->rx_end; } -static void u16_writer(struct driver_data *drv_data) +static int u16_writer(struct driver_data *drv_data) { void *reg = drv_data->ioaddr; - while ((read_SSSR(reg) & SSSR_TNF) - && (drv_data->tx < drv_data->tx_end)) { - write_SSDR(*(u16 *)(drv_data->tx), reg); - drv_data->tx += 2; - } + if (((read_SSSR(reg) & 0x00000f00) == 0x00000f00) + || (drv_data->tx == drv_data->tx_end)) + return 0; + + write_SSDR(*(u16 *)(drv_data->tx), reg); + drv_data->tx += 2; + + return 1; } -static void u16_reader(struct driver_data *drv_data) +static int u16_reader(struct driver_data *drv_data) { void *reg = drv_data->ioaddr; while ((read_SSSR(reg) & SSSR_RNE) - && (drv_data->rx < drv_data->rx_end)) { + && (drv_data->rx < drv_data->rx_end)) { *(u16 *)(drv_data->rx) = read_SSDR(reg); drv_data->rx += 2; } + + return drv_data->rx == drv_data->rx_end; } -static void u32_writer(struct driver_data *drv_data) + +static int u32_writer(struct driver_data *drv_data) { void *reg = drv_data->ioaddr; - while ((read_SSSR(reg) & SSSR_TNF) - && (drv_data->tx < drv_data->tx_end)) { - write_SSDR(*(u32 *)(drv_data->tx), reg); - drv_data->tx += 4; - } + if (((read_SSSR(reg) & 0x00000f00) == 0x00000f00) + || (drv_data->tx == drv_data->tx_end)) + return 0; + + write_SSDR(*(u32 *)(drv_data->tx), reg); + drv_data->tx += 4; + + return 1; } -static void u32_reader(struct driver_data *drv_data) +static int u32_reader(struct driver_data *drv_data) { void *reg = drv_data->ioaddr; while ((read_SSSR(reg) & SSSR_RNE) - && (drv_data->rx < drv_data->rx_end)) { + && (drv_data->rx < drv_data->rx_end)) { *(u32 *)(drv_data->rx) = read_SSDR(reg); drv_data->rx += 4; } + + return drv_data->rx == drv_data->rx_end; } static void *next_transfer(struct driver_data *drv_data) @@ -409,166 +420,134 @@ static int wait_dma_channel_stop(int channel) return limit; } -static void dma_handler(int channel, void *data) +void dma_error_stop(struct driver_data *drv_data, const char *msg) { - struct driver_data *drv_data = data; - struct spi_message *msg = drv_data->cur_msg; void *reg = drv_data->ioaddr; - u32 irq_status = DCSR(channel) & DMA_INT_MASK; - u32 trailing_sssr = 0; - if (irq_status & DCSR_BUSERR) { + /* Stop and reset */ + DCSR(drv_data->rx_channel) = RESET_DMA_CHANNEL; + DCSR(drv_data->tx_channel) = RESET_DMA_CHANNEL; + write_SSSR(drv_data->clear_sr, reg); + write_SSCR1(read_SSCR1(reg) & ~drv_data->dma_cr1, reg); + if (drv_data->ssp_type != PXA25x_SSP) + write_SSTO(0, reg); + flush(drv_data); + write_SSCR0(read_SSCR0(reg) & ~SSCR0_SSE, reg); - /* Disable interrupts, clear status and reset DMA */ - write_SSCR0(read_SSCR0(reg) & ~SSCR0_SSE, reg); - write_SSCR1(read_SSCR1(reg) & ~drv_data->dma_cr1, reg); - if (drv_data->ssp_type != PXA25x_SSP) - write_SSTO(0, reg); - write_SSSR(drv_data->clear_sr, reg); - DCSR(drv_data->rx_channel) = RESET_DMA_CHANNEL; - DCSR(drv_data->tx_channel) = RESET_DMA_CHANNEL; + unmap_dma_buffers(drv_data); - if (flush(drv_data) == 0) - dev_err(&drv_data->pdev->dev, - "dma_handler: flush fail\n"); + dev_err(&drv_data->pdev->dev, "%s\n", msg); - unmap_dma_buffers(drv_data); + drv_data->cur_msg->state = ERROR_STATE; + tasklet_schedule(&drv_data->pump_transfers); +} + +static void dma_transfer_complete(struct driver_data *drv_data) +{ + void *reg = drv_data->ioaddr; + struct spi_message *msg = drv_data->cur_msg; + + /* Clear and disable interrupts on SSP and DMA channels*/ + write_SSCR1(read_SSCR1(reg) & ~drv_data->dma_cr1, reg); + write_SSSR(drv_data->clear_sr, reg); + DCSR(drv_data->tx_channel) = RESET_DMA_CHANNEL; + DCSR(drv_data->rx_channel) = RESET_DMA_CHANNEL; + + if (wait_dma_channel_stop(drv_data->rx_channel) == 0) + dev_err(&drv_data->pdev->dev, + "dma_handler: dma rx channel stop failed\n"); + + if (wait_ssp_rx_stall(drv_data->ioaddr) == 0) + dev_err(&drv_data->pdev->dev, + "dma_transfer: ssp rx stall failed\n"); + + unmap_dma_buffers(drv_data); + + /* update the buffer pointer for the amount completed in dma */ + drv_data->rx += drv_data->len - + (DCMD(drv_data->rx_channel) & DCMD_LENGTH); + + /* read trailing data from fifo, it does not matter how many + * bytes are in the fifo just read until buffer is full + * or fifo is empty, which ever occurs first */ + drv_data->read(drv_data); + + /* return count of what was actually read */ + msg->actual_length += drv_data->len - + (drv_data->rx_end - drv_data->rx); + + /* Release chip select if requested, transfer delays are + * handled in pump_transfers */ + if (drv_data->cs_change) + drv_data->cs_control(PXA2XX_CS_DEASSERT); + + /* Move to next transfer */ + msg->state = next_transfer(drv_data); + + /* Schedule transfer tasklet */ + tasklet_schedule(&drv_data->pump_transfers); +} + +static void dma_handler(int channel, void *data) +{ + struct driver_data *drv_data = data; + u32 irq_status = DCSR(channel) & DMA_INT_MASK; + + if (irq_status & DCSR_BUSERR) { if (channel == drv_data->tx_channel) - dev_err(&drv_data->pdev->dev, - "dma_handler: bad bus address on " - "tx channel %d, source %x target = %x\n", - channel, DSADR(channel), DTADR(channel)); + dma_error_stop(drv_data, + "dma_handler: " + "bad bus address on tx channel"); else - dev_err(&drv_data->pdev->dev, - "dma_handler: bad bus address on " - "rx channel %d, source %x target = %x\n", - channel, DSADR(channel), DTADR(channel)); - - msg->state = ERROR_STATE; - tasklet_schedule(&drv_data->pump_transfers); + dma_error_stop(drv_data, + "dma_handler: " + "bad bus address on rx channel"); + return; } /* PXA255x_SSP has no timeout interrupt, wait for tailing bytes */ - if ((drv_data->ssp_type == PXA25x_SSP) - && (channel == drv_data->tx_channel) - && (irq_status & DCSR_ENDINTR)) { + if ((channel == drv_data->tx_channel) + && (irq_status & DCSR_ENDINTR) + && (drv_data->ssp_type == PXA25x_SSP)) { /* Wait for rx to stall */ if (wait_ssp_rx_stall(drv_data->ioaddr) == 0) dev_err(&drv_data->pdev->dev, "dma_handler: ssp rx stall failed\n"); - /* Clear and disable interrupts on SSP and DMA channels*/ - write_SSCR1(read_SSCR1(reg) & ~drv_data->dma_cr1, reg); - write_SSSR(drv_data->clear_sr, reg); - DCSR(drv_data->tx_channel) = RESET_DMA_CHANNEL; - DCSR(drv_data->rx_channel) = RESET_DMA_CHANNEL; - if (wait_dma_channel_stop(drv_data->rx_channel) == 0) - dev_err(&drv_data->pdev->dev, - "dma_handler: dma rx channel stop failed\n"); - - unmap_dma_buffers(drv_data); - - /* Read trailing bytes */ - /* Calculate number of trailing bytes, read them */ - trailing_sssr = read_SSSR(reg); - if ((trailing_sssr & 0xf008) != 0xf000) { - drv_data->rx = drv_data->rx_end - - (((trailing_sssr >> 12) & 0x0f) + 1); - drv_data->read(drv_data); - } - msg->actual_length += drv_data->len; - - /* Release chip select if requested, transfer delays are - * handled in pump_transfers */ - if (drv_data->cs_change) - drv_data->cs_control(PXA2XX_CS_DEASSERT); - - /* Move to next transfer */ - msg->state = next_transfer(drv_data); - - /* Schedule transfer tasklet */ - tasklet_schedule(&drv_data->pump_transfers); + /* finish this transfer, start the next */ + dma_transfer_complete(drv_data); } } static irqreturn_t dma_transfer(struct driver_data *drv_data) { u32 irq_status; - u32 trailing_sssr = 0; - struct spi_message *msg = drv_data->cur_msg; void *reg = drv_data->ioaddr; irq_status = read_SSSR(reg) & drv_data->mask_sr; if (irq_status & SSSR_ROR) { - /* Clear and disable interrupts on SSP and DMA channels*/ - write_SSCR0(read_SSCR0(reg) & ~SSCR0_SSE, reg); - write_SSCR1(read_SSCR1(reg) & ~drv_data->dma_cr1, reg); - if (drv_data->ssp_type != PXA25x_SSP) - write_SSTO(0, reg); - write_SSSR(drv_data->clear_sr, reg); - DCSR(drv_data->tx_channel) = RESET_DMA_CHANNEL; - DCSR(drv_data->rx_channel) = RESET_DMA_CHANNEL; - unmap_dma_buffers(drv_data); - - if (flush(drv_data) == 0) - dev_err(&drv_data->pdev->dev, - "dma_transfer: flush fail\n"); - - dev_warn(&drv_data->pdev->dev, "dma_transfer: fifo overun\n"); - - drv_data->cur_msg->state = ERROR_STATE; - tasklet_schedule(&drv_data->pump_transfers); - + dma_error_stop(drv_data, "dma_transfer: fifo overrun"); return IRQ_HANDLED; } /* Check for false positive timeout */ - if ((irq_status & SSSR_TINT) && DCSR(drv_data->tx_channel) & DCSR_RUN) { + if ((irq_status & SSSR_TINT) + && (DCSR(drv_data->tx_channel) & DCSR_RUN)) { write_SSSR(SSSR_TINT, reg); return IRQ_HANDLED; } if (irq_status & SSSR_TINT || drv_data->rx == drv_data->rx_end) { - /* Clear and disable interrupts on SSP and DMA channels*/ - write_SSCR1(read_SSCR1(reg) & ~drv_data->dma_cr1, reg); + /* Clear and disable timeout interrupt, do the rest in + * dma_transfer_complete */ if (drv_data->ssp_type != PXA25x_SSP) write_SSTO(0, reg); - write_SSSR(drv_data->clear_sr, reg); - DCSR(drv_data->tx_channel) = RESET_DMA_CHANNEL; - DCSR(drv_data->rx_channel) = RESET_DMA_CHANNEL; - if (wait_dma_channel_stop(drv_data->rx_channel) == 0) - dev_err(&drv_data->pdev->dev, - "dma_transfer: dma rx channel stop failed\n"); - - if (wait_ssp_rx_stall(drv_data->ioaddr) == 0) - dev_err(&drv_data->pdev->dev, - "dma_transfer: ssp rx stall failed\n"); - - unmap_dma_buffers(drv_data); - - /* Calculate number of trailing bytes, read them */ - trailing_sssr = read_SSSR(reg); - if ((trailing_sssr & 0xf008) != 0xf000) { - drv_data->rx = drv_data->rx_end - - (((trailing_sssr >> 12) & 0x0f) + 1); - drv_data->read(drv_data); - } - msg->actual_length += drv_data->len; - - /* Release chip select if requested, transfer delays are - * handled in pump_transfers */ - if (drv_data->cs_change) - drv_data->cs_control(PXA2XX_CS_DEASSERT); - - /* Move to next transfer */ - msg->state = next_transfer(drv_data); - - /* Schedule transfer tasklet */ - tasklet_schedule(&drv_data->pump_transfers); + /* finish this transfer, start the next */ + dma_transfer_complete(drv_data); return IRQ_HANDLED; } @@ -577,89 +556,103 @@ static irqreturn_t dma_transfer(struct driver_data *drv_data) return IRQ_NONE; } -static irqreturn_t interrupt_transfer(struct driver_data *drv_data) +static void int_error_stop(struct driver_data *drv_data, const char* msg) { - struct spi_message *msg = drv_data->cur_msg; void *reg = drv_data->ioaddr; - unsigned long limit = loops_per_jiffy << 1; - u32 irq_status; - u32 irq_mask = (read_SSCR1(reg) & SSCR1_TIE) ? - drv_data->mask_sr : drv_data->mask_sr & ~SSSR_TFS; - - while ((irq_status = read_SSSR(reg) & irq_mask)) { - - if (irq_status & SSSR_ROR) { - /* Clear and disable interrupts */ - write_SSCR0(read_SSCR0(reg) & ~SSCR0_SSE, reg); - write_SSCR1(read_SSCR1(reg) & ~drv_data->int_cr1, reg); - if (drv_data->ssp_type != PXA25x_SSP) - write_SSTO(0, reg); - write_SSSR(drv_data->clear_sr, reg); + /* Stop and reset SSP */ + write_SSSR(drv_data->clear_sr, reg); + write_SSCR1(read_SSCR1(reg) & ~drv_data->int_cr1, reg); + if (drv_data->ssp_type != PXA25x_SSP) + write_SSTO(0, reg); + flush(drv_data); + write_SSCR0(read_SSCR0(reg) & ~SSCR0_SSE, reg); - if (flush(drv_data) == 0) - dev_err(&drv_data->pdev->dev, - "interrupt_transfer: flush fail\n"); + dev_err(&drv_data->pdev->dev, "%s\n", msg); - /* Stop the SSP */ + drv_data->cur_msg->state = ERROR_STATE; + tasklet_schedule(&drv_data->pump_transfers); +} - dev_warn(&drv_data->pdev->dev, - "interrupt_transfer: fifo overun\n"); +static void int_transfer_complete(struct driver_data *drv_data) +{ + void *reg = drv_data->ioaddr; - msg->state = ERROR_STATE; - tasklet_schedule(&drv_data->pump_transfers); + /* Stop SSP */ + write_SSSR(drv_data->clear_sr, reg); + write_SSCR1(read_SSCR1(reg) & ~drv_data->int_cr1, reg); + if (drv_data->ssp_type != PXA25x_SSP) + write_SSTO(0, reg); - return IRQ_HANDLED; - } + /* Update total byte transfered return count actual bytes read */ + drv_data->cur_msg->actual_length += drv_data->len - + (drv_data->rx_end - drv_data->rx); - /* Look for false positive timeout */ - if ((irq_status & SSSR_TINT) - && (drv_data->rx < drv_data->rx_end)) - write_SSSR(SSSR_TINT, reg); + /* Release chip select if requested, transfer delays are + * handled in pump_transfers */ + if (drv_data->cs_change) + drv_data->cs_control(PXA2XX_CS_DEASSERT); - /* Pump data */ - drv_data->read(drv_data); - drv_data->write(drv_data); + /* Move to next transfer */ + drv_data->cur_msg->state = next_transfer(drv_data); - if (drv_data->tx == drv_data->tx_end) { - /* Disable tx interrupt */ - write_SSCR1(read_SSCR1(reg) & ~SSCR1_TIE, reg); - irq_mask = drv_data->mask_sr & ~SSSR_TFS; + /* Schedule transfer tasklet */ + tasklet_schedule(&drv_data->pump_transfers); +} - /* PXA25x_SSP has no timeout, read trailing bytes */ - if (drv_data->ssp_type == PXA25x_SSP) { - while ((read_SSSR(reg) & SSSR_BSY) && limit--) - drv_data->read(drv_data); +static irqreturn_t interrupt_transfer(struct driver_data *drv_data) +{ + void *reg = drv_data->ioaddr; - if (limit == 0) - dev_err(&drv_data->pdev->dev, - "interrupt_transfer: " - "trailing byte read failed\n"); - } - } + u32 irq_mask = (read_SSCR1(reg) & SSCR1_TIE) ? + drv_data->mask_sr : drv_data->mask_sr & ~SSSR_TFS; - if ((irq_status & SSSR_TINT) - || (drv_data->rx == drv_data->rx_end)) { + u32 irq_status = read_SSSR(reg) & irq_mask; - /* Clear timeout */ - write_SSCR1(read_SSCR1(reg) & ~drv_data->int_cr1, reg); - if (drv_data->ssp_type != PXA25x_SSP) - write_SSTO(0, reg); - write_SSSR(drv_data->clear_sr, reg); + if (irq_status & SSSR_ROR) { + int_error_stop(drv_data, "interrupt_transfer: fifo overrun"); + return IRQ_HANDLED; + } - /* Update total byte transfered */ - msg->actual_length += drv_data->len; + if (irq_status & SSSR_TINT) { + write_SSSR(SSSR_TINT, reg); + if (drv_data->read(drv_data)) { + int_transfer_complete(drv_data); + return IRQ_HANDLED; + } + } - /* Release chip select if requested, transfer delays are - * handled in pump_transfers */ - if (drv_data->cs_change) - drv_data->cs_control(PXA2XX_CS_DEASSERT); + /* Drain rx fifo, Fill tx fifo and prevent overruns */ + do { + if (drv_data->read(drv_data)) { + int_transfer_complete(drv_data); + return IRQ_HANDLED; + } + } while (drv_data->write(drv_data)); - /* Move to next transfer */ - msg->state = next_transfer(drv_data); + if (drv_data->read(drv_data)) { + int_transfer_complete(drv_data); + return IRQ_HANDLED; + } - /* Schedule transfer tasklet */ - tasklet_schedule(&drv_data->pump_transfers); + if (drv_data->tx == drv_data->tx_end) { + write_SSCR1(read_SSCR1(reg) & ~SSCR1_TIE, reg); + /* PXA25x_SSP has no timeout, read trailing bytes */ + if (drv_data->ssp_type == PXA25x_SSP) { + if (!wait_ssp_rx_stall(reg)) + { + int_error_stop(drv_data, "interrupt_transfer: " + "rx stall failed"); + return IRQ_HANDLED; + } + if (!drv_data->read(drv_data)) + { + int_error_stop(drv_data, + "interrupt_transfer: " + "trailing byte read failed"); + return IRQ_HANDLED; + } + int_transfer_complete(drv_data); } } @@ -681,7 +674,7 @@ static irqreturn_t ssp_int(int irq, void *dev_id) write_SSSR(drv_data->clear_sr, reg); dev_err(&drv_data->pdev->dev, "bad message state " - "in interrupt handler"); + "in interrupt handler\n"); /* Never fail */ return IRQ_HANDLED; @@ -690,6 +683,102 @@ static irqreturn_t ssp_int(int irq, void *dev_id) return drv_data->transfer_handler(drv_data); } +int set_dma_burst_and_threshold(struct chip_data *chip, struct spi_device *spi, + u8 bits_per_word, u32 *burst_code, + u32 *threshold) +{ + struct pxa2xx_spi_chip *chip_info = + (struct pxa2xx_spi_chip *)spi->controller_data; + int bytes_per_word; + int burst_bytes; + int thresh_words; + int req_burst_size; + int retval = 0; + + /* Set the threshold (in registers) to equal the same amount of data + * as represented by burst size (in bytes). The computation below + * is (burst_size rounded up to nearest 8 byte, word or long word) + * divided by (bytes/register); the tx threshold is the inverse of + * the rx, so that there will always be enough data in the rx fifo + * to satisfy a burst, and there will always be enough space in the + * tx fifo to accept a burst (a tx burst will overwrite the fifo if + * there is not enough space), there must always remain enough empty + * space in the rx fifo for any data loaded to the tx fifo. + * Whenever burst_size (in bytes) equals bits/word, the fifo threshold + * will be 8, or half the fifo; + * The threshold can only be set to 2, 4 or 8, but not 16, because + * to burst 16 to the tx fifo, the fifo would have to be empty; + * however, the minimum fifo trigger level is 1, and the tx will + * request service when the fifo is at this level, with only 15 spaces. + */ + + /* find bytes/word */ + if (bits_per_word <= 8) + bytes_per_word = 1; + else if (bits_per_word <= 16) + bytes_per_word = 2; + else + bytes_per_word = 4; + + /* use struct pxa2xx_spi_chip->dma_burst_size if available */ + if (chip_info) + req_burst_size = chip_info->dma_burst_size; + else { + switch (chip->dma_burst_size) { + default: + /* if the default burst size is not set, + * do it now */ + chip->dma_burst_size = DCMD_BURST8; + case DCMD_BURST8: + req_burst_size = 8; + break; + case DCMD_BURST16: + req_burst_size = 16; + break; + case DCMD_BURST32: + req_burst_size = 32; + break; + } + } + if (req_burst_size <= 8) { + *burst_code = DCMD_BURST8; + burst_bytes = 8; + } else if (req_burst_size <= 16) { + if (bytes_per_word == 1) { + /* don't burst more than 1/2 the fifo */ + *burst_code = DCMD_BURST8; + burst_bytes = 8; + retval = 1; + } else { + *burst_code = DCMD_BURST16; + burst_bytes = 16; + } + } else { + if (bytes_per_word == 1) { + /* don't burst more than 1/2 the fifo */ + *burst_code = DCMD_BURST8; + burst_bytes = 8; + retval = 1; + } else if (bytes_per_word == 2) { + /* don't burst more than 1/2 the fifo */ + *burst_code = DCMD_BURST16; + burst_bytes = 16; + retval = 1; + } else { + *burst_code = DCMD_BURST32; + burst_bytes = 32; + } + } + + thresh_words = burst_bytes / bytes_per_word; + + /* thresh_words will be between 2 and 8 */ + *threshold = (SSCR1_RxTresh(thresh_words) & SSCR1_RFT) + | (SSCR1_TxTresh(16-thresh_words) & SSCR1_TFT); + + return retval; +} + static void pump_transfers(unsigned long data) { struct driver_data *drv_data = (struct driver_data *)data; @@ -702,6 +791,9 @@ static void pump_transfers(unsigned long data) u8 bits = 0; u32 speed = 0; u32 cr0; + u32 cr1; + u32 dma_thresh = drv_data->cur_chip->dma_threshold; + u32 dma_burst = drv_data->cur_chip->dma_burst_size; /* Get current state information */ message = drv_data->cur_msg; @@ -731,6 +823,16 @@ static void pump_transfers(unsigned long data) udelay(previous->delay_usecs); } + /* Check transfer length */ + if (transfer->len > 8191) + { + dev_warn(&drv_data->pdev->dev, "pump_transfers: transfer " + "length greater than 8191\n"); + message->status = -EINVAL; + giveback(drv_data); + return; + } + /* Setup the transfer state based on the type of transfer */ if (flush(drv_data) == 0) { dev_err(&drv_data->pdev->dev, "pump_transfers: flush failed\n"); @@ -747,17 +849,15 @@ static void pump_transfers(unsigned long data) drv_data->rx_end = drv_data->rx + transfer->len; drv_data->rx_dma = transfer->rx_dma; drv_data->tx_dma = transfer->tx_dma; - drv_data->len = transfer->len; + drv_data->len = transfer->len & DCMD_LENGTH; drv_data->write = drv_data->tx ? chip->write : null_writer; drv_data->read = drv_data->rx ? chip->read : null_reader; drv_data->cs_change = transfer->cs_change; /* Change speed and bit per word on a per transfer */ + cr0 = chip->cr0; if (transfer->speed_hz || transfer->bits_per_word) { - /* Disable clock */ - write_SSCR0(chip->cr0 & ~SSCR0_SSE, reg); - cr0 = chip->cr0; bits = chip->bits_per_word; speed = chip->speed_hz; @@ -796,15 +896,24 @@ static void pump_transfers(unsigned long data) drv_data->write = drv_data->write != null_writer ? u32_writer : null_writer; } + /* if bits/word is changed in dma mode, then must check the + * thresholds and burst also */ + if (chip->enable_dma) { + if (set_dma_burst_and_threshold(chip, message->spi, + bits, &dma_burst, + &dma_thresh)) + if (printk_ratelimit()) + dev_warn(&message->spi->dev, + "pump_transfer: " + "DMA burst size reduced to " + "match bits_per_word\n"); + } cr0 = clk_div | SSCR0_Motorola | SSCR0_DataSize(bits > 16 ? bits - 16 : bits) | SSCR0_SSE | (bits > 16 ? SSCR0_EDSS : 0); - - /* Start it back up */ - write_SSCR0(cr0, reg); } message->state = RUNNING_STATE; @@ -823,13 +932,13 @@ static void pump_transfers(unsigned long data) /* No target address increment */ DCMD(drv_data->rx_channel) = DCMD_FLOWSRC | drv_data->dma_width - | chip->dma_burst_size + | dma_burst | drv_data->len; else DCMD(drv_data->rx_channel) = DCMD_INCTRGADDR | DCMD_FLOWSRC | drv_data->dma_width - | chip->dma_burst_size + | dma_burst | drv_data->len; /* Setup tx DMA Channel */ @@ -840,13 +949,13 @@ static void pump_transfers(unsigned long data) /* No source address increment */ DCMD(drv_data->tx_channel) = DCMD_FLOWTRG | drv_data->dma_width - | chip->dma_burst_size + | dma_burst | drv_data->len; else DCMD(drv_data->tx_channel) = DCMD_INCSRCADDR | DCMD_FLOWTRG | drv_data->dma_width - | chip->dma_burst_size + | dma_burst | drv_data->len; /* Enable dma end irqs on SSP to detect end of transfer */ @@ -856,16 +965,11 @@ static void pump_transfers(unsigned long data) /* Fix me, need to handle cs polarity */ drv_data->cs_control(PXA2XX_CS_ASSERT); - /* Go baby, go */ + /* Clear status and start DMA engine */ + cr1 = chip->cr1 | dma_thresh | drv_data->dma_cr1; write_SSSR(drv_data->clear_sr, reg); DCSR(drv_data->rx_channel) |= DCSR_RUN; DCSR(drv_data->tx_channel) |= DCSR_RUN; - if (drv_data->ssp_type != PXA25x_SSP) - write_SSTO(chip->timeout, reg); - write_SSCR1(chip->cr1 - | chip->dma_threshold - | drv_data->dma_cr1, - reg); } else { /* Ensure we have the correct interrupt handler */ drv_data->transfer_handler = interrupt_transfer; @@ -873,14 +977,25 @@ static void pump_transfers(unsigned long data) /* Fix me, need to handle cs polarity */ drv_data->cs_control(PXA2XX_CS_ASSERT); - /* Go baby, go */ + /* Clear status */ + cr1 = chip->cr1 | chip->threshold | drv_data->int_cr1; write_SSSR(drv_data->clear_sr, reg); + } + + /* see if we need to reload the config registers */ + if ((read_SSCR0(reg) != cr0) + || (read_SSCR1(reg) & SSCR1_CHANGE_MASK) != + (cr1 & SSCR1_CHANGE_MASK)) { + + write_SSCR0(cr0 & ~SSCR0_SSE, reg); if (drv_data->ssp_type != PXA25x_SSP) write_SSTO(chip->timeout, reg); - write_SSCR1(chip->cr1 - | chip->threshold - | drv_data->int_cr1, - reg); + write_SSCR1(cr1, reg); + write_SSCR0(cr0, reg); + } else { + if (drv_data->ssp_type != PXA25x_SSP) + write_SSTO(chip->timeout, reg); + write_SSCR1(cr1, reg); } } @@ -915,9 +1030,9 @@ static void pump_messages(struct work_struct *work) struct spi_transfer, transfer_list); - /* Setup the SSP using the per chip configuration */ + /* prepare to setup the SSP, in pump_transfers, using the per + * chip configuration */ drv_data->cur_chip = spi_get_ctldata(drv_data->cur_msg->spi); - restore_state(drv_data); /* Mark as busy and launch transfers */ tasklet_schedule(&drv_data->pump_transfers); @@ -963,63 +1078,77 @@ static int setup(struct spi_device *spi) spi->bits_per_word = 8; if (drv_data->ssp_type != PXA25x_SSP - && (spi->bits_per_word < 4 || spi->bits_per_word > 32)) + && (spi->bits_per_word < 4 || spi->bits_per_word > 32)) { + dev_err(&spi->dev, "failed setup: ssp_type=%d, bits/wrd=%d " + "b/w not 4-32 for type non-PXA25x_SSP\n", + drv_data->ssp_type, spi->bits_per_word); return -EINVAL; - else if (spi->bits_per_word < 4 || spi->bits_per_word > 16) + } + else if (drv_data->ssp_type == PXA25x_SSP + && (spi->bits_per_word < 4 + || spi->bits_per_word > 16)) { + dev_err(&spi->dev, "failed setup: ssp_type=%d, bits/wrd=%d " + "b/w not 4-16 for type PXA25x_SSP\n", + drv_data->ssp_type, spi->bits_per_word); return -EINVAL; + } - /* Only alloc (or use chip_info) on first setup */ + /* Only alloc on first setup */ chip = spi_get_ctldata(spi); - if (chip == NULL) { + if (!chip) { chip = kzalloc(sizeof(struct chip_data), GFP_KERNEL); - if (!chip) + if (!chip) { + dev_err(&spi->dev, + "failed setup: can't allocate chip data\n"); return -ENOMEM; + } chip->cs_control = null_cs_control; chip->enable_dma = 0; - chip->timeout = SSP_TIMEOUT(1000); + chip->timeout = 1000; chip->threshold = SSCR1_RxTresh(1) | SSCR1_TxTresh(1); chip->dma_burst_size = drv_data->master_info->enable_dma ? DCMD_BURST8 : 0; - - chip_info = spi->controller_data; } + /* protocol drivers may change the chip settings, so... + * if chip_info exists, use it */ + chip_info = spi->controller_data; + /* chip_info isn't always needed */ + chip->cr1 = 0; if (chip_info) { if (chip_info->cs_control) chip->cs_control = chip_info->cs_control; - chip->timeout = SSP_TIMEOUT(chip_info->timeout_microsecs); + chip->timeout = chip_info->timeout; - chip->threshold = SSCR1_RxTresh(chip_info->rx_threshold) - | SSCR1_TxTresh(chip_info->tx_threshold); + chip->threshold = (SSCR1_RxTresh(chip_info->rx_threshold) & + SSCR1_RFT) | + (SSCR1_TxTresh(chip_info->tx_threshold) & + SSCR1_TFT); chip->enable_dma = chip_info->dma_burst_size != 0 && drv_data->master_info->enable_dma; chip->dma_threshold = 0; - if (chip->enable_dma) { - if (chip_info->dma_burst_size <= 8) { - chip->dma_threshold = SSCR1_RxTresh(8) - | SSCR1_TxTresh(8); - chip->dma_burst_size = DCMD_BURST8; - } else if (chip_info->dma_burst_size <= 16) { - chip->dma_threshold = SSCR1_RxTresh(16) - | SSCR1_TxTresh(16); - chip->dma_burst_size = DCMD_BURST16; - } else { - chip->dma_threshold = SSCR1_RxTresh(32) - | SSCR1_TxTresh(32); - chip->dma_burst_size = DCMD_BURST32; - } - } - - if (chip_info->enable_loopback) chip->cr1 = SSCR1_LBM; } + /* set dma burst and threshold outside of chip_info path so that if + * chip_info goes away after setting chip->enable_dma, the + * burst and threshold can still respond to changes in bits_per_word */ + if (chip->enable_dma) { + /* set up legal burst and threshold for dma */ + if (set_dma_burst_and_threshold(chip, spi, spi->bits_per_word, + &chip->dma_burst_size, + &chip->dma_threshold)) { + dev_warn(&spi->dev, "in setup: DMA burst size reduced " + "to match bits_per_word\n"); + } + } + if (drv_data->ioaddr == SSP1_VIRT) clk_div = SSP1_SerClkDiv(spi->max_speed_hz); else if (drv_data->ioaddr == SSP2_VIRT) @@ -1027,7 +1156,11 @@ static int setup(struct spi_device *spi) else if (drv_data->ioaddr == SSP3_VIRT) clk_div = SSP3_SerClkDiv(spi->max_speed_hz); else + { + dev_err(&spi->dev, "failed setup: unknown IO address=0x%p\n", + drv_data->ioaddr); return -ENODEV; + } chip->speed_hz = spi->max_speed_hz; chip->cr0 = clk_div @@ -1071,7 +1204,6 @@ static int setup(struct spi_device *spi) chip->write = u32_writer; } else { dev_err(&spi->dev, "invalid wordsize\n"); - kfree(chip); return -ENODEV; } chip->bits_per_word = spi->bits_per_word; @@ -1162,6 +1294,12 @@ static int destroy_queue(struct driver_data *drv_data) int status; status = stop_queue(drv_data); + /* we are unloading the module or failing to load (only two calls + * to this routine), and neither call can handle a return value. + * However, destroy_workqueue calls flush_workqueue, and that will + * block until all work is done. If the reason that stop_queue + * timed out is that the work will never finish, then it does no + * good to call destroy_workqueue, so return anyway. */ if (status != 0) return status; @@ -1360,7 +1498,16 @@ static int pxa2xx_spi_remove(struct platform_device *pdev) /* Remove the queue */ status = destroy_queue(drv_data); if (status != 0) - return status; + /* the kernel does not check the return status of this + * this routine (mod->exit, within the kernel). Therefore + * nothing is gained by returning from here, the module is + * going away regardless, and we should not leave any more + * resources allocated than necessary. We cannot free the + * message memory in drv_data->queue, but we can release the + * resources below. I think the kernel should honor -EBUSY + * returns but... */ + dev_err(&pdev->dev, "pxa2xx_spi_remove: workqueue will not " + "complete, message memory not freed\n"); /* Disable the SSP at the peripheral and SOC level */ write_SSCR0(0, drv_data->ioaddr); diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index ab1daecfeac..4e83f01e894 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -1615,6 +1615,16 @@ config FB_PNX4008_DUM_RGB ---help--- Say Y here to enable support for PNX4008 RGB Framebuffer +config FB_IBM_GXT4500 + tristate "Framebuffer support for IBM GXT4500P adaptor" + depends on PPC + select FB_CFB_FILLRECT + select FB_CFB_COPYAREA + select FB_CFB_IMAGEBLIT + ---help--- + Say Y here to enable support for the IBM GXT4500P display + adaptor, found on some IBM System P (pSeries) machines. + config FB_VIRTUAL tristate "Virtual Frame Buffer support (ONLY FOR TESTING!)" depends on FB diff --git a/drivers/video/Makefile b/drivers/video/Makefile index a6980e9a248..309a26dd164 100644 --- a/drivers/video/Makefile +++ b/drivers/video/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_FB_IMX) += imxfb.o obj-$(CONFIG_FB_S3C2410) += s3c2410fb.o obj-$(CONFIG_FB_PNX4008_DUM) += pnx4008/ obj-$(CONFIG_FB_PNX4008_DUM_RGB) += pnx4008/ +obj-$(CONFIG_FB_IBM_GXT4500) += gxt4500.o # Platform or fallback drivers go here obj-$(CONFIG_FB_VESA) += vesafb.o diff --git a/drivers/video/gxt4500.c b/drivers/video/gxt4500.c new file mode 100644 index 00000000000..3adf6ab0768 --- /dev/null +++ b/drivers/video/gxt4500.c @@ -0,0 +1,741 @@ +/* + * Frame buffer device for IBM GXT4500P display adaptor + * + * Copyright (C) 2006 Paul Mackerras, IBM Corp. <paulus@samba.org> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/fb.h> +#include <linux/console.h> +#include <linux/pci.h> +#include <linux/pci_ids.h> +#include <linux/delay.h> + +#define PCI_DEVICE_ID_IBM_GXT4500P 0x21c + +/* GXT4500P registers */ + +/* Registers in PCI config space */ +#define CFG_ENDIAN0 0x40 + +/* Misc control/status registers */ +#define STATUS 0x1000 +#define CTRL_REG0 0x1004 +#define CR0_HALT_DMA 0x4 +#define CR0_RASTER_RESET 0x8 +#define CR0_GEOM_RESET 0x10 +#define CR0_MEM_CTRLER_RESET 0x20 + +/* Framebuffer control registers */ +#define FB_AB_CTRL 0x1100 +#define FB_CD_CTRL 0x1104 +#define FB_WID_CTRL 0x1108 +#define FB_Z_CTRL 0x110c +#define FB_VGA_CTRL 0x1110 +#define REFRESH_AB_CTRL 0x1114 +#define REFRESH_CD_CTRL 0x1118 +#define FB_OVL_CTRL 0x111c +#define FB_CTRL_TYPE 0x80000000 +#define FB_CTRL_WIDTH_MASK 0x007f0000 +#define FB_CTRL_WIDTH_SHIFT 16 +#define FB_CTRL_START_SEG_MASK 0x00003fff + +#define REFRESH_START 0x1098 +#define REFRESH_SIZE 0x109c + +/* "Direct" framebuffer access registers */ +#define DFA_FB_A 0x11e0 +#define DFA_FB_B 0x11e4 +#define DFA_FB_C 0x11e8 +#define DFA_FB_D 0x11ec +#define DFA_FB_ENABLE 0x80000000 +#define DFA_FB_BASE_MASK 0x03f00000 +#define DFA_FB_STRIDE_1k 0x00000000 +#define DFA_FB_STRIDE_2k 0x00000010 +#define DFA_FB_STRIDE_4k 0x00000020 +#define DFA_PIX_8BIT 0x00000000 +#define DFA_PIX_16BIT_565 0x00000001 +#define DFA_PIX_16BIT_1555 0x00000002 +#define DFA_PIX_24BIT 0x00000004 +#define DFA_PIX_32BIT 0x00000005 + +/* maps DFA_PIX_* to pixel size in bytes */ +static const unsigned char pixsize[] = { + 1, 2, 2, 2, 4, 4 +}; + +/* Display timing generator registers */ +#define DTG_CONTROL 0x1900 +#define DTG_CTL_SCREEN_REFRESH 2 +#define DTG_CTL_ENABLE 1 +#define DTG_HORIZ_EXTENT 0x1904 +#define DTG_HORIZ_DISPLAY 0x1908 +#define DTG_HSYNC_START 0x190c +#define DTG_HSYNC_END 0x1910 +#define DTG_HSYNC_END_COMP 0x1914 +#define DTG_VERT_EXTENT 0x1918 +#define DTG_VERT_DISPLAY 0x191c +#define DTG_VSYNC_START 0x1920 +#define DTG_VSYNC_END 0x1924 +#define DTG_VERT_SHORT 0x1928 + +/* PLL/RAMDAC registers */ +#define DISP_CTL 0x402c +#define DISP_CTL_OFF 2 +#define SYNC_CTL 0x4034 +#define SYNC_CTL_SYNC_ON_RGB 1 +#define SYNC_CTL_SYNC_OFF 2 +#define SYNC_CTL_HSYNC_INV 8 +#define SYNC_CTL_VSYNC_INV 0x10 +#define SYNC_CTL_HSYNC_OFF 0x20 +#define SYNC_CTL_VSYNC_OFF 0x40 + +#define PLL_M 0x4040 +#define PLL_N 0x4044 +#define PLL_POSTDIV 0x4048 + +/* Hardware cursor */ +#define CURSOR_X 0x4078 +#define CURSOR_Y 0x407c +#define CURSOR_HOTSPOT 0x4080 +#define CURSOR_MODE 0x4084 +#define CURSOR_MODE_OFF 0 +#define CURSOR_MODE_4BPP 1 +#define CURSOR_PIXMAP 0x5000 +#define CURSOR_CMAP 0x7400 + +/* Window attribute table */ +#define WAT_FMT 0x4100 +#define WAT_FMT_24BIT 0 +#define WAT_FMT_16BIT_565 1 +#define WAT_FMT_16BIT_1555 2 +#define WAT_FMT_32BIT 3 /* 0 vs. 3 is a guess */ +#define WAT_FMT_8BIT_332 9 +#define WAT_FMT_8BIT 0xa +#define WAT_FMT_NO_CMAP 4 /* ORd in to other values */ +#define WAT_CMAP_OFFSET 0x4104 /* 4-bit value gets << 6 */ +#define WAT_CTRL 0x4108 +#define WAT_CTRL_SEL_B 1 /* select B buffer if 1 */ +#define WAT_CTRL_NO_INC 2 +#define WAT_GAMMA_CTRL 0x410c +#define WAT_GAMMA_DISABLE 1 /* disables gamma cmap */ +#define WAT_OVL_CTRL 0x430c /* controls overlay */ + +/* Indexed by DFA_PIX_* values */ +static const unsigned char watfmt[] = { + WAT_FMT_8BIT, WAT_FMT_16BIT_565, WAT_FMT_16BIT_1555, 0, + WAT_FMT_24BIT, WAT_FMT_32BIT +}; + +/* Colormap array; 1k entries of 4 bytes each */ +#define CMAP 0x6000 + +#define readreg(par, reg) readl((par)->regs + (reg)) +#define writereg(par, reg, val) writel((val), (par)->regs + (reg)) + +struct gxt4500_par { + void __iomem *regs; + + int pixfmt; /* pixel format, see DFA_PIX_* values */ + + /* PLL parameters */ + int pll_m; /* ref clock divisor */ + int pll_n; /* VCO divisor */ + int pll_pd1; /* first post-divisor */ + int pll_pd2; /* second post-divisor */ + + u32 pseudo_palette[16]; /* used in color blits */ +}; + +/* mode requested by user */ +static char *mode_option; + +/* default mode: 1280x1024 @ 60 Hz, 8 bpp */ +static const struct fb_videomode defaultmode __devinitdata = { + .refresh = 60, + .xres = 1280, + .yres = 1024, + .pixclock = 9295, + .left_margin = 248, + .right_margin = 48, + .upper_margin = 38, + .lower_margin = 1, + .hsync_len = 112, + .vsync_len = 3, + .vmode = FB_VMODE_NONINTERLACED +}; + +/* + * The refclk and VCO dividers appear to use a linear feedback shift + * register, which gets reloaded when it reaches a terminal value, at + * which point the divider output is toggled. Thus one can obtain + * whatever divisor is required by putting the appropriate value into + * the reload register. For a divisor of N, one puts the value from + * the LFSR sequence that comes N-1 places before the terminal value + * into the reload register. + */ + +static const unsigned char mdivtab[] = { +/* 1 */ 0x3f, 0x00, 0x20, 0x10, 0x28, 0x14, 0x2a, 0x15, 0x0a, +/* 10 */ 0x25, 0x32, 0x19, 0x0c, 0x26, 0x13, 0x09, 0x04, 0x22, 0x11, +/* 20 */ 0x08, 0x24, 0x12, 0x29, 0x34, 0x1a, 0x2d, 0x36, 0x1b, 0x0d, +/* 30 */ 0x06, 0x23, 0x31, 0x38, 0x1c, 0x2e, 0x17, 0x0b, 0x05, 0x02, +/* 40 */ 0x21, 0x30, 0x18, 0x2c, 0x16, 0x2b, 0x35, 0x3a, 0x1d, 0x0e, +/* 50 */ 0x27, 0x33, 0x39, 0x3c, 0x1e, 0x2f, 0x37, 0x3b, 0x3d, 0x3e, +/* 60 */ 0x1f, 0x0f, 0x07, 0x03, 0x01, +}; + +static const unsigned char ndivtab[] = { +/* 2 */ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0x78, 0xbc, 0x5e, +/* 10 */ 0x2f, 0x17, 0x0b, 0x85, 0xc2, 0xe1, 0x70, 0x38, 0x9c, 0x4e, +/* 20 */ 0xa7, 0xd3, 0xe9, 0xf4, 0xfa, 0xfd, 0xfe, 0x7f, 0xbf, 0xdf, +/* 30 */ 0xef, 0x77, 0x3b, 0x1d, 0x8e, 0xc7, 0xe3, 0x71, 0xb8, 0xdc, +/* 40 */ 0x6e, 0xb7, 0x5b, 0x2d, 0x16, 0x8b, 0xc5, 0xe2, 0xf1, 0xf8, +/* 50 */ 0xfc, 0x7e, 0x3f, 0x9f, 0xcf, 0x67, 0xb3, 0xd9, 0x6c, 0xb6, +/* 60 */ 0xdb, 0x6d, 0x36, 0x9b, 0x4d, 0x26, 0x13, 0x89, 0xc4, 0x62, +/* 70 */ 0xb1, 0xd8, 0xec, 0xf6, 0xfb, 0x7d, 0xbe, 0x5f, 0xaf, 0x57, +/* 80 */ 0x2b, 0x95, 0x4a, 0x25, 0x92, 0x49, 0xa4, 0x52, 0x29, 0x94, +/* 90 */ 0xca, 0x65, 0xb2, 0x59, 0x2c, 0x96, 0xcb, 0xe5, 0xf2, 0x79, +/* 100 */ 0x3c, 0x1e, 0x0f, 0x07, 0x83, 0x41, 0x20, 0x90, 0x48, 0x24, +/* 110 */ 0x12, 0x09, 0x84, 0x42, 0xa1, 0x50, 0x28, 0x14, 0x8a, 0x45, +/* 120 */ 0xa2, 0xd1, 0xe8, 0x74, 0xba, 0xdd, 0xee, 0xf7, 0x7b, 0x3d, +/* 130 */ 0x9e, 0x4f, 0x27, 0x93, 0xc9, 0xe4, 0x72, 0x39, 0x1c, 0x0e, +/* 140 */ 0x87, 0xc3, 0x61, 0x30, 0x18, 0x8c, 0xc6, 0x63, 0x31, 0x98, +/* 150 */ 0xcc, 0xe6, 0x73, 0xb9, 0x5c, 0x2e, 0x97, 0x4b, 0xa5, 0xd2, +/* 160 */ 0x69, 0xb4, 0xda, 0xed, 0x76, 0xbb, 0x5d, 0xae, 0xd7, 0x6b, +/* 170 */ 0xb5, 0x5a, 0xad, 0x56, 0xab, 0xd5, 0x6a, 0x35, 0x1a, 0x8d, +/* 180 */ 0x46, 0x23, 0x11, 0x88, 0x44, 0x22, 0x91, 0xc8, 0x64, 0x32, +/* 190 */ 0x19, 0x0c, 0x86, 0x43, 0x21, 0x10, 0x08, 0x04, 0x02, 0x81, +/* 200 */ 0x40, 0xa0, 0xd0, 0x68, 0x34, 0x9a, 0xcd, 0x66, 0x33, 0x99, +/* 210 */ 0x4c, 0xa6, 0x53, 0xa9, 0xd4, 0xea, 0x75, 0x3a, 0x9d, 0xce, +/* 220 */ 0xe7, 0xf3, 0xf9, 0x7c, 0x3e, 0x1f, 0x8f, 0x47, 0xa3, 0x51, +/* 230 */ 0xa8, 0x54, 0xaa, 0x55, 0x2a, 0x15, 0x0a, 0x05, 0x82, 0xc1, +/* 240 */ 0x60, 0xb0, 0x58, 0xac, 0xd6, 0xeb, 0xf5, 0x7a, 0xbd, 0xde, +/* 250 */ 0x6f, 0x37, 0x1b, 0x0d, 0x06, 0x03, 0x01, +}; + +#define REF_PERIOD_PS 9259 /* period of reference clock in ps */ + +static int calc_pll(int period_ps, struct gxt4500_par *par) +{ + int m, n, pdiv1, pdiv2, postdiv; + int pll_period, best_error, t; + + /* only deal with range 1MHz - 400MHz */ + if (period_ps < 2500 || period_ps > 1000000) + return -1; + + best_error = 1000000; + for (pdiv1 = 1; pdiv1 <= 8; ++pdiv1) { + for (pdiv2 = 1; pdiv2 <= pdiv1; ++pdiv2) { + postdiv = pdiv1 * pdiv2; + pll_period = (period_ps + postdiv - 1) / postdiv; + /* keep pll in range 500..1250 MHz */ + if (pll_period < 800 || pll_period > 2000) + continue; + for (m = 3; m <= 40; ++m) { + n = REF_PERIOD_PS * m * postdiv / period_ps; + if (n < 5 || n > 256) + continue; + t = REF_PERIOD_PS * m * postdiv / n; + t -= period_ps; + if (t >= 0 && t < best_error) { + par->pll_m = m; + par->pll_n = n; + par->pll_pd1 = pdiv1; + par->pll_pd2 = pdiv2; + best_error = t; + } + } + } + } + if (best_error == 1000000) + return -1; + return 0; +} + +static int calc_pixclock(struct gxt4500_par *par) +{ + return REF_PERIOD_PS * par->pll_m * par->pll_pd1 * par->pll_pd2 + / par->pll_n; +} + +static int gxt4500_var_to_par(struct fb_var_screeninfo *var, + struct gxt4500_par *par) +{ + if (var->xres + var->xoffset > var->xres_virtual || + var->yres + var->yoffset > var->yres_virtual || + var->xres_virtual > 4096) + return -EINVAL; + if ((var->vmode & FB_VMODE_MASK) != FB_VMODE_NONINTERLACED) + return -EINVAL; + + if (calc_pll(var->pixclock, par) < 0) + return -EINVAL; + + switch (var->bits_per_pixel) { + case 32: + if (var->transp.length) + par->pixfmt = DFA_PIX_32BIT; + else + par->pixfmt = DFA_PIX_24BIT; + break; + case 24: + par->pixfmt = DFA_PIX_24BIT; + break; + case 16: + if (var->green.length == 5) + par->pixfmt = DFA_PIX_16BIT_1555; + else + par->pixfmt = DFA_PIX_16BIT_565; + break; + case 8: + par->pixfmt = DFA_PIX_8BIT; + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct fb_bitfield eightbits = {0, 8}; +static const struct fb_bitfield nobits = {0, 0}; + +static void gxt4500_unpack_pixfmt(struct fb_var_screeninfo *var, + int pixfmt) +{ + var->bits_per_pixel = pixsize[pixfmt] * 8; + var->red = eightbits; + var->green = eightbits; + var->blue = eightbits; + var->transp = nobits; + + switch (pixfmt) { + case DFA_PIX_16BIT_565: + var->red.length = 5; + var->green.length = 6; + var->blue.length = 5; + break; + case DFA_PIX_16BIT_1555: + var->red.length = 5; + var->green.length = 5; + var->blue.length = 5; + var->transp.length = 1; + break; + case DFA_PIX_32BIT: + var->transp.length = 8; + break; + } + if (pixfmt != DFA_PIX_8BIT) { + var->green.offset = var->red.length; + var->blue.offset = var->green.offset + var->green.length; + if (var->transp.length) + var->transp.offset = + var->blue.offset + var->blue.length; + } +} + +static int gxt4500_check_var(struct fb_var_screeninfo *var, + struct fb_info *info) +{ + struct gxt4500_par par; + int err; + + par = *(struct gxt4500_par *)info->par; + err = gxt4500_var_to_par(var, &par); + if (!err) { + var->pixclock = calc_pixclock(&par); + gxt4500_unpack_pixfmt(var, par.pixfmt); + } + return err; +} + +static int gxt4500_set_par(struct fb_info *info) +{ + struct gxt4500_par *par = info->par; + struct fb_var_screeninfo *var = &info->var; + int err; + u32 ctrlreg; + unsigned int dfa_ctl, pixfmt, stride; + unsigned int wid_tiles, i; + unsigned int prefetch_pix, htot; + struct gxt4500_par save_par; + + save_par = *par; + err = gxt4500_var_to_par(var, par); + if (err) { + *par = save_par; + return err; + } + + /* turn off DTG for now */ + ctrlreg = readreg(par, DTG_CONTROL); + ctrlreg &= ~(DTG_CTL_ENABLE | DTG_CTL_SCREEN_REFRESH); + writereg(par, DTG_CONTROL, ctrlreg); + + /* set PLL registers */ + writereg(par, PLL_M, mdivtab[par->pll_m - 1]); + writereg(par, PLL_N, ndivtab[par->pll_n - 2]); + writereg(par, PLL_POSTDIV, + ((8 - par->pll_pd1) << 3) | (8 - par->pll_pd2)); + msleep(20); + + /* turn off hardware cursor */ + writereg(par, CURSOR_MODE, CURSOR_MODE_OFF); + + /* reset raster engine */ + writereg(par, CTRL_REG0, CR0_RASTER_RESET | (CR0_RASTER_RESET << 16)); + udelay(10); + writereg(par, CTRL_REG0, CR0_RASTER_RESET << 16); + + /* set display timing generator registers */ + htot = var->xres + var->left_margin + var->right_margin + + var->hsync_len; + writereg(par, DTG_HORIZ_EXTENT, htot - 1); + writereg(par, DTG_HORIZ_DISPLAY, var->xres - 1); + writereg(par, DTG_HSYNC_START, var->xres + var->right_margin - 1); + writereg(par, DTG_HSYNC_END, + var->xres + var->right_margin + var->hsync_len - 1); + writereg(par, DTG_HSYNC_END_COMP, + var->xres + var->right_margin + var->hsync_len - 1); + writereg(par, DTG_VERT_EXTENT, + var->yres + var->upper_margin + var->lower_margin + + var->vsync_len - 1); + writereg(par, DTG_VERT_DISPLAY, var->yres - 1); + writereg(par, DTG_VSYNC_START, var->yres + var->lower_margin - 1); + writereg(par, DTG_VSYNC_END, + var->yres + var->lower_margin + var->vsync_len - 1); + prefetch_pix = 3300000 / var->pixclock; + if (prefetch_pix >= htot) + prefetch_pix = htot - 1; + writereg(par, DTG_VERT_SHORT, htot - prefetch_pix - 1); + ctrlreg |= DTG_CTL_ENABLE | DTG_CTL_SCREEN_REFRESH; + writereg(par, DTG_CONTROL, ctrlreg); + + /* calculate stride in DFA aperture */ + if (var->xres_virtual > 2048) { + stride = 4096; + dfa_ctl = DFA_FB_STRIDE_4k; + } else if (var->xres_virtual > 1024) { + stride = 2048; + dfa_ctl = DFA_FB_STRIDE_2k; + } else { + stride = 1024; + dfa_ctl = DFA_FB_STRIDE_1k; + } + + /* Set up framebuffer definition */ + wid_tiles = (var->xres_virtual + 63) >> 6; + + /* XXX add proper FB allocation here someday */ + writereg(par, FB_AB_CTRL, FB_CTRL_TYPE | (wid_tiles << 16) | 0); + writereg(par, REFRESH_AB_CTRL, FB_CTRL_TYPE | (wid_tiles << 16) | 0); + writereg(par, FB_CD_CTRL, FB_CTRL_TYPE | (wid_tiles << 16) | 0); + writereg(par, REFRESH_CD_CTRL, FB_CTRL_TYPE | (wid_tiles << 16) | 0); + writereg(par, REFRESH_START, (var->xoffset << 16) | var->yoffset); + writereg(par, REFRESH_SIZE, (var->xres << 16) | var->yres); + + /* Set up framebuffer access by CPU */ + + pixfmt = par->pixfmt; + dfa_ctl |= DFA_FB_ENABLE | pixfmt; + writereg(par, DFA_FB_A, dfa_ctl); + + /* + * Set up window attribute table. + * We set all WAT entries the same so it doesn't matter what the + * window ID (WID) plane contains. + */ + for (i = 0; i < 32; ++i) { + writereg(par, WAT_FMT + (i << 4), watfmt[pixfmt]); + writereg(par, WAT_CMAP_OFFSET + (i << 4), 0); + writereg(par, WAT_CTRL + (i << 4), 0); + writereg(par, WAT_GAMMA_CTRL + (i << 4), WAT_GAMMA_DISABLE); + } + + /* Set sync polarity etc. */ + ctrlreg = readreg(par, SYNC_CTL) & + ~(SYNC_CTL_SYNC_ON_RGB | SYNC_CTL_HSYNC_INV | + SYNC_CTL_VSYNC_INV); + if (var->sync & FB_SYNC_ON_GREEN) + ctrlreg |= SYNC_CTL_SYNC_ON_RGB; + if (!(var->sync & FB_SYNC_HOR_HIGH_ACT)) + ctrlreg |= SYNC_CTL_HSYNC_INV; + if (!(var->sync & FB_SYNC_VERT_HIGH_ACT)) + ctrlreg |= SYNC_CTL_VSYNC_INV; + writereg(par, SYNC_CTL, ctrlreg); + + info->fix.line_length = stride * pixsize[pixfmt]; + info->fix.visual = (pixfmt == DFA_PIX_8BIT)? FB_VISUAL_PSEUDOCOLOR: + FB_VISUAL_DIRECTCOLOR; + + return 0; +} + +static int gxt4500_setcolreg(unsigned int reg, unsigned int red, + unsigned int green, unsigned int blue, + unsigned int transp, struct fb_info *info) +{ + u32 cmap_entry; + struct gxt4500_par *par = info->par; + + if (reg > 1023) + return 1; + cmap_entry = ((transp & 0xff00) << 16) | ((blue & 0xff00) << 8) | + (green & 0xff00) | (red >> 8); + writereg(par, CMAP + reg * 4, cmap_entry); + + if (reg < 16 && par->pixfmt != DFA_PIX_8BIT) { + u32 *pal = info->pseudo_palette; + u32 val = reg; + switch (par->pixfmt) { + case DFA_PIX_16BIT_565: + val |= (reg << 11) | (reg << 6); + break; + case DFA_PIX_16BIT_1555: + val |= (reg << 10) | (reg << 5); + break; + case DFA_PIX_32BIT: + val |= (reg << 24); + /* fall through */ + case DFA_PIX_24BIT: + val |= (reg << 16) | (reg << 8); + break; + } + pal[reg] = val; + } + + return 0; +} + +static int gxt4500_pan_display(struct fb_var_screeninfo *var, + struct fb_info *info) +{ + struct gxt4500_par *par = info->par; + + if (var->xoffset & 7) + return -EINVAL; + if (var->xoffset + var->xres > var->xres_virtual || + var->yoffset + var->yres > var->yres_virtual) + return -EINVAL; + + writereg(par, REFRESH_START, (var->xoffset << 16) | var->yoffset); + return 0; +} + +static int gxt4500_blank(int blank, struct fb_info *info) +{ + struct gxt4500_par *par = info->par; + int ctrl, dctl; + + ctrl = readreg(par, SYNC_CTL); + ctrl &= ~(SYNC_CTL_SYNC_OFF | SYNC_CTL_HSYNC_OFF | SYNC_CTL_VSYNC_OFF); + dctl = readreg(par, DISP_CTL); + dctl |= DISP_CTL_OFF; + switch (blank) { + case FB_BLANK_UNBLANK: + dctl &= ~DISP_CTL_OFF; + break; + case FB_BLANK_POWERDOWN: + ctrl |= SYNC_CTL_SYNC_OFF; + break; + case FB_BLANK_HSYNC_SUSPEND: + ctrl |= SYNC_CTL_HSYNC_OFF; + break; + case FB_BLANK_VSYNC_SUSPEND: + ctrl |= SYNC_CTL_VSYNC_OFF; + break; + default: ; + } + writereg(par, SYNC_CTL, ctrl); + writereg(par, DISP_CTL, dctl); + + return 0; +} + +static const struct fb_fix_screeninfo gxt4500_fix __devinitdata = { + .id = "IBM GXT4500P", + .type = FB_TYPE_PACKED_PIXELS, + .visual = FB_VISUAL_PSEUDOCOLOR, + .xpanstep = 8, + .ypanstep = 1, + .mmio_len = 0x20000, +}; + +static struct fb_ops gxt4500_ops = { + .owner = THIS_MODULE, + .fb_check_var = gxt4500_check_var, + .fb_set_par = gxt4500_set_par, + .fb_setcolreg = gxt4500_setcolreg, + .fb_pan_display = gxt4500_pan_display, + .fb_blank = gxt4500_blank, + .fb_fillrect = cfb_fillrect, + .fb_copyarea = cfb_copyarea, + .fb_imageblit = cfb_imageblit, +}; + +/* PCI functions */ +static int __devinit gxt4500_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int err; + unsigned long reg_phys, fb_phys; + struct gxt4500_par *par; + struct fb_info *info; + struct fb_var_screeninfo var; + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "gxt4500: cannot enable PCI device: %d\n", + err); + return err; + } + + reg_phys = pci_resource_start(pdev, 0); + if (!request_mem_region(reg_phys, pci_resource_len(pdev, 0), + "gxt4500 regs")) { + dev_err(&pdev->dev, "gxt4500: cannot get registers\n"); + goto err_nodev; + } + + fb_phys = pci_resource_start(pdev, 1); + if (!request_mem_region(fb_phys, pci_resource_len(pdev, 1), + "gxt4500 FB")) { + dev_err(&pdev->dev, "gxt4500: cannot get framebuffer\n"); + goto err_free_regs; + } + + info = framebuffer_alloc(sizeof(struct gxt4500_par), &pdev->dev); + if (!info) { + dev_err(&pdev->dev, "gxt4500: cannot alloc FB info record"); + goto err_free_fb; + } + par = info->par; + info->fix = gxt4500_fix; + info->pseudo_palette = par->pseudo_palette; + + info->fix.mmio_start = reg_phys; + par->regs = ioremap(reg_phys, pci_resource_len(pdev, 0)); + if (!par->regs) { + dev_err(&pdev->dev, "gxt4500: cannot map registers\n"); + goto err_free_all; + } + + info->fix.smem_start = fb_phys; + info->fix.smem_len = pci_resource_len(pdev, 1); + info->screen_base = ioremap(fb_phys, pci_resource_len(pdev, 1)); + if (!info->screen_base) { + dev_err(&pdev->dev, "gxt4500: cannot map framebuffer\n"); + goto err_unmap_regs; + } + + pci_set_drvdata(pdev, info); + + /* Set byte-swapping for DFA aperture for all pixel sizes */ + pci_write_config_dword(pdev, CFG_ENDIAN0, 0x333300); + + info->fbops = &gxt4500_ops; + info->flags = FBINFO_FLAG_DEFAULT; + + err = fb_alloc_cmap(&info->cmap, 256, 0); + if (err) { + dev_err(&pdev->dev, "gxt4500: cannot allocate cmap\n"); + goto err_unmap_all; + } + + gxt4500_blank(FB_BLANK_UNBLANK, info); + + if (!fb_find_mode(&var, info, mode_option, NULL, 0, &defaultmode, 8)) { + dev_err(&pdev->dev, "gxt4500: cannot find valid video mode\n"); + goto err_free_cmap; + } + info->var = var; + if (gxt4500_set_par(info)) { + printk(KERN_ERR "gxt4500: cannot set video mode\n"); + goto err_free_cmap; + } + + if (register_framebuffer(info) < 0) { + dev_err(&pdev->dev, "gxt4500: cannot register framebuffer\n"); + goto err_free_cmap; + } + printk(KERN_INFO "fb%d: %s frame buffer device\n", + info->node, info->fix.id); + + return 0; + + err_free_cmap: + fb_dealloc_cmap(&info->cmap); + err_unmap_all: + iounmap(info->screen_base); + err_unmap_regs: + iounmap(par->regs); + err_free_all: + framebuffer_release(info); + err_free_fb: + release_mem_region(fb_phys, pci_resource_len(pdev, 1)); + err_free_regs: + release_mem_region(reg_phys, pci_resource_len(pdev, 0)); + err_nodev: + return -ENODEV; +} + +static void __devexit gxt4500_remove(struct pci_dev *pdev) +{ + struct fb_info *info = pci_get_drvdata(pdev); + struct gxt4500_par *par; + + if (!info) + return; + par = info->par; + unregister_framebuffer(info); + fb_dealloc_cmap(&info->cmap); + iounmap(par->regs); + iounmap(info->screen_base); + release_mem_region(pci_resource_start(pdev, 0), + pci_resource_len(pdev, 0)); + release_mem_region(pci_resource_start(pdev, 1), + pci_resource_len(pdev, 1)); + framebuffer_release(info); +} + +/* supported chipsets */ +static const struct pci_device_id gxt4500_pci_tbl[] = { + { PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_GXT4500P, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, gxt4500_pci_tbl); + +static struct pci_driver gxt4500_driver = { + .name = "gxt4500", + .id_table = gxt4500_pci_tbl, + .probe = gxt4500_probe, + .remove = __devexit_p(gxt4500_remove), +}; + +static int __devinit gxt4500_init(void) +{ +#ifndef MODULE + if (fb_get_options("gxt4500", &mode_option)) + return -ENODEV; +#endif + + return pci_register_driver(&gxt4500_driver); +} +module_init(gxt4500_init); + +static void __exit gxt4500_exit(void) +{ + pci_unregister_driver(&gxt4500_driver); +} +module_exit(gxt4500_exit); + +MODULE_AUTHOR("Paul Mackerras <paulus@samba.org>"); +MODULE_DESCRIPTION("FBDev driver for IBM GXT4500P"); +MODULE_LICENSE("GPL"); +module_param(mode_option, charp, 0); +MODULE_PARM_DESC(mode_option, "Specify resolution as \"<xres>x<yres>[-<bpp>][@<refresh>]\""); |