diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-05 09:49:07 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-05 09:49:07 -0800 |
commit | 7b626acb8f983eb83b396ab96cc24b18d635d487 (patch) | |
tree | 8c3320191311e6186d3aa722f1acd12acd47ece8 | |
parent | 1ebb275afcf5a47092e995541d6c604eef96062a (diff) | |
parent | 4528752f49c1f4025473d12bc5fa9181085c3f22 (diff) |
Merge branch 'core-iommu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-iommu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (63 commits)
x86, Calgary IOMMU quirk: Find nearest matching Calgary while walking up the PCI tree
x86/amd-iommu: Remove amd_iommu_pd_table
x86/amd-iommu: Move reset_iommu_command_buffer out of locked code
x86/amd-iommu: Cleanup DTE flushing code
x86/amd-iommu: Introduce iommu_flush_device() function
x86/amd-iommu: Cleanup attach/detach_device code
x86/amd-iommu: Keep devices per domain in a list
x86/amd-iommu: Add device bind reference counting
x86/amd-iommu: Use dev->arch->iommu to store iommu related information
x86/amd-iommu: Remove support for domain sharing
x86/amd-iommu: Rearrange dma_ops related functions
x86/amd-iommu: Move some pte allocation functions in the right section
x86/amd-iommu: Remove iommu parameter from dma_ops_domain_alloc
x86/amd-iommu: Use get_device_id and check_device where appropriate
x86/amd-iommu: Move find_protection_domain to helper functions
x86/amd-iommu: Simplify get_device_resources()
x86/amd-iommu: Let domain_for_device handle aliases
x86/amd-iommu: Remove iommu specific handling from dma_ops path
x86/amd-iommu: Remove iommu parameter from __(un)map_single
x86/amd-iommu: Make alloc_new_range aware of multiple IOMMUs
...
33 files changed, 1068 insertions, 883 deletions
diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h index dcbaea7ce12..f0acde68aae 100644 --- a/arch/ia64/include/asm/swiotlb.h +++ b/arch/ia64/include/asm/swiotlb.h @@ -4,8 +4,6 @@ #include <linux/dma-mapping.h> #include <linux/swiotlb.h> -extern int swiotlb_force; - #ifdef CONFIG_SWIOTLB extern int swiotlb; extern void pci_swiotlb_init(void); diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c index 285aae8431c..53292abf846 100644 --- a/arch/ia64/kernel/pci-swiotlb.c +++ b/arch/ia64/kernel/pci-swiotlb.c @@ -41,7 +41,7 @@ struct dma_map_ops swiotlb_dma_ops = { void __init swiotlb_dma_init(void) { dma_ops = &swiotlb_dma_ops; - swiotlb_init(); + swiotlb_init(1); } void __init pci_swiotlb_init(void) @@ -51,7 +51,7 @@ void __init pci_swiotlb_init(void) swiotlb = 1; printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n"); machvec_init("dig"); - swiotlb_init(); + swiotlb_init(1); dma_ops = &swiotlb_dma_ops; #else panic("Unable to find Intel IOMMU"); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 53bcf3d792d..b152de3e64d 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -345,7 +345,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_SWIOTLB if (ppc_swiotlb_enable) - swiotlb_init(); + swiotlb_init(1); #endif paging_init(); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 04f638d82fb..df2c9e932b3 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -550,7 +550,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_SWIOTLB if (ppc_swiotlb_enable) - swiotlb_init(); + swiotlb_init(1); #endif paging_init(); diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 4b180897e6b..5af2982133b 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -23,19 +23,13 @@ #include <linux/irqreturn.h> #ifdef CONFIG_AMD_IOMMU -extern int amd_iommu_init(void); -extern int amd_iommu_init_dma_ops(void); -extern int amd_iommu_init_passthrough(void); + extern void amd_iommu_detect(void); -extern irqreturn_t amd_iommu_int_handler(int irq, void *data); -extern void amd_iommu_flush_all_domains(void); -extern void amd_iommu_flush_all_devices(void); -extern void amd_iommu_shutdown(void); -extern void amd_iommu_apply_erratum_63(u16 devid); + #else -static inline int amd_iommu_init(void) { return -ENODEV; } + static inline void amd_iommu_detect(void) { } -static inline void amd_iommu_shutdown(void) { } + #endif #endif /* _ASM_X86_AMD_IOMMU_H */ diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h new file mode 100644 index 00000000000..84786fb9a23 --- /dev/null +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2009 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_AMD_IOMMU_PROTO_H +#define _ASM_X86_AMD_IOMMU_PROTO_H + +struct amd_iommu; + +extern int amd_iommu_init_dma_ops(void); +extern int amd_iommu_init_passthrough(void); +extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +extern void amd_iommu_flush_all_domains(void); +extern void amd_iommu_flush_all_devices(void); +extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); + +#ifndef CONFIG_AMD_IOMMU_STATS + +static inline void amd_iommu_stats_init(void) { } + +#endif /* !CONFIG_AMD_IOMMU_STATS */ + +#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 2a2cc7a78a8..ba19ad4c47d 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -25,6 +25,11 @@ #include <linux/spinlock.h> /* + * Maximum number of IOMMUs supported + */ +#define MAX_IOMMUS 32 + +/* * some size calculation constants */ #define DEV_TABLE_ENTRY_SIZE 32 @@ -206,6 +211,9 @@ extern bool amd_iommu_dump; printk(KERN_INFO "AMD-Vi: " format, ## arg); \ } while(0); +/* global flag if IOMMUs cache non-present entries */ +extern bool amd_iommu_np_cache; + /* * Make iterating over all IOMMUs easier */ @@ -226,6 +234,8 @@ extern bool amd_iommu_dump; * independent of their use. */ struct protection_domain { + struct list_head list; /* for list of all protection domains */ + struct list_head dev_list; /* List of all devices in this domain */ spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ @@ -233,7 +243,20 @@ struct protection_domain { unsigned long flags; /* flags to find out type of domain */ bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ + unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ void *priv; /* private data */ + +}; + +/* + * This struct contains device specific data for the IOMMU + */ +struct iommu_dev_data { + struct list_head list; /* For domain->dev_list */ + struct device *dev; /* Device this data belong to */ + struct device *alias; /* The Alias Device */ + struct protection_domain *domain; /* Domain the device is bound to */ + atomic_t bind; /* Domain attach reverent count */ }; /* @@ -291,6 +314,9 @@ struct dma_ops_domain { struct amd_iommu { struct list_head list; + /* Index within the IOMMU array */ + int index; + /* locks the accesses to the hardware */ spinlock_t lock; @@ -357,6 +383,21 @@ struct amd_iommu { extern struct list_head amd_iommu_list; /* + * Array with pointers to each IOMMU struct + * The indices are referenced in the protection domains + */ +extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; + +/* Number of IOMMUs present in the system */ +extern int amd_iommus_present; + +/* + * Declarations for the global list of all protection domains + */ +extern spinlock_t amd_iommu_pd_lock; +extern struct list_head amd_iommu_pd_list; + +/* * Structure defining one entry in the device table */ struct dev_table_entry { @@ -416,15 +457,9 @@ extern unsigned amd_iommu_aperture_order; /* largest PCI device id we expect translation requests for */ extern u16 amd_iommu_last_bdf; -/* data structures for protection domain handling */ -extern struct protection_domain **amd_iommu_pd_table; - /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; -/* will be 1 if device isolation is enabled */ -extern bool amd_iommu_isolate; - /* * If true, the addresses will be flushed on unmap time, not when * they are reused @@ -462,11 +497,6 @@ struct __iommu_counter { #define ADD_STATS_COUNTER(name, x) #define SUB_STATS_COUNTER(name, x) -static inline void amd_iommu_stats_init(void) { } - #endif /* CONFIG_AMD_IOMMU_STATS */ -/* some function prototypes */ -extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); - #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h index b03bedb62aa..0918654305a 100644 --- a/arch/x86/include/asm/calgary.h +++ b/arch/x86/include/asm/calgary.h @@ -62,10 +62,8 @@ struct cal_chipset_ops { extern int use_calgary; #ifdef CONFIG_CALGARY_IOMMU -extern int calgary_iommu_init(void); extern void detect_calgary(void); #else -static inline int calgary_iommu_init(void) { return 1; } static inline void detect_calgary(void) { return; } #endif diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index cee34e9ca45..029f230ab63 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -8,7 +8,7 @@ struct dev_archdata { #ifdef CONFIG_X86_64 struct dma_map_ops *dma_ops; #endif -#ifdef CONFIG_DMAR +#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU) void *iommu; /* hook for IOMMU specific extension */ #endif }; diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 6a25d5d4283..0f6c02f3b7d 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -20,7 +20,8 @@ # define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) #endif -extern dma_addr_t bad_dma_address; +#define DMA_ERROR_CODE 0 + extern int iommu_merge; extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; @@ -48,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); - return (dma_addr == bad_dma_address); + return (dma_addr == DMA_ERROR_CODE); } #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 6cfdafa409d..4ac5b0f33fc 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed; extern int gart_iommu_aperture_disabled; extern void early_gart_iommu_check(void); -extern void gart_iommu_init(void); -extern void gart_iommu_shutdown(void); +extern int gart_iommu_init(void); extern void __init gart_parse_options(char *); extern void gart_iommu_hole_init(void); @@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void); static inline void early_gart_iommu_check(void) { } -static inline void gart_iommu_init(void) -{ -} -static inline void gart_iommu_shutdown(void) -{ -} static inline void gart_parse_options(char *options) { } diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index fd6d21bbee6..345c99cef15 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H -extern void pci_iommu_shutdown(void); -extern void no_iommu_init(void); extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index b9e4e20174f..87ffcb12a1b 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -3,17 +3,14 @@ #include <linux/swiotlb.h> -/* SWIOTLB interface */ - -extern int swiotlb_force; - #ifdef CONFIG_SWIOTLB extern int swiotlb; -extern void pci_swiotlb_init(void); +extern int pci_swiotlb_init(void); #else #define swiotlb 0 -static inline void pci_swiotlb_init(void) +static inline int pci_swiotlb_init(void) { + return 0; } #endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2c756fd4ab0..d8e71459f02 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -91,6 +91,14 @@ struct x86_init_timers { }; /** + * struct x86_init_iommu - platform specific iommu setup + * @iommu_init: platform specific iommu setup + */ +struct x86_init_iommu { + int (*iommu_init)(void); +}; + +/** * struct x86_init_ops - functions for platform specific setup * */ @@ -101,6 +109,7 @@ struct x86_init_ops { struct x86_init_oem oem; struct x86_init_paging paging; struct x86_init_timers timers; + struct x86_init_iommu iommu; }; /** @@ -121,6 +130,7 @@ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); + void (*iommu_shutdown)(void); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0285521e0a9..32fb09102a1 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -28,6 +28,7 @@ #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> +#include <asm/amd_iommu_proto.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> @@ -56,20 +57,115 @@ struct iommu_cmd { u32 data[4]; }; -static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, - struct unity_map_entry *e); -static struct dma_ops_domain *find_protection_domain(u16 devid); -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, int end_lvl, - u64 **pte_page, gfp_t gfp); -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages); static void reset_iommu_command_buffer(struct amd_iommu *iommu); -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size); static void update_domain(struct protection_domain *domain); +/**************************************************************************** + * + * Helper functions + * + ****************************************************************************/ + +static inline u16 get_device_id(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return calc_devid(pdev->bus->number, pdev->devfn); +} + +static struct iommu_dev_data *get_dev_data(struct device *dev) +{ + return dev->archdata.iommu; +} + +/* + * In this function the list of preallocated protection domains is traversed to + * find the domain for a specific device + */ +static struct dma_ops_domain *find_protection_domain(u16 devid) +{ + struct dma_ops_domain *entry, *ret = NULL; + unsigned long flags; + u16 alias = amd_iommu_alias_table[devid]; + + if (list_empty(&iommu_pd_list)) + return NULL; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + + list_for_each_entry(entry, &iommu_pd_list, list) { + if (entry->target_dev == devid || + entry->target_dev == alias) { + ret = entry; + break; + } + } + + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + return ret; +} + +/* + * This function checks if the driver got a valid device from the caller to + * avoid dereferencing invalid pointers. + */ +static bool check_device(struct device *dev) +{ + u16 devid; + + if (!dev || !dev->dma_mask) + return false; + + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return false; + + devid = get_device_id(dev); + + /* Out of our scope? */ + if (devid > amd_iommu_last_bdf) + return false; + + if (amd_iommu_rlookup_table[devid] == NULL) + return false; + + return true; +} + +static int iommu_init_device(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct pci_dev *pdev; + u16 devid, alias; + + if (dev->archdata.iommu) + return 0; + + dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + dev_data->dev = dev; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); + if (pdev) + dev_data->alias = &pdev->dev; + + atomic_set(&dev_data->bind, 0); + + dev->archdata.iommu = dev_data; + + + return 0; +} + +static void iommu_uninit_device(struct device *dev) +{ + kfree(dev->archdata.iommu); +} #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -90,7 +186,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem); DECLARE_STATS_COUNTER(total_map_requests); static struct dentry *stats_dir; -static struct dentry *de_isolate; static struct dentry *de_fflush; static void amd_iommu_stats_add(struct __iommu_counter *cnt) @@ -108,9 +203,6 @@ static void amd_iommu_stats_init(void) if (stats_dir == NULL) return; - de_isolate = debugfs_create_bool("isolation", 0444, stats_dir, - (u32 *)&amd_iommu_isolate); - de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, (u32 *)&amd_iommu_unmap_flush); @@ -130,12 +222,6 @@ static void amd_iommu_stats_init(void) #endif -/* returns !0 if the IOMMU is caching non-present entries in its TLB */ -static int iommu_has_npcache(struct amd_iommu *iommu) -{ - return iommu->cap & (1UL << IOMMU_CAP_NPCACHE); -} - /**************************************************************************** * * Interrupt handling functions @@ -199,6 +285,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + iommu->reset_in_progress = true; reset_iommu_command_buffer(iommu); dump_command(address); break; @@ -321,11 +408,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely(i == EXIT_LOOP_COUNT)) { - spin_unlock(&iommu->lock); - reset_iommu_command_buffer(iommu); - spin_lock(&iommu->lock); - } + if (unlikely(i == EXIT_LOOP_COUNT)) + iommu->reset_in_progress = true; } /* @@ -372,26 +456,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu) out: spin_unlock_irqrestore(&iommu->lock, flags); + if (iommu->reset_in_progress) + reset_iommu_command_buffer(iommu); + return 0; } +static void iommu_flush_complete(struct protection_domain *domain) +{ + int i; + + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + iommu_completion_wait(amd_iommus[i]); + } +} + /* * Command send function for invalidating a device table entry */ -static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) +static int iommu_flush_device(struct device *dev) { + struct amd_iommu *iommu; struct iommu_cmd cmd; - int ret; + u16 devid; - BUG_ON(iommu == NULL); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + /* Build command */ memset(&cmd, 0, sizeof(cmd)); CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); cmd.data[0] = devid; - ret = iommu_queue_command(iommu, &cmd); - - return ret; + return iommu_queue_command(iommu, &cmd); } static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, @@ -430,11 +534,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, * It invalidates a single PTE if the range to flush is within a single * page. Otherwise it flushes the whole TLB of the IOMMU. */ -static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, - u64 address, size_t size) +static void __iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size, int pde) { - int s = 0; - unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); + int s = 0, i; + unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -447,142 +551,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, s = 1; } - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s); - return 0; + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need a TLB flush + */ + iommu_queue_inv_iommu_pages(amd_iommus[i], address, + domain->id, pde, s); + } + + return; } -/* Flush the whole IO/TLB for a given protection domain */ -static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); + __iommu_flush_pages(domain, address, size, 0); +} - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); +/* Flush the whole IO/TLB for a given protection domain */ +static void iommu_flush_tlb(struct protection_domain *domain) +{ + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_tlb_pde(struct protection_domain *domain) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); - - iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } + /* - * This function flushes one domain on one IOMMU + * This function flushes the DTEs for all devices in domain */ -static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_domain_devices(struct protection_domain *domain) { - struct iommu_cmd cmd; + struct iommu_dev_data *dev_data; unsigned long flags; - __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - domid, 1, 1); + spin_lock_irqsave(&domain->lock, flags); - spin_lock_irqsave(&iommu->lock, flags); - __iommu_queue_command(iommu, &cmd); - __iommu_completion_wait(iommu); - __iommu_wait_for_completion(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); + list_for_each_entry(dev_data, &domain->dev_list, list) + iommu_flush_device(dev_data->dev); + + spin_unlock_irqrestore(&domain->lock, flags); } -static void flush_all_domains_on_iommu(struct amd_iommu *iommu) +static void iommu_flush_all_domain_devices(void) { - int i; + struct protection_domain *domain; + unsigned long flags; - for (i = 1; i < MAX_DOMAIN_ID; ++i) { - if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) - continue; - flush_domain_on_iommu(iommu, i); + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + iommu_flush_domain_devices(domain); + iommu_flush_complete(domain); } + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +void amd_iommu_flush_all_devices(void) +{ + iommu_flush_all_domain_devices(); } /* - * This function is used to flush the IO/TLB for a given protection domain - * on every IOMMU in the system + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume. */ -static void iommu_flush_domain(u16 domid) +void amd_iommu_flush_all_domains(void) { - struct amd_iommu *iommu; + struct protection_domain *domain; + unsigned long flags; - INC_STATS_COUNTER(domain_flush_all); + spin_lock_irqsave(&amd_iommu_pd_lock, flags); - for_each_iommu(iommu) - flush_domain_on_iommu(iommu, domid); + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + spin_lock(&domain->lock); + iommu_flush_tlb_pde(domain); + iommu_flush_complete(domain); + spin_unlock(&domain->lock); + } + + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } -void amd_iommu_flush_all_domains(void) +static void reset_iommu_command_buffer(struct amd_iommu *iommu) { - struct amd_iommu *iommu; + pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); - for_each_iommu(iommu) - flush_all_domains_on_iommu(iommu); + if (iommu->reset_in_progress) + panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + + amd_iommu_reset_cmd_buffer(iommu); + amd_iommu_flush_all_devices(); + amd_iommu_flush_all_domains(); + + iommu->reset_in_progress = false; } -static void flush_all_devices_for_iommu(struct amd_iommu *iommu) +/**************************************************************************** + * + * The functions below are used the create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +/* + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. + */ +static bool increase_address_space(struct protection_domain *domain, + gfp_t gfp) { - int i; + u64 *pte; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (iommu != amd_iommu_rlookup_table[i]) - continue; + if (domain->mode == PAGE_MODE_6_LEVEL) + /* address space already 64 bit large */ + return false; - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); - } + pte = (void *)get_zeroed_page(gfp); + if (!pte) + return false; + + *pte = PM_LEVEL_PDE(domain->mode, + virt_to_phys(domain->pt_root)); + domain->pt_root = pte; + domain->mode += 1; + domain->updated = true; + + return true; } -static void flush_devices_by_domain(struct protection_domain *domain) +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + int end_lvl, + u64 **pte_page, + gfp_t gfp) { - struct amd_iommu *iommu; - int i; + u64 *pte, *page; + int level; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || - (amd_iommu_pd_table[i] != domain)) - continue; + while (address > PM_LEVEL_SIZE(domain->mode)) + increase_address_space(domain, gfp); - iommu = amd_iommu_rlookup_table[i]; - if (!iommu) - continue; + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); + while (level > end_lvl) { + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); + } + + level -= 1; + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page && level == end_lvl) + *pte_page = pte; + + pte = &pte[PM_LEVEL_INDEX(level, address)]; } + + return pte; } -static void reset_iommu_command_buffer(struct amd_iommu *iommu) +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size) { - pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); + int level; + u64 *pte; - if (iommu->reset_in_progress) - panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - iommu->reset_in_progress = true; + while (level > map_size) { + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; - amd_iommu_reset_cmd_buffer(iommu); - flush_all_devices_for_iommu(iommu); - flush_all_domains_on_iommu(iommu); + level -= 1; - iommu->reset_in_progress = false; -} + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; -void amd_iommu_flush_all_devices(void) -{ - flush_devices_by_domain(NULL); -} + if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { + pte = NULL; + break; + } + } -/**************************************************************************** - * - * The functions below are used the create the page table mappings for - * unity mapped regions. - * - ****************************************************************************/ + return pte; +} /* * Generic mapping functions. It maps a physical address into a DMA @@ -654,28 +828,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, } /* - * Init the unity mappings for a specific IOMMU in the system - * - * Basically iterates over all unity mapping entries and applies them to - * the default domain DMA of that IOMMU if necessary. - */ -static int iommu_init_unity_mappings(struct amd_iommu *iommu) -{ - struct unity_map_entry *entry; - int ret; - - list_for_each_entry(entry, &amd_iommu_unity_map, list) { - if (!iommu_for_unity_map(iommu, entry)) - continue; - ret = dma_ops_unity_map(iommu->default_dom, entry); - if (ret) - return ret; - } - - return 0; -} - -/* * This function actually applies the mapping to the page table of the * dma_ops domain. */ @@ -704,6 +856,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, } /* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary. + */ +static int iommu_init_unity_mappings(struct amd_iommu *iommu) +{ + struct unity_map_entry *entry; + int ret; + + list_for_each_entry(entry, &amd_iommu_unity_map, list) { + if (!iommu_for_unity_map(iommu, entry)) + continue; + ret = dma_ops_unity_map(iommu->default_dom, entry); + if (ret) + return ret; + } + + return 0; +} + +/* * Inits the unity mappings required for a specific device */ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, @@ -740,34 +914,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, */ /* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. */ -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size) +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages) { - int level; - u64 *pte; - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > map_size) { - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - level -= 1; + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; + if (start_page + pages > last_page) + pages = last_page - start_page; - if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { - pte = NULL; - break; - } + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); } - - return pte; } /* @@ -775,11 +938,11 @@ static u64 *fetch_pte(struct protection_domain *domain, * aperture in case of dma_ops domain allocation or address allocation * failure. */ -static int alloc_new_range(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static int alloc_new_range(struct dma_ops_domain *dma_dom, bool populate, gfp_t gfp) { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + struct amd_iommu *iommu; int i; #ifdef CONFIG_IOMMU_STRESS @@ -819,14 +982,17 @@ static int alloc_new_range(struct amd_iommu *iommu, dma_dom->aperture_size += APERTURE_RANGE_SIZE; /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start >= dma_dom->aperture[index]->offset && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); + for_each_iommu(iommu) { + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset + && iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + startpage = iommu->exclusion_start >> PAGE_SHIFT; + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } } /* @@ -928,7 +1094,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, } if (unlikely(address == -1)) - address = bad_dma_address; + address = DMA_ERROR_CODE; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -973,6 +1139,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, * ****************************************************************************/ +/* + * This function adds a protection domain to the global protection domain list + */ +static void add_domain_to_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_add(&domain->list, &amd_iommu_pd_list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +/* + * This function removes a protection domain to the global + * protection domain list + */ +static void del_domain_from_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_del(&domain->list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + static u16 domain_id_alloc(void) { unsigned long flags; @@ -1000,26 +1191,6 @@ static void domain_id_free(int id) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } -/* - * Used to reserve address ranges in the aperture (e.g. for exclusion - * ranges. - */ -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages) -{ - unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - - if (start_page + pages > last_page) - pages = last_page - start_page; - - for (i = start_page; i < start_page + pages; ++i) { - int index = i / APERTURE_RANGE_PAGES; - int page = i % APERTURE_RANGE_PAGES; - __set_bit(page, dom->aperture[index]->bitmap); - } -} - static void free_pagetable(struct protection_domain *domain) { int i, j; @@ -1061,6 +1232,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) if (!dom) return; + del_domain_from_list(&dom->domain); + free_pagetable(&dom->domain); for (i = 0; i < APERTURE_MAX_RANGES; ++i) { @@ -1078,7 +1251,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) +static struct dma_ops_domain *dma_ops_domain_alloc(void) { struct dma_ops_domain *dma_dom; @@ -1091,6 +1264,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->domain.id = domain_id_alloc(); if (dma_dom->domain.id == 0) goto free_dma_dom; + INIT_LIST_HEAD(&dma_dom->domain.dev_list); dma_dom->domain.mode = PAGE_MODE_2_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); dma_dom->domain.flags = PD_DMA_OPS_MASK; @@ -1101,7 +1275,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; - if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) + add_domain_to_list(&dma_dom->domain); + + if (alloc_new_range(dma_dom, true, GFP_KERNEL)) goto free_dma_dom; /* @@ -1129,22 +1305,6 @@ static bool dma_ops_domain(struct protection_domain *domain) return domain->flags & PD_DMA_OPS_MASK; } -/* - * Find out the protection domain structure for a given PCI device. This - * will give us the pointer to the page table root for example. - */ -static struct protection_domain *domain_for_device(u16 devid) -{ - struct protection_domain *dom; - unsigned long flags; - - read_lock_irqsave(&amd_iommu_devtable_lock, flags); - dom = amd_iommu_pd_table[devid]; - read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - - return dom; -} - static void set_dte_entry(u16 devid, struct protection_domain *domain) { u64 pte_root = virt_to_phys(domain->pt_root); @@ -1156,42 +1316,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) amd_iommu_dev_table[devid].data[2] = domain->id; amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); +} + +static void clear_dte_entry(u16 devid) +{ + /* remove entry from the device table seen by the hardware */ + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[2] = 0; - amd_iommu_pd_table[devid] = domain; + amd_iommu_apply_erratum_63(devid); +} + +static void do_attach(struct device *dev, struct protection_domain *domain) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + + /* Update data structures */ + dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); + set_dte_entry(devid, domain); + + /* Do reference counting */ + domain->dev_iommu[iommu->index] += 1; + domain->dev_cnt += 1; + + /* Flush the DTE entry */ + iommu_flush_device(dev); +} + +static void do_detach(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + + /* decrease reference counters */ + dev_data->domain->dev_iommu[iommu->index] -= 1; + dev_data->domain->dev_cnt -= 1; + + /* Update data structures */ + dev_data->domain = NULL; + list_del(&dev_data->list); + clear_dte_entry(devid); + + /* Flush the DTE entry */ + iommu_flush_device(dev); } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void __attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int __attach_device(struct device *dev, + struct protection_domain *domain) { + struct iommu_dev_data *dev_data, *alias_data; + + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + + if (!alias_data) + return -EINVAL; + /* lock domain */ spin_lock(&domain->lock); - /* update DTE entry */ - set_dte_entry(devid, domain); + /* Some sanity checks */ + if (alias_data->domain != NULL && + alias_data->domain != domain) + return -EBUSY; - domain->dev_cnt += 1; + if (dev_data->domain != NULL && + dev_data->domain != domain) + return -EBUSY; + + /* Do real assignment */ + if (dev_data->alias != dev) { + alias_data = get_dev_data(dev_data->alias); + if (alias_data->domain == NULL) + do_attach(dev_data->alias, domain); + + atomic_inc(&alias_data->bind); + } + + if (dev_data->domain == NULL) + do_attach(dev, domain); + + atomic_inc(&dev_data->bind); /* ready */ spin_unlock(&domain->lock); + + return 0; } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int attach_device(struct device *dev, + struct protection_domain *domain) { unsigned long flags; + int ret; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __attach_device(iommu, domain, devid); + ret = __attach_device(dev, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* @@ -1199,98 +1440,125 @@ static void attach_device(struct amd_iommu *iommu, * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. */ - iommu_queue_inv_dev_entry(iommu, devid); - iommu_flush_tlb_pde(iommu, domain->id); + iommu_flush_tlb_pde(domain); + + return ret; } /* * Removes a device from a protection domain (unlocked) */ -static void __detach_device(struct protection_domain *domain, u16 devid) +static void __detach_device(struct device *dev) { + struct iommu_dev_data *dev_data = get_dev_data(dev); + struct iommu_dev_data *alias_data; + unsigned long flags; - /* lock domain */ - spin_lock(&domain->lock); - - /* remove domain from the lookup table */ - amd_iommu_pd_table[devid] = NULL; + BUG_ON(!dev_data->domain); - /* remove entry from the device table seen by the hardware */ - amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[1] = 0; - amd_iommu_dev_table[devid].data[2] = 0; + spin_lock_irqsave(&dev_data->domain->lock, flags); - amd_iommu_apply_erratum_63(devid); + if (dev_data->alias != dev) { + alias_data = get_dev_data(dev_data->alias); + if (atomic_dec_and_test(&alias_data->bind)) + do_detach(dev_data->alias); + } - /* decrease reference counter */ - domain->dev_cnt -= 1; + if (atomic_dec_and_test(&dev_data->bind)) + do_detach(dev); - /* ready */ - spin_unlock(&domain->lock); + spin_unlock_irqrestore(&dev_data->domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the * passthrough domain if it is detached from any other domain */ - if (iommu_pass_through) { - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - __attach_device(iommu, pt_domain, devid); - } + if (iommu_pass_through && dev_data->domain == NULL) + __attach_device(dev, pt_domain); } /* * Removes a device from a protection domain (with devtable_lock held) */ -static void detach_device(struct protection_domain *domain, u16 devid) +static void detach_device(struct device *dev) { unsigned long flags; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __detach_device(domain, devid); + __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. + */ +static struct protection_domain *domain_for_device(struct device *dev) +{ + struct protection_domain *dom; + struct iommu_dev_data *dev_data, *alias_data; + unsigned long flags; + u16 devid, alias; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + if (!alias_data) + return NULL; + + read_lock_irqsave(&amd_iommu_devtable_lock, flags); + dom = dev_data->domain; + if (dom == NULL && + alias_data->domain != NULL) { + __attach_device(dev, alias_data->domain); + dom = alias_data->domain; + } + + read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return dom; +} + static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct pci_dev *pdev = to_pci_dev(dev); - u16 devid = calc_devid(pdev->bus->number, pdev->devfn); + u16 devid; struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; unsigned long flags; - if (devid > amd_iommu_last_bdf) - goto out; - - devid = amd_iommu_alias_table[devid]; - - iommu = amd_iommu_rlookup_table[devid]; - if (iommu == NULL) - goto out; - - domain = domain_for_device(devid); + if (!check_device(dev)) + return 0; - if (domain && !dma_ops_domain(domain)) - WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " - "to a non-dma-ops domain\n", dev_name(dev)); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: + + domain = domain_for_device(dev); + if (!domain) goto out; if (iommu_pass_through) break; - detach_device(domain, devid); + detach_device(dev); break; case BUS_NOTIFY_ADD_DEVICE: + + iommu_init_device(dev); + + domain = domain_for_device(dev); + /* allocate a protection domain if a device is added */ dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu); + dma_domain = dma_ops_domain_alloc(); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -1300,11 +1568,15 @@ static int device_change_notifier(struct notifier_block *nb, spin_unlock_irqrestore(&iommu_pd_list_lock, flags); break; + case BUS_NOTIFY_DEL_DEVICE: + + iommu_uninit_device(dev); + default: goto out; } - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); out: @@ -1322,106 +1594,46 @@ static struct notifier_block device_nb = { *****************************************************************************/ /* - * This function checks if the driver got a valid device from the caller to - * avoid dereferencing invalid pointers. - */ -static bool check_device(struct device *dev) -{ - if (!dev || !dev->dma_mask) - return false; - - return true; -} - -/* - * In this function the list of preallocated protection domains is traversed to - * find the domain for a specific device - */ -static struct dma_ops_domain *find_protection_domain(u16 devid) -{ - struct dma_ops_domain *entry, *ret = NULL; - unsigned long flags; - - if (list_empty(&iommu_pd_list)) - return NULL; - - spin_lock_irqsave(&iommu_pd_list_lock, flags); - - list_for_each_entry(entry, &iommu_pd_list, list) { - if (entry->target_dev == devid) { - ret = entry; - break; - } - } - - spin_unlock_irqrestore(&iommu_pd_list_lock, flags); - - return ret; -} - -/* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the * requestor id for a given device. * If the device is not yet associated with a domain this is also done * in this function. */ -static int get_device_resources(struct device *dev, - struct amd_iommu **iommu, - struct protection_domain **domain, - u16 *bdf) +static struct protection_domain *get_domain(struct device *dev) { + struct protection_domain *domain; struct dma_ops_domain *dma_dom; - struct pci_dev *pcidev; - u16 _bdf; - - *iommu = NULL; - *domain = NULL; - *bdf = 0xffff; - - if (dev->bus != &pci_bus_type) - return 0; + u16 devid = get_device_id(dev); - pcidev = to_pci_dev(dev); - _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); + if (!check_device(dev)) + return ERR_PTR(-EINVAL); - /* device not translated by any IOMMU in the system? */ - if (_bdf > amd_iommu_last_bdf) - return 0; + domain = domain_for_device(dev); + if (domain != NULL && !dma_ops_domain(domain)) + return ERR_PTR(-EBUSY); - *bdf = amd_iommu_alias_table[_bdf]; + if (domain != NULL) + return domain; - *iommu = amd_iommu_rlookup_table[*bdf]; - if (*iommu == NULL) - return 0; - *domain = domain_for_device(*bdf); - if (*domain == NULL) { - dma_dom = find_protection_domain(*bdf); - if (!dma_dom) - dma_dom = (*iommu)->default_dom; - *domain = &dma_dom->domain; - attach_device(*iommu, *domain, *bdf); - DUMP_printk("Using protection domain %d for device %s\n", - (*domain)->id, dev_name(dev)); - } - - if (domain_for_device(_bdf) == NULL) - attach_device(*iommu, *domain, _bdf); + /* Device not bount yet - bind it */ + dma_dom = find_protection_domain(devid); + if (!dma_dom) + dma_dom = amd_iommu_rlookup_table[devid]->default_dom; + attach_device(dev, &dma_dom->domain); + DUMP_printk("Using protection domain %d for device %s\n", + dma_dom->domain.id, dev_name(dev)); - return 1; + return &dma_dom->domain; } static void update_device_table(struct protection_domain *domain) { - unsigned long flags; - int i; + struct iommu_dev_data *dev_data; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (amd_iommu_pd_table[i] != domain) - continue; - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - set_dte_entry(i, domain); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + list_for_each_entry(dev_data, &domain->dev_list, list) { + u16 devid = get_device_id(dev_data->dev); + set_dte_entry(devid, domain); } } @@ -1431,76 +1643,13 @@ static void update_domain(struct protection_domain *domain) return; update_device_table(domain); - flush_devices_by_domain(domain); - iommu_flush_domain(domain->id); + iommu_flush_domain_devices(domain); + iommu_flush_tlb_pde(domain); domain->updated = false; } /* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. - */ -static bool increase_address_space(struct protection_domain *domain, - gfp_t gfp) -{ - u64 *pte; - - if (domain->mode == PAGE_MODE_6_LEVEL) - /* address space already 64 bit large */ - return false; - - pte = (void *)get_zeroed_page(gfp); - if (!pte) - return false; - - *pte = PM_LEVEL_PDE(domain->mode, - virt_to_phys(domain->pt_root)); - domain->pt_root = pte; - domain->mode += 1; - domain->updated = true; - - return true; -} - -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, - int end_lvl, - u64 **pte_page, - gfp_t gfp) -{ - u64 *pte, *page; - int level; - - while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > end_lvl) { - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); - } - - level -= 1; - - pte = IOMMU_PTE_PAGE(*pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* * This function fetches the PTE for a given address in the aperture */ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, @@ -1530,8 +1679,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. */ -static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom, unsigned long address, phys_addr_t paddr, int direction) @@ -1544,7 +1692,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, pte = dma_ops_get_pte(dom, address); if (!pte) - return bad_dma_address; + return DMA_ERROR_CODE; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1565,8 +1713,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, /* * The generic unmapping function for on page in the DMA address space. */ -static void dma_ops_domain_unmap(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static void dma_ops_domain_unmap(struct dma_ops_domain *dom, unsigned long address) { struct aperture_range *aperture; @@ -1597,7 +1744,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, * Must be called with the domain lock held. */ static dma_addr_t __map_single(struct device *dev, - struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, phys_addr_t paddr, size_t size, @@ -1625,7 +1771,7 @@ static dma_addr_t __map_single(struct device *dev, retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) { + if (unlikely(address == DMA_ERROR_CODE)) { /* * setting next_address here will let the address * allocator only scan the new allocated range in the @@ -1633,7 +1779,7 @@ retry: */ dma_dom->next_address = dma_dom->aperture_size; - if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + if (alloc_new_range(dma_dom, false, GFP_ATOMIC)) goto out; /* @@ -1645,8 +1791,8 @@ retry: start = address; for (i = 0; i < pages; ++i) { - ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); - if (ret == bad_dma_address) + ret = dma_ops_domain_map(dma_dom, start, paddr, dir); + if (ret == DMA_ERROR_CODE) goto out_unmap; paddr += PAGE_SIZE; @@ -1657,10 +1803,10 @@ retry: ADD_STATS_COUNTER(alloced_io_mem, size); if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - iommu_flush_tlb(iommu, dma_dom->domain.id); + iommu_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; - } else if (unlikely(iommu_has_npcache(iommu))) - iommu_flush_pages(iommu, dma_dom->domain.id, address, size); + } else if (unlikely(amd_iommu_np_cache)) + iommu_flush_pages(&dma_dom->domain, address, size); out: return address; @@ -1669,20 +1815,19 @@ out_unmap: for (--i; i >= 0; --i) { start -= PAGE_SIZE; - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); } dma_ops_free_addresses(dma_dom, address, pages); - return bad_dma_address; + return DMA_ERROR_CODE; } /* * Does the reverse of the __map_single function. Must be called with * the domain lock held too */ -static void __unmap_single(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static void __unmap_single(struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, size_t size, int dir) @@ -1690,7 +1835,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_addr_t i, start; unsigned int pages; - if ((dma_addr == bad_dma_address) || + if ((dma_addr == DMA_ERROR_CODE) || (dma_addr + size > dma_dom->aperture_size)) return; @@ -1699,7 +1844,7 @@ static void __unmap_single(struct amd_iommu *iommu, start = dma_addr; for (i = 0; i < pages; ++i) { - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); start += PAGE_SIZE; } @@ -1708,7 +1853,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); + iommu_flush_pages(&dma_dom->domain, dma_addr, size); dma_dom->need_flush = false; } } @@ -1722,36 +1867,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; dma_addr_t addr; u64 dma_mask; phys_addr_t paddr = page_to_phys(page) + offset; INC_STATS_COUNTER(cnt_map_single); - if (!check_device(dev)) - return bad_dma_address; - - dma_mask = *dev->dma_mask; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (iommu == NULL || domain == NULL) - /* device not handled by any AMD IOMMU */ + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) return (dma_addr_t)paddr; + else if (IS_ERR(domain)) + return DMA_ERROR_CODE; - if (!dma_ops_domain(domain)) - return bad_dma_address; + dma_mask = *dev->dma_mask; spin_lock_irqsave(&domain->lock, flags); - addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + + addr = __map_single(dev, domain->priv, paddr, size, dir, false, dma_mask); - if (addr == bad_dma_address) + if (addr == DMA_ERROR_CODE) goto out; - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1766,25 +1904,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_unmap_single); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) - /* device not handled by any AMD IOMMU */ - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, dir); + __unmap_single(domain->priv, dma_addr, size, dir); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1816,9 +1948,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; int i; struct scatterlist *s; phys_addr_t paddr; @@ -1827,25 +1957,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); - if (!check_device(dev)) + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) + return map_sg_no_iommu(dev, sglist, nelems, dir); + else if (IS_ERR(domain)) return 0; dma_mask = *dev->dma_mask; - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) - return map_sg_no_iommu(dev, sglist, nelems, dir); - - if (!dma_ops_domain(domain)) - return 0; - spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { paddr = sg_phys(s); - s->dma_address = __map_single(dev, iommu, domain->priv, + s->dma_address = __map_single(dev, domain->priv, paddr, s->length, dir, false, dma_mask); @@ -1856,7 +1981,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1865,7 +1990,7 @@ out: unmap: for_each_sg(sglist, s, mapped_elems, i) { if (s->dma_address) - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } @@ -1884,30 +2009,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; struct scatterlist *s; - u16 devid; int i; INC_STATS_COUNTER(cnt_unmap_sg); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1920,49 +2040,44 @@ static void *alloc_coherent(struct device *dev, size_t size, { unsigned long flags; void *virt_addr; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; phys_addr_t paddr; u64 dma_mask = dev->coherent_dma_mask; INC_STATS_COUNTER(cnt_alloc_coherent); - if (!check_device(dev)) + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) { + virt_addr = (void *)__get_free_pages(flag, get_order(size)); + *dma_addr = __pa(virt_addr); + return virt_addr; + } else if (IS_ERR(domain)) return NULL; - if (!get_device_resources(dev, &iommu, &domain, &devid)) - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + dma_mask = dev->coherent_dma_mask; + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + flag |= __GFP_ZERO; - flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return NULL; paddr = virt_to_phys(virt_addr); - if (!iommu || !domain) { - *dma_addr = (dma_addr_t)paddr; - return virt_addr; - } - - if (!dma_ops_domain(domain)) - goto out_free; - if (!dma_mask) dma_mask = *dev->dma_mask; spin_lock_irqsave(&domain->lock, flags); - *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + *dma_addr = __map_single(dev, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) { + if (*dma_addr == DMA_ERROR_CODE) { spin_unlock_irqrestore(&domain->lock, flags); goto out_free; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -1982,28 +2097,19 @@ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_free_coherent); - if (!check_device(dev)) - return; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) - goto free_mem; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) goto free_mem; spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); + __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2017,22 +2123,7 @@ free_mem: */ static int amd_iommu_dma_supported(struct device *dev, u64 mask) { - u16 bdf; - struct pci_dev *pcidev; - - /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) - return 0; - - pcidev = to_pci_dev(dev); - - bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* Out of our scope? */ - if (bdf > amd_iommu_last_bdf) - return 0; - - return 1; + return check_device(dev); } /* @@ -2046,25 +2137,30 @@ static void prealloc_protection_domains(void) { struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; - struct amd_iommu *iommu; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) - continue; - devid = amd_iommu_alias_table[devid]; - if (domain_for_device(devid)) + + /* Do we handle this device? */ + if (!check_device(&dev->dev)) continue; - iommu = amd_iommu_rlookup_table[devid]; - if (!iommu) + + iommu_init_device(&dev->dev); + + /* Is there already any domain for it? */ + if (domain_for_device(&dev->dev)) continue; - dma_dom = dma_ops_domain_alloc(iommu); + + devid = get_device_id(&dev->dev); + + dma_dom = dma_ops_domain_alloc(); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); dma_dom->target_dev = devid; + attach_device(&dev->dev, &dma_dom->domain); + list_add_tail(&dma_dom->list, &iommu_pd_list); } } @@ -2093,7 +2189,7 @@ int __init amd_iommu_init_dma_ops(void) * protection domain will be assigned to the default one. */ for_each_iommu(iommu) { - iommu->default_dom = dma_ops_domain_alloc(iommu); + iommu->default_dom = dma_ops_domain_alloc(); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; @@ -2103,15 +2199,12 @@ int __init amd_iommu_init_dma_ops(void) } /* - * If device isolation is enabled, pre-allocate the protection - * domains for each device. + * Pre-allocate the protection domains for each device. */ - if (amd_iommu_isolate) - prealloc_protection_domains(); + prealloc_protection_domains(); iommu_detected = 1; - force_iommu = 1; - bad_dma_address = 0; + swiotlb = 0; #ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; @@ -2150,14 +2243,17 @@ free_domains: static void cleanup_domain(struct protection_domain *domain) { + struct iommu_dev_data *dev_data, *next; unsigned long flags; - u16 devid; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) - if (amd_iommu_pd_table[devid] == domain) - __detach_device(domain, devid); + list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { + struct device *dev = dev_data->dev; + + do_detach(dev); + atomic_set(&dev_data->bind, 0); + } write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } @@ -2167,6 +2263,8 @@ static void protection_domain_free(struct protection_domain *domain) if (!domain) return; + del_domain_from_list(domain); + if (domain->id) domain_id_free(domain->id); @@ -2185,6 +2283,9 @@ static struct protection_domain *protection_domain_alloc(void) domain->id = domain_id_alloc(); if (!domain->id) goto out_err; + INIT_LIST_HEAD(&domain->dev_list); + + add_domain_to_list(domain); return domain; @@ -2241,26 +2342,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { - struct protection_domain *domain = dom->priv; + struct iommu_dev_data *dev_data = dev->archdata.iommu; struct amd_iommu *iommu; - struct pci_dev *pdev; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return; - pdev = to_pci_dev(dev); - - devid = calc_devid(pdev->bus->number, pdev->devfn); + devid = get_device_id(dev); - if (devid > 0) - detach_device(domain, devid); + if (dev_data->domain != NULL) + detach_device(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) return; - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); } @@ -2268,35 +2366,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { struct protection_domain *domain = dom->priv; - struct protection_domain *old_domain; + struct iommu_dev_data *dev_data; struct amd_iommu *iommu; - struct pci_dev *pdev; + int ret; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return -EINVAL; - pdev = to_pci_dev(dev); + dev_data = dev->archdata.iommu; - devid = calc_devid(pdev->bus->number, pdev->devfn); - - if (devid >= amd_iommu_last_bdf || - devid != amd_iommu_alias_table[devid]) - return -EINVAL; + devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) return -EINVAL; - old_domain = domain_for_device(devid); - if (old_domain) - detach_device(old_domain, devid); + if (dev_data->domain) + detach_device(dev); - attach_device(iommu, domain, devid); + ret = attach_device(dev, domain); iommu_completion_wait(iommu); - return 0; + return ret; } static int amd_iommu_map_range(struct iommu_domain *dom, @@ -2342,7 +2435,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova += PAGE_SIZE; } - iommu_flush_domain(domain->id); + iommu_flush_tlb_pde(domain); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, @@ -2393,8 +2486,9 @@ static struct iommu_ops amd_iommu_ops = { int __init amd_iommu_init_passthrough(void) { + struct amd_iommu *iommu; struct pci_dev *dev = NULL; - u16 devid, devid2; + u16 devid; /* allocate passthroug domain */ pt_domain = protection_domain_alloc(); @@ -2404,20 +2498,17 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - struct amd_iommu *iommu; - devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) + if (!check_device(&dev->dev)) continue; - devid2 = amd_iommu_alias_table[devid]; + devid = get_device_id(&dev->dev); - iommu = amd_iommu_rlookup_table[devid2]; + iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - __attach_device(iommu, pt_domain, devid); - __attach_device(iommu, pt_domain, devid2); + attach_device(&dev->dev, pt_domain); } pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c20001e4f55..7ffc3996523 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -25,10 +25,12 @@ #include <linux/interrupt.h> #include <linux/msi.h> #include <asm/pci-direct.h> +#include <asm/amd_iommu_proto.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> #include <asm/iommu.h> #include <asm/gart.h> +#include <asm/x86_init.h> /* * definitions for the ACPI scanning code @@ -123,18 +125,24 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -#ifdef CONFIG_IOMMU_STRESS -bool amd_iommu_isolate = false; -#else -bool amd_iommu_isolate = true; /* if true, device isolation is - enabled */ -#endif - bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ +/* Array to assign indices to IOMMUs*/ +struct amd_iommu *amd_iommus[MAX_IOMMUS]; +int amd_iommus_present; + +/* IOMMUs have a non-present cache? */ +bool amd_iommu_np_cache __read_mostly; + +/* + * List of protection domains - used during resume + */ +LIST_HEAD(amd_iommu_pd_list); +spinlock_t amd_iommu_pd_lock; + /* * Pointer to the device table which is shared by all AMD IOMMUs * it is indexed by the PCI device id or the HT unit id and contains @@ -157,12 +165,6 @@ u16 *amd_iommu_alias_table; struct amd_iommu **amd_iommu_rlookup_table; /* - * The pd table (protection domain table) is used to find the protection domain - * data structure a device belongs to. Indexed with the PCI device id too. - */ -struct protection_domain **amd_iommu_pd_table; - -/* * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap * to know which ones are already in use. */ @@ -838,7 +840,18 @@ static void __init free_iommu_all(void) static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); + + /* Add IOMMU to internal data structures */ list_add_tail(&iommu->list, &amd_iommu_list); + iommu->index = amd_iommus_present++; + + if (unlikely(iommu->index >= MAX_IOMMUS)) { + WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n"); + return -ENOSYS; + } + + /* Index is fine - add IOMMU to the array */ + amd_iommus[iommu->index] = iommu; /* * Copy data from ACPI table entry to the iommu struct @@ -868,6 +881,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) + amd_iommu_np_cache = true; + return pci_enable_device(iommu->dev); } @@ -925,7 +941,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msi(struct amd_iommu *iommu) +static int iommu_setup_msi(struct amd_iommu *iommu) { int r; @@ -1176,19 +1192,10 @@ static struct sys_device device_amd_iommu = { * functions. Finally it prints some information about AMD IOMMUs and * the driver state and enables the hardware. */ -int __init amd_iommu_init(void) +static int __init amd_iommu_init(void) { int i, ret = 0; - - if (no_iommu) { - printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); - return 0; - } - - if (!amd_iommu_detected) - return -ENODEV; - /* * First parse ACPI tables to find the largest Bus/Dev/Func * we need to handle. Upon this information the shared data @@ -1225,15 +1232,6 @@ int __init amd_iommu_init(void) if (amd_iommu_rlookup_table == NULL) goto free; - /* - * Protection Domain table - maps devices to protection domains - * This table has the same size as the rlookup_table - */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(rlookup_table_size)); - if (amd_iommu_pd_table == NULL) - goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); @@ -1255,6 +1253,8 @@ int __init amd_iommu_init(void) */ amd_iommu_pd_alloc_bitmap[0] = 1; + spin_lock_init(&amd_iommu_pd_lock); + /* * now the data structures are allocated and basically initialized * start the real acpi table scan @@ -1286,17 +1286,12 @@ int __init amd_iommu_init(void) if (iommu_pass_through) goto out; - printk(KERN_INFO "AMD-Vi: device isolation "); - if (amd_iommu_isolate) - printk("enabled\n"); - else - printk("disabled\n"); - if (amd_iommu_unmap_flush) printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); else printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + x86_platform.iommu_shutdown = disable_iommus; out: return ret; @@ -1304,9 +1299,6 @@ free: free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); - free_pages((unsigned long)amd_iommu_rlookup_table, get_order(rlookup_table_size)); @@ -1323,11 +1315,6 @@ free: goto out; } -void amd_iommu_shutdown(void) -{ - disable_iommus(); -} - /**************************************************************************** * * Early detect code. This code runs at IOMMU detection time in the DMA @@ -1342,16 +1329,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) + if (no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; amd_iommu_detected = 1; -#ifdef CONFIG_GART_IOMMU - gart_iommu_aperture_disabled = 1; - gart_iommu_aperture = 0; -#endif + x86_init.iommu.iommu_init = amd_iommu_init; } } @@ -1372,10 +1356,6 @@ static int __init parse_amd_iommu_dump(char *str) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strncmp(str, "isolate", 7) == 0) - amd_iommu_isolate = true; - if (strncmp(str, "share", 5) == 0) - amd_iommu_isolate = false; if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 128111d8ffe..e0dfb6856aa 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -28,6 +28,7 @@ #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/k8.h> +#include <asm/x86_init.h> int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; @@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void) iommu_detected = 1; gart_iommu_aperture = 1; + x86_init.iommu.iommu_init = gart_iommu_init; aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; @@ -456,7 +458,7 @@ out: if (aper_alloc) { /* Got the aperture from the AGP bridge */ - } else if (swiotlb && !valid_agp) { + } else if (!valid_agp) { /* Do nothing */ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 5e409dc298a..a4849c10a77 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,8 +27,7 @@ #include <asm/cpu.h> #include <asm/reboot.h> #include <asm/virtext.h> -#include <asm/iommu.h> - +#include <asm/x86_init.h> #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif crash_save_cpu(regs, safe_smp_processor_id()); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 971a3bec47a..c563e4c8ff3 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -46,6 +46,7 @@ #include <asm/dma.h> #include <asm/rio.h> #include <asm/bios_ebda.h> +#include <asm/x86_init.h> #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; @@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev, if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else - return bad_dma_address; + return DMA_ERROR_CODE; } } @@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { unsigned long entry; - dma_addr_t ret = bad_dma_address; + dma_addr_t ret; entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == bad_dma_address)) - goto error; + if (unlikely(entry == DMA_ERROR_CODE)) { + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); + return DMA_ERROR_CODE; + } /* set the return dma address */ ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); @@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* put the TCEs in the HW table */ tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, direction); - return ret; - -error: - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); - return bad_dma_address; } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, @@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long flags; /* were we called with bad_dma_address? */ - badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); return; @@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev) pdev = to_pci_dev(dev); + /* search up the device tree for an iommu */ pbus = pdev->bus; - - /* is the device behind a bridge? Look for the root bus */ - while (pbus->parent) + do { + tbl = pci_iommu(pbus); + if (tbl && tbl->it_busno == pbus->number) + break; + tbl = NULL; pbus = pbus->parent; - - tbl = pci_iommu(pbus); + } while (pbus); BUG_ON(tbl && (tbl->it_busno != pbus->number)); @@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); - if (entry == bad_dma_address) { + if (entry == DMA_ERROR_CODE) { /* makes sure unmap knows to stop */ s->dma_length = 0; goto error; @@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, error: calgary_unmap_sg(dev, sg, nelems, dir, NULL); for_each_sg(sg, s, nelems, i) { - sg->dma_address = bad_dma_address; + sg->dma_address = DMA_ERROR_CODE; sg->dma_length = 0; } return 0; @@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, /* set up tces to cover the allocated range */ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) + if (mapping == DMA_ERROR_CODE) goto free; *dma_handle = mapping; return ret; @@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ - iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); + iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ /* for CalIOC2 - avoid the entire first MB */ @@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void) return; } +static int __init calgary_iommu_init(void) +{ + int ret; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + return ret; + } + + return 0; +} + void __init detect_calgary(void) { int bus; @@ -1357,7 +1374,7 @@ void __init detect_calgary(void) * if the user specified iommu=off or iommu=soft or we found * another HW IOMMU already, bail out. */ - if (swiotlb || no_iommu || iommu_detected) + if (no_iommu || iommu_detected) return; if (!use_calgary) @@ -1442,9 +1459,7 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", specified_table_size); - /* swiotlb for devices that aren't behind the Calgary. */ - if (max_pfn > MAX_DMA32_PFN) - swiotlb = 1; + x86_init.iommu.iommu_init = calgary_iommu_init; } return; @@ -1457,35 +1472,6 @@ cleanup: } } -int __init calgary_iommu_init(void) -{ - int ret; - - if (no_iommu || (swiotlb && !calgary_detected)) - return -ENODEV; - - if (!calgary_detected) - return -ENODEV; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - return ret; - } - - force_iommu = 1; - bad_dma_address = 0x0; - /* dma_ops is set to swiotlb or nommu */ - if (!dma_ops) - dma_ops = &nommu_dma_ops; - - return 0; -} - static int __init calgary_parse_options(char *p) { unsigned int bridge; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a6e804d16c3..afcc58b69c7 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,10 +11,11 @@ #include <asm/gart.h> #include <asm/calgary.h> #include <asm/amd_iommu.h> +#include <asm/x86_init.h> static int forbid_dac __read_mostly; -struct dma_map_ops *dma_ops; +struct dma_map_ops *dma_ops = &nommu_dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -42,9 +43,6 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; -dma_addr_t bad_dma_address __read_mostly = 0; -EXPORT_SYMBOL(bad_dma_address); - /* Dummy device used for NULL arguments (normally ISA). */ struct device x86_dma_fallback_dev = { .init_name = "fallback device", @@ -126,20 +124,17 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif + if (pci_swiotlb_init()) + return; - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ gart_iommu_hole_init(); detect_calgary(); detect_intel_iommu(); + /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); - - pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, @@ -214,7 +209,7 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; + forbid_dac = 1; if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; @@ -289,25 +284,17 @@ static int __init pci_iommu_init(void) #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); #endif + x86_init.iommu.iommu_init(); - calgary_iommu_init(); - - intel_iommu_init(); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else + swiotlb_free(); - amd_iommu_init(); - - gart_iommu_init(); - - no_iommu_init(); return 0; } - -void pci_iommu_shutdown(void) -{ - gart_iommu_shutdown(); - - amd_iommu_shutdown(); -} /* Must execute after PCI subsystem */ rootfs_initcall(pci_iommu_init); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a7f1b64f86e..e6a0d402f17 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -39,6 +39,7 @@ #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/k8.h> +#include <asm/x86_init.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -46,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */ static u32 *iommu_gatt_base; /* Remapping table */ +static dma_addr_t bad_dma_addr; + /* * If this is disabled the IOMMU will use an optimized flushing strategy * of only flushing when an mapping is reused. With it true the GART is @@ -92,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size, base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, + boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); @@ -216,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, if (panic_on_overflow) panic("dma_map_area overflow %lu bytes\n", size); iommu_full(dev, size, dir); - return bad_dma_address; + return bad_dma_addr; } for (i = 0; i < npages; i++) { @@ -294,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, int i; #ifdef CONFIG_IOMMU_DEBUG - printk(KERN_DEBUG "dma_map_sg overflow\n"); + pr_debug("dma_map_sg overflow\n"); #endif for_each_sg(sg, s, nents, i) { @@ -302,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir, 0); - if (addr == bad_dma_address) { + if (addr == bad_dma_addr) { if (i > 0) gart_unmap_sg(dev, sg, i, dir, NULL); nents = 0; @@ -389,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (!dev) dev = &x86_dma_fallback_dev; - out = 0; - start = 0; - start_sg = sgmap = sg; - seg_size = 0; - max_seg_size = dma_get_max_seg_size(dev); - ps = NULL; /* shut up gcc */ + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); @@ -417,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, sgmap, pages, need) < 0) goto error; out++; - seg_size = 0; - sgmap = sg_next(sgmap); - pages = 0; - start = i; - start_sg = s; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; } } @@ -455,7 +461,7 @@ error: iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) - s->dma_address = bad_dma_address; + s->dma_address = bad_dma_addr; return 0; } @@ -479,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, DMA_BIDIRECTIONAL, align_mask); flush_gart(); - if (paddr != bad_dma_address) { + if (paddr != bad_dma_addr) { *dma_addr = paddr; return page_address(page); } @@ -499,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return (dma_addr == bad_dma_addr); +} + static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -515,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - printk(KERN_WARNING + pr_warning( "PCI-DMA: Warning: Small IOMMU %luMB." " Consider increasing the AGP aperture in BIOS\n", iommu_size >> 20); @@ -570,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc) aperture_alloc = aper_alloc; } -static int gart_resume(struct sys_device *dev) +static void gart_fixup_northbridges(struct sys_device *dev) { - printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); + int i; - if (fix_up_north_bridges) { - int i; + if (!fix_up_north_bridges) + return; - printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); + pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < num_k8_northbridges; i++) { + struct pci_dev *dev = k8_northbridges[i]; - /* - * Don't enable translations just yet. That is the next - * step. Restore the pre-suspend aperture settings. - */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, - aperture_order << 1); - pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, - aperture_alloc >> 25); - } + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } +} + +static int gart_resume(struct sys_device *dev) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(dev); enable_gart_translations(); @@ -604,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state) } static struct sysdev_class gart_sysdev_class = { - .name = "gart", - .suspend = gart_suspend, - .resume = gart_resume, + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, }; static struct sys_device device_gart = { - .id = 0, - .cls = &gart_sysdev_class, + .cls = &gart_sysdev_class, }; /* @@ -627,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) void *gatt; int i, error; - printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); + pr_info("PCI-DMA: Disabling AGP.\n"); + aper_size = aper_base = info->aper_size = 0; dev = NULL; for (i = 0; i < num_k8_northbridges; i++) { @@ -645,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) } if (!aper_base) goto nommu; + info->aper_base = aper_base; info->aper_size = aper_size >> 20; @@ -667,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info) flush_gart(); - printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); return 0; nommu: /* Should not happen anymore */ - printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" "falling back to iommu=soft.\n"); return -1; } @@ -686,14 +702,15 @@ static struct dma_map_ops gart_dma_ops = { .unmap_page = gart_unmap_page, .alloc_coherent = gart_alloc_coherent, .free_coherent = gart_free_coherent, + .mapping_error = gart_mapping_error, }; -void gart_iommu_shutdown(void) +static void gart_iommu_shutdown(void) { struct pci_dev *dev; int i; - if (no_agp && (dma_ops != &gart_dma_ops)) + if (no_agp) return; for (i = 0; i < num_k8_northbridges; i++) { @@ -708,7 +725,7 @@ void gart_iommu_shutdown(void) } } -void __init gart_iommu_init(void) +int __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long iommu_start; @@ -718,7 +735,7 @@ void __init gart_iommu_init(void) long i; if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) - return; + return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; @@ -730,35 +747,28 @@ void __init gart_iommu_init(void) (agp_copy_info(agp_bridge, &info) < 0); #endif - if (swiotlb) - return; - - /* Did we detect a different HW IOMMU? */ - if (iommu_detected && !gart_iommu_aperture) - return; - if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - printk(KERN_WARNING "More than 4GB of memory " - "but GART IOMMU not available.\n"); - printk(KERN_WARNING "falling back to iommu=soft.\n"); + pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warning("falling back to iommu=soft.\n"); } - return; + return 0; } /* need to map that range */ - aper_size = info.aper_size << 20; - aper_base = info.aper_base; - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { start_pfn = (aper_base>>PAGE_SHIFT); init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); } - printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); + pr_info("PCI-DMA: using GART IOMMU.\n"); iommu_size = check_iommu_size(info.aper_base, aper_size); iommu_pages = iommu_size >> PAGE_SHIFT; @@ -773,8 +783,7 @@ void __init gart_iommu_init(void) ret = dma_debug_resize_entries(iommu_pages); if (ret) - printk(KERN_DEBUG - "PCI-DMA: Cannot trace all the entries\n"); + pr_debug("PCI-DMA: Cannot trace all the entries\n"); } #endif @@ -784,15 +793,14 @@ void __init gart_iommu_init(void) */ iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - agp_memory_reserved = iommu_size; - printk(KERN_INFO - "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", iommu_size >> 20); - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; - bad_dma_address = iommu_bus_base; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + agp_memory_reserved = iommu_size; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); /* * Unmap the IOMMU part of the GART. The alias of the page is @@ -814,7 +822,7 @@ void __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); - + /* * Now all caches are flushed and we can safely enable * GART hardware. Doing it early leaves the possibility @@ -838,6 +846,10 @@ void __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; + swiotlb = 0; + + return 0; } void __init gart_parse_options(char *p) @@ -856,7 +868,7 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush", 8)) + if (!strncmp(p, "fullflush", 9)) iommu_fullflush = 1; if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a3933d4330c..22be12b60a8 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) - return bad_dma_address; + return DMA_ERROR_CODE; flush_write_buffers(); return bus; } @@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = { .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, }; - -void __init no_iommu_init(void) -{ - if (dma_ops) - return; - - force_iommu = 0; /* no HW IOMMU */ - dma_ops = &nommu_dma_ops; -} diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index aaa6b7839f1..e3c0a66b9e7 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -42,18 +42,28 @@ static struct dma_map_ops swiotlb_dma_ops = { .dma_supported = NULL, }; -void __init pci_swiotlb_init(void) +/* + * pci_swiotlb_init - initialize swiotlb if necessary + * + * This returns non-zero if we are forced to use swiotlb (by the boot + * option). + */ +int __init pci_swiotlb_init(void) { + int use_swiotlb = swiotlb | swiotlb_force; + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) + if (!no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif if (swiotlb_force) swiotlb = 1; + if (swiotlb) { - printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_init(); + swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } + + return use_swiotlb; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f93078746e0..2b97fc5b124 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -23,7 +23,7 @@ # include <linux/ctype.h> # include <linux/mc146818rtc.h> #else -# include <asm/iommu.h> +# include <asm/x86_init.h> #endif /* @@ -622,7 +622,7 @@ void native_machine_shutdown(void) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 4449a4a2c2e..d11c5ff7c65 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,10 +14,13 @@ #include <asm/time.h> #include <asm/irq.h> #include <asm/tsc.h> +#include <asm/iommu.h> void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } +int __init iommu_init_noop(void) { return 0; } +void iommu_shutdown_noop(void) { } /* * The platform setup functions are preset with the default functions @@ -62,6 +65,10 @@ struct x86_init_ops x86_init __initdata = { .tsc_pre_init = x86_init_noop, .timer_init = hpet_time_init, }, + + .iommu = { + .iommu_init = iommu_init_noop, + }, }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { @@ -72,4 +79,5 @@ struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, + .iommu_shutdown = iommu_shutdown_noop, }; diff --git a/drivers/char/agp/Kconfig b/drivers/char/agp/Kconfig index ccb1fa89de2..2fb3a480f6b 100644 --- a/drivers/char/agp/Kconfig +++ b/drivers/char/agp/Kconfig @@ -56,9 +56,8 @@ config AGP_AMD X on AMD Irongate, 761, and 762 chipsets. config AGP_AMD64 - tristate "AMD Opteron/Athlon64 on-CPU GART support" if !GART_IOMMU + tristate "AMD Opteron/Athlon64 on-CPU GART support" depends on AGP && X86 - default y if GART_IOMMU help This option gives you AGP support for the GLX component of X using the on-CPU northbridge of the AMD Athlon64/Opteron CPUs. diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index b952ebc7a78..416f6ac65b7 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -645,10 +645,13 @@ void __init detect_intel_iommu(void) "x2apic and Intr-remapping.\n"); #endif #ifdef CONFIG_DMAR - if (ret && !no_iommu && !iommu_detected && !swiotlb && - !dmar_disabled) + if (ret && !no_iommu && !iommu_detected && !dmar_disabled) iommu_detected = 1; #endif +#ifdef CONFIG_X86 + if (ret) + x86_init.iommu.iommu_init = intel_iommu_init; +#endif } early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size); dmar_tbl = NULL; diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 1840a0578a4..9261327b49f 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3266,7 +3266,7 @@ int __init intel_iommu_init(void) * Check the need for DMA-remapping initialization now. * Above initialization will also be used by Interrupt-remapping. */ - if (no_iommu || swiotlb || dmar_disabled) + if (no_iommu || dmar_disabled) return -ENODEV; iommu_init_mempool(); @@ -3287,7 +3287,9 @@ int __init intel_iommu_init(void) "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n"); init_timer(&unmap_timer); - force_iommu = 1; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif dma_ops = &intel_dma_ops; init_iommu_sysfs(); diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index dd97fb8408a..b10ec49ee2d 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat, unsigned long addr, unsigned long size); extern void free_bootmem(unsigned long addr, unsigned long size); +extern void free_bootmem_late(unsigned long addr, unsigned long size); /* * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 4a2b162c256..5de4c9e5856 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -208,16 +208,9 @@ struct dmar_atsr_unit { u8 include_all:1; /* include all ports */ }; -/* Intel DMAR initialization functions */ extern int intel_iommu_init(void); -#else -static inline int intel_iommu_init(void) -{ -#ifdef CONFIG_INTR_REMAP - return dmar_dev_scope_init(); -#else - return -ENODEV; -#endif -} -#endif /* !CONFIG_DMAR */ +#else /* !CONFIG_DMAR: */ +static inline int intel_iommu_init(void) { return -ENODEV; } +#endif /* CONFIG_DMAR */ + #endif /* __DMAR_H__ */ diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 73b1f1cec42..febedcf67c7 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -7,6 +7,8 @@ struct device; struct dma_attrs; struct scatterlist; +extern int swiotlb_force; + /* * Maximum allowable number of contiguous slabs to map, * must be a power of 2. What is the appropriate value ? @@ -20,8 +22,7 @@ struct scatterlist; */ #define IO_TLB_SHIFT 11 -extern void -swiotlb_init(void); +extern void swiotlb_init(int verbose); extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -88,4 +89,11 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); extern int swiotlb_dma_supported(struct device *hwdev, u64 mask); +#ifdef CONFIG_SWIOTLB +extern void __init swiotlb_free(void); +#else +static inline void swiotlb_free(void) { } +#endif + +extern void swiotlb_print_info(void); #endif /* __LINUX_SWIOTLB_H */ diff --git a/lib/swiotlb.c b/lib/swiotlb.c index ac25cd28e80..795472d8ae2 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -97,6 +97,8 @@ static phys_addr_t *io_tlb_orig_addr; */ static DEFINE_SPINLOCK(io_tlb_lock); +static int late_alloc; + static int __init setup_io_tlb_npages(char *str) { @@ -109,6 +111,7 @@ setup_io_tlb_npages(char *str) ++str; if (!strcmp(str, "force")) swiotlb_force = 1; + return 1; } __setup("swiotlb=", setup_io_tlb_npages); @@ -121,8 +124,9 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, return phys_to_dma(hwdev, virt_to_phys(address)); } -static void swiotlb_print_info(unsigned long bytes) +void swiotlb_print_info(void) { + unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; phys_addr_t pstart, pend; pstart = virt_to_phys(io_tlb_start); @@ -140,7 +144,7 @@ static void swiotlb_print_info(unsigned long bytes) * structures for the software IO TLB used to implement the DMA API. */ void __init -swiotlb_init_with_default_size(size_t default_size) +swiotlb_init_with_default_size(size_t default_size, int verbose) { unsigned long i, bytes; @@ -176,14 +180,14 @@ swiotlb_init_with_default_size(size_t default_size) io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); if (!io_tlb_overflow_buffer) panic("Cannot allocate SWIOTLB overflow buffer!\n"); - - swiotlb_print_info(bytes); + if (verbose) + swiotlb_print_info(); } void __init -swiotlb_init(void) +swiotlb_init(int verbose) { - swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */ + swiotlb_init_with_default_size(64 * (1<<20), verbose); /* default to 64MB */ } /* @@ -260,7 +264,9 @@ swiotlb_late_init_with_default_size(size_t default_size) if (!io_tlb_overflow_buffer) goto cleanup4; - swiotlb_print_info(bytes); + swiotlb_print_info(); + + late_alloc = 1; return 0; @@ -281,6 +287,32 @@ cleanup1: return -ENOMEM; } +void __init swiotlb_free(void) +{ + if (!io_tlb_overflow_buffer) + return; + + if (late_alloc) { + free_pages((unsigned long)io_tlb_overflow_buffer, + get_order(io_tlb_overflow)); + free_pages((unsigned long)io_tlb_orig_addr, + get_order(io_tlb_nslabs * sizeof(phys_addr_t))); + free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * + sizeof(int))); + free_pages((unsigned long)io_tlb_start, + get_order(io_tlb_nslabs << IO_TLB_SHIFT)); + } else { + free_bootmem_late(__pa(io_tlb_overflow_buffer), + io_tlb_overflow); + free_bootmem_late(__pa(io_tlb_orig_addr), + io_tlb_nslabs * sizeof(phys_addr_t)); + free_bootmem_late(__pa(io_tlb_list), + io_tlb_nslabs * sizeof(int)); + free_bootmem_late(__pa(io_tlb_start), + io_tlb_nslabs << IO_TLB_SHIFT); + } +} + static int is_swiotlb_buffer(phys_addr_t paddr) { return paddr >= virt_to_phys(io_tlb_start) && diff --git a/mm/bootmem.c b/mm/bootmem.c index 555d5d2731c..d1dc23cc7f1 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. + */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long cursor, end; + + kmemleak_free_part(__va(addr), size); + + cursor = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; |