From 4f1fcfe94c1700f9e891adc1ca364a5cef00bbfd Mon Sep 17 00:00:00 2001 From: Simon Xue Date: Wed, 3 May 2017 17:19:40 +0200 Subject: [PATCH 01/60] iommu/rockchip: Enable Rockchip IOMMU on ARM64 This patch makes it possible to compile the rockchip-iommu driver on ARM64, so that it can be used with 64-bit SoCs equipped with this type of IOMMU. Signed-off-by: Simon Xue Signed-off-by: Shunqian Zheng Signed-off-by: Tomasz Figa Reviewed-by: Matthias Brugger Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 6ee3a25ae731..99c6366a2551 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -219,7 +219,7 @@ config OMAP_IOMMU_DEBUG config ROCKCHIP_IOMMU bool "Rockchip IOMMU Support" - depends on ARM + depends on ARM || ARM64 depends on ARCH_ROCKCHIP || COMPILE_TEST select IOMMU_API select ARM_DMA_USE_IOMMU From 15060aba717115dc9f204c02213a7c6bf341163e Mon Sep 17 00:00:00 2001 From: CQ Tang Date: Wed, 10 May 2017 11:39:03 -0700 Subject: [PATCH 02/60] iommu/vt-d: Helper function to query if a pasid has any active users A driver would need to know if there are any active references to a a PASID before cleaning up its resources. This function helps check if there are any active users of a PASID before it can perform any recovery on that device. To: Joerg Roedel To: linux-kernel@vger.kernel.org To: David Woodhouse Cc: Jean-Phillipe Brucker Cc: iommu@lists.linux-foundation.org Signed-off-by: CQ Tang Signed-off-by: Ashok Raj Signed-off-by: Joerg Roedel --- drivers/iommu/intel-svm.c | 30 ++++++++++++++++++++++++++++++ include/linux/intel-svm.h | 20 ++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 23c427602c55..f167c0d84ebf 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -489,6 +489,36 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) } EXPORT_SYMBOL_GPL(intel_svm_unbind_mm); +int intel_svm_is_pasid_valid(struct device *dev, int pasid) +{ + struct intel_iommu *iommu; + struct intel_svm *svm; + int ret = -EINVAL; + + mutex_lock(&pasid_mutex); + iommu = intel_svm_device_to_iommu(dev); + if (!iommu || !iommu->pasid_table) + goto out; + + svm = idr_find(&iommu->pasid_idr, pasid); + if (!svm) + goto out; + + /* init_mm is used in this case */ + if (!svm->mm) + ret = 1; + else if (atomic_read(&svm->mm->mm_users) > 0) + ret = 1; + else + ret = 0; + + out: + mutex_unlock(&pasid_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid); + /* Page request queue descriptor */ struct page_req_dsc { u64 srr:1; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 3c25794042f9..99bc5b3ae26e 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -102,6 +102,21 @@ extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, */ extern int intel_svm_unbind_mm(struct device *dev, int pasid); +/** + * intel_svm_is_pasid_valid() - check if pasid is valid + * @dev: Device for which PASID was allocated + * @pasid: PASID value to be checked + * + * This function checks if the specified pasid is still valid. A + * valid pasid means the backing mm is still having a valid user. + * For kernel callers init_mm is always valid. for other mm, if mm->mm_users + * is non-zero, it is valid. + * + * returns -EINVAL if invalid pasid, 0 if pasid ref count is invalid + * 1 if pasid is valid. + */ +extern int intel_svm_is_pasid_valid(struct device *dev, int pasid); + #else /* CONFIG_INTEL_IOMMU_SVM */ static inline int intel_svm_bind_mm(struct device *dev, int *pasid, @@ -114,6 +129,11 @@ static inline int intel_svm_unbind_mm(struct device *dev, int pasid) { BUG(); } + +static int intel_svm_is_pasid_valid(struct device *dev, int pasid) +{ + return -EINVAL; +} #endif /* CONFIG_INTEL_IOMMU_SVM */ #define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0, NULL)) From 6aa9a30838707f19cd5de4b8710987fbc747cdaf Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 17 May 2017 19:06:27 +0900 Subject: [PATCH 03/60] iommu/ipmmu-vmsa: Remove platform data handling The IPMMU driver is using DT these days, and platform data is no longer used by the driver. Remove unused code. Signed-off-by: Magnus Damm Reviewed-by: Laurent Pinchart Reviewed-by: Joerg Roedel Reviewed-by: Geert Uytterhoeven Signed-off-by: Joerg Roedel --- drivers/iommu/ipmmu-vmsa.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index b7e14ee863f9..d10a8d5eee4f 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -768,11 +768,6 @@ static int ipmmu_probe(struct platform_device *pdev) int irq; int ret; - if (!IS_ENABLED(CONFIG_OF) && !pdev->dev.platform_data) { - dev_err(&pdev->dev, "missing platform data\n"); - return -EINVAL; - } - mmu = devm_kzalloc(&pdev->dev, sizeof(*mmu), GFP_KERNEL); if (!mmu) { dev_err(&pdev->dev, "cannot allocate device data\n"); From dbb7069223bed0c40511ee28e5be519405e47906 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 17 May 2017 19:06:38 +0900 Subject: [PATCH 04/60] iommu/ipmmu-vmsa: Rework interrupt code and use bitmap for context Introduce a bitmap for context handing and convert the interrupt routine to handle all registered contexts. At this point the number of contexts are still limited. Also remove the use of the ARM specific mapping variable from ipmmu_irq() to allow compile on ARM64. Signed-off-by: Magnus Damm Reviewed-by: Joerg Roedel Signed-off-by: Joerg Roedel --- drivers/iommu/ipmmu-vmsa.c | 76 +++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index d10a8d5eee4f..787b675b0e9f 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -8,6 +8,7 @@ * the Free Software Foundation; version 2 of the License. */ +#include #include #include #include @@ -26,12 +27,17 @@ #include "io-pgtable.h" +#define IPMMU_CTX_MAX 1 + struct ipmmu_vmsa_device { struct device *dev; void __iomem *base; struct list_head list; unsigned int num_utlbs; + spinlock_t lock; /* Protects ctx and domains[] */ + DECLARE_BITMAP(ctx, IPMMU_CTX_MAX); + struct ipmmu_vmsa_domain *domains[IPMMU_CTX_MAX]; struct dma_iommu_mapping *mapping; }; @@ -293,9 +299,29 @@ static struct iommu_gather_ops ipmmu_gather_ops = { * Domain/Context Management */ +static int ipmmu_domain_allocate_context(struct ipmmu_vmsa_device *mmu, + struct ipmmu_vmsa_domain *domain) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&mmu->lock, flags); + + ret = find_first_zero_bit(mmu->ctx, IPMMU_CTX_MAX); + if (ret != IPMMU_CTX_MAX) { + mmu->domains[ret] = domain; + set_bit(ret, mmu->ctx); + } + + spin_unlock_irqrestore(&mmu->lock, flags); + + return ret; +} + static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain) { u64 ttbr; + int ret; /* * Allocate the page table operations. @@ -327,10 +353,15 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain) return -EINVAL; /* - * TODO: When adding support for multiple contexts, find an unused - * context. + * Find an unused context. */ - domain->context_id = 0; + ret = ipmmu_domain_allocate_context(domain->mmu, domain); + if (ret == IPMMU_CTX_MAX) { + free_io_pgtable_ops(domain->iop); + return -EBUSY; + } + + domain->context_id = ret; /* TTBR0 */ ttbr = domain->cfg.arm_lpae_s1_cfg.ttbr[0]; @@ -372,6 +403,19 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain) return 0; } +static void ipmmu_domain_free_context(struct ipmmu_vmsa_device *mmu, + unsigned int context_id) +{ + unsigned long flags; + + spin_lock_irqsave(&mmu->lock, flags); + + clear_bit(context_id, mmu->ctx); + mmu->domains[context_id] = NULL; + + spin_unlock_irqrestore(&mmu->lock, flags); +} + static void ipmmu_domain_destroy_context(struct ipmmu_vmsa_domain *domain) { /* @@ -382,6 +426,7 @@ static void ipmmu_domain_destroy_context(struct ipmmu_vmsa_domain *domain) */ ipmmu_ctx_write(domain, IMCTR, IMCTR_FLUSH); ipmmu_tlb_sync(domain); + ipmmu_domain_free_context(domain->mmu, domain->context_id); } /* ----------------------------------------------------------------------------- @@ -439,16 +484,25 @@ static irqreturn_t ipmmu_domain_irq(struct ipmmu_vmsa_domain *domain) static irqreturn_t ipmmu_irq(int irq, void *dev) { struct ipmmu_vmsa_device *mmu = dev; - struct iommu_domain *io_domain; - struct ipmmu_vmsa_domain *domain; + irqreturn_t status = IRQ_NONE; + unsigned int i; + unsigned long flags; - if (!mmu->mapping) - return IRQ_NONE; + spin_lock_irqsave(&mmu->lock, flags); - io_domain = mmu->mapping->domain; - domain = to_vmsa_domain(io_domain); + /* + * Check interrupts for all active contexts. + */ + for (i = 0; i < IPMMU_CTX_MAX; i++) { + if (!mmu->domains[i]) + continue; + if (ipmmu_domain_irq(mmu->domains[i]) == IRQ_HANDLED) + status = IRQ_HANDLED; + } - return ipmmu_domain_irq(domain); + spin_unlock_irqrestore(&mmu->lock, flags); + + return status; } /* ----------------------------------------------------------------------------- @@ -776,6 +830,8 @@ static int ipmmu_probe(struct platform_device *pdev) mmu->dev = &pdev->dev; mmu->num_utlbs = 32; + spin_lock_init(&mmu->lock); + bitmap_zero(mmu->ctx, IPMMU_CTX_MAX); /* Map I/O memory and request IRQ. */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); From 383fef5f4b56d8dc05cac12dc04262c8175331b1 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 17 May 2017 19:06:48 +0900 Subject: [PATCH 05/60] iommu/ipmmu-vmsa: Break out utlb parsing code Break out the utlb parsing code and dev_data allocation into a separate function. This is preparation for future code sharing. Signed-off-by: Magnus Damm Reviewed-by: Joerg Roedel Signed-off-by: Joerg Roedel --- drivers/iommu/ipmmu-vmsa.c | 64 ++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 787b675b0e9f..c4c35abc1c1a 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -649,22 +649,15 @@ static int ipmmu_find_utlbs(struct ipmmu_vmsa_device *mmu, struct device *dev, return 0; } -static int ipmmu_add_device(struct device *dev) +static int ipmmu_init_platform_device(struct device *dev) { struct ipmmu_vmsa_archdata *archdata; struct ipmmu_vmsa_device *mmu; - struct iommu_group *group = NULL; unsigned int *utlbs; unsigned int i; int num_utlbs; int ret = -ENODEV; - if (dev->archdata.iommu) { - dev_warn(dev, "IOMMU driver already assigned to device %s\n", - dev_name(dev)); - return -EINVAL; - } - /* Find the master corresponding to the device. */ num_utlbs = of_count_phandle_with_args(dev->of_node, "iommus", @@ -701,6 +694,36 @@ static int ipmmu_add_device(struct device *dev) } } + archdata = kzalloc(sizeof(*archdata), GFP_KERNEL); + if (!archdata) { + ret = -ENOMEM; + goto error; + } + + archdata->mmu = mmu; + archdata->utlbs = utlbs; + archdata->num_utlbs = num_utlbs; + dev->archdata.iommu = archdata; + return 0; + +error: + kfree(utlbs); + return ret; +} + +static int ipmmu_add_device(struct device *dev) +{ + struct ipmmu_vmsa_archdata *archdata; + struct ipmmu_vmsa_device *mmu = NULL; + struct iommu_group *group; + int ret; + + if (dev->archdata.iommu) { + dev_warn(dev, "IOMMU driver already assigned to device %s\n", + dev_name(dev)); + return -EINVAL; + } + /* Create a device group and add the device to it. */ group = iommu_group_alloc(); if (IS_ERR(group)) { @@ -718,16 +741,9 @@ static int ipmmu_add_device(struct device *dev) goto error; } - archdata = kzalloc(sizeof(*archdata), GFP_KERNEL); - if (!archdata) { - ret = -ENOMEM; + ret = ipmmu_init_platform_device(dev); + if (ret < 0) goto error; - } - - archdata->mmu = mmu; - archdata->utlbs = utlbs; - archdata->num_utlbs = num_utlbs; - dev->archdata.iommu = archdata; /* * Create the ARM mapping, used by the ARM DMA mapping core to allocate @@ -738,6 +754,8 @@ static int ipmmu_add_device(struct device *dev) * - Make the mapping size configurable ? We currently use a 2GB mapping * at a 1GB offset to ensure that NULL VAs will fault. */ + archdata = dev->archdata.iommu; + mmu = archdata->mmu; if (!mmu->mapping) { struct dma_iommu_mapping *mapping; @@ -762,16 +780,16 @@ static int ipmmu_add_device(struct device *dev) return 0; error: - arm_iommu_release_mapping(mmu->mapping); - - kfree(dev->archdata.iommu); - kfree(utlbs); - - dev->archdata.iommu = NULL; + if (mmu) + arm_iommu_release_mapping(mmu->mapping); if (!IS_ERR_OR_NULL(group)) iommu_group_remove_device(dev); + kfree(archdata->utlbs); + kfree(archdata); + dev->archdata.iommu = NULL; + return ret; } From 8e73bf659135ee7333d9a762bbdadb2ad794452f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 17 May 2017 19:06:59 +0900 Subject: [PATCH 06/60] iommu/ipmmu-vmsa: Break out domain allocation code Break out the domain allocation code into a separate function. This is preparation for future code sharing. Signed-off-by: Magnus Damm Reviewed-by: Joerg Roedel Reviewed-by: Geert Uytterhoeven Signed-off-by: Joerg Roedel --- drivers/iommu/ipmmu-vmsa.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index c4c35abc1c1a..eb5008596051 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -509,13 +509,10 @@ static irqreturn_t ipmmu_irq(int irq, void *dev) * IOMMU Operations */ -static struct iommu_domain *ipmmu_domain_alloc(unsigned type) +static struct iommu_domain *__ipmmu_domain_alloc(unsigned type) { struct ipmmu_vmsa_domain *domain; - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; @@ -525,6 +522,14 @@ static struct iommu_domain *ipmmu_domain_alloc(unsigned type) return &domain->io_domain; } +static struct iommu_domain *ipmmu_domain_alloc(unsigned type) +{ + if (type != IOMMU_DOMAIN_UNMANAGED) + return NULL; + + return __ipmmu_domain_alloc(type); +} + static void ipmmu_domain_free(struct iommu_domain *io_domain) { struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); From 3ae47292024f85e82b769044c43f0bd13adcd7e8 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 17 May 2017 19:07:10 +0900 Subject: [PATCH 07/60] iommu/ipmmu-vmsa: Add new IOMMU_DOMAIN_DMA ops Introduce an alternative set of iommu_ops suitable for 64-bit ARM as well as 32-bit ARM when CONFIG_IOMMU_DMA=y. Also adjust the Kconfig to depend on ARM or IOMMU_DMA. Initialize the device from ->xlate() when CONFIG_IOMMU_DMA=y. Signed-off-by: Magnus Damm Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 1 + drivers/iommu/ipmmu-vmsa.c | 164 +++++++++++++++++++++++++++++++++++-- 2 files changed, 156 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 6ee3a25ae731..504ba025a54c 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -274,6 +274,7 @@ config EXYNOS_IOMMU_DEBUG config IPMMU_VMSA bool "Renesas VMSA-compatible IPMMU" + depends on ARM || IOMMU_DMA depends on ARM_LPAE depends on ARCH_RENESAS || COMPILE_TEST select IOMMU_API diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index eb5008596051..8b648f6e0e4d 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -22,8 +23,10 @@ #include #include +#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA) #include #include +#endif #include "io-pgtable.h" @@ -57,6 +60,8 @@ struct ipmmu_vmsa_archdata { struct ipmmu_vmsa_device *mmu; unsigned int *utlbs; unsigned int num_utlbs; + struct device *dev; + struct list_head list; }; static DEFINE_SPINLOCK(ipmmu_devices_lock); @@ -522,14 +527,6 @@ static struct iommu_domain *__ipmmu_domain_alloc(unsigned type) return &domain->io_domain; } -static struct iommu_domain *ipmmu_domain_alloc(unsigned type) -{ - if (type != IOMMU_DOMAIN_UNMANAGED) - return NULL; - - return __ipmmu_domain_alloc(type); -} - static void ipmmu_domain_free(struct iommu_domain *io_domain) { struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); @@ -572,7 +569,8 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, dev_err(dev, "Can't attach IPMMU %s to domain on IPMMU %s\n", dev_name(mmu->dev), dev_name(domain->mmu->dev)); ret = -EINVAL; - } + } else + dev_info(dev, "Reusing IPMMU context %u\n", domain->context_id); spin_unlock_irqrestore(&domain->lock, flags); @@ -708,6 +706,7 @@ static int ipmmu_init_platform_device(struct device *dev) archdata->mmu = mmu; archdata->utlbs = utlbs; archdata->num_utlbs = num_utlbs; + archdata->dev = dev; dev->archdata.iommu = archdata; return 0; @@ -716,6 +715,16 @@ error: return ret; } +#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA) + +static struct iommu_domain *ipmmu_domain_alloc(unsigned type) +{ + if (type != IOMMU_DOMAIN_UNMANAGED) + return NULL; + + return __ipmmu_domain_alloc(type); +} + static int ipmmu_add_device(struct device *dev) { struct ipmmu_vmsa_archdata *archdata; @@ -825,6 +834,141 @@ static const struct iommu_ops ipmmu_ops = { .pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K, }; +#endif /* !CONFIG_ARM && CONFIG_IOMMU_DMA */ + +#ifdef CONFIG_IOMMU_DMA + +static DEFINE_SPINLOCK(ipmmu_slave_devices_lock); +static LIST_HEAD(ipmmu_slave_devices); + +static struct iommu_domain *ipmmu_domain_alloc_dma(unsigned type) +{ + struct iommu_domain *io_domain = NULL; + + switch (type) { + case IOMMU_DOMAIN_UNMANAGED: + io_domain = __ipmmu_domain_alloc(type); + break; + + case IOMMU_DOMAIN_DMA: + io_domain = __ipmmu_domain_alloc(type); + if (io_domain) + iommu_get_dma_cookie(io_domain); + break; + } + + return io_domain; +} + +static void ipmmu_domain_free_dma(struct iommu_domain *io_domain) +{ + switch (io_domain->type) { + case IOMMU_DOMAIN_DMA: + iommu_put_dma_cookie(io_domain); + /* fall-through */ + default: + ipmmu_domain_free(io_domain); + break; + } +} + +static int ipmmu_add_device_dma(struct device *dev) +{ + struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu; + struct iommu_group *group; + + /* The device has been verified in xlate() */ + if (!archdata) + return -ENODEV; + + group = iommu_group_get_for_dev(dev); + if (IS_ERR(group)) + return PTR_ERR(group); + + spin_lock(&ipmmu_slave_devices_lock); + list_add(&archdata->list, &ipmmu_slave_devices); + spin_unlock(&ipmmu_slave_devices_lock); + return 0; +} + +static void ipmmu_remove_device_dma(struct device *dev) +{ + struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu; + + spin_lock(&ipmmu_slave_devices_lock); + list_del(&archdata->list); + spin_unlock(&ipmmu_slave_devices_lock); + + iommu_group_remove_device(dev); +} + +static struct device *ipmmu_find_sibling_device(struct device *dev) +{ + struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu; + struct ipmmu_vmsa_archdata *sibling_archdata = NULL; + bool found = false; + + spin_lock(&ipmmu_slave_devices_lock); + + list_for_each_entry(sibling_archdata, &ipmmu_slave_devices, list) { + if (archdata == sibling_archdata) + continue; + if (sibling_archdata->mmu == archdata->mmu) { + found = true; + break; + } + } + + spin_unlock(&ipmmu_slave_devices_lock); + + return found ? sibling_archdata->dev : NULL; +} + +static struct iommu_group *ipmmu_find_group_dma(struct device *dev) +{ + struct iommu_group *group; + struct device *sibling; + + sibling = ipmmu_find_sibling_device(dev); + if (sibling) + group = iommu_group_get(sibling); + if (!sibling || IS_ERR(group)) + group = generic_device_group(dev); + + return group; +} + +static int ipmmu_of_xlate_dma(struct device *dev, + struct of_phandle_args *spec) +{ + /* If the IPMMU device is disabled in DT then return error + * to make sure the of_iommu code does not install ops + * even though the iommu device is disabled + */ + if (!of_device_is_available(spec->np)) + return -ENODEV; + + return ipmmu_init_platform_device(dev); +} + +static const struct iommu_ops ipmmu_ops = { + .domain_alloc = ipmmu_domain_alloc_dma, + .domain_free = ipmmu_domain_free_dma, + .attach_dev = ipmmu_attach_device, + .detach_dev = ipmmu_detach_device, + .map = ipmmu_map, + .unmap = ipmmu_unmap, + .map_sg = default_iommu_map_sg, + .iova_to_phys = ipmmu_iova_to_phys, + .add_device = ipmmu_add_device_dma, + .remove_device = ipmmu_remove_device_dma, + .device_group = ipmmu_find_group_dma, + .pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K, + .of_xlate = ipmmu_of_xlate_dma, +}; + +#endif /* CONFIG_IOMMU_DMA */ + /* ----------------------------------------------------------------------------- * Probe/remove and init */ @@ -914,7 +1058,9 @@ static int ipmmu_remove(struct platform_device *pdev) list_del(&mmu->list); spin_unlock(&ipmmu_devices_lock); +#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA) arm_iommu_release_mapping(mmu->mapping); +#endif ipmmu_device_reset(mmu); From 0fbc8b04c34fef1d730712135c504978c939a5a3 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 17 May 2017 19:07:20 +0900 Subject: [PATCH 08/60] iommu/ipmmu-vmsa: Use fwspec iommu_priv on ARM64 Convert from archdata to iommu_priv via iommu_fwspec on ARM64 but let 32-bit ARM keep on using archdata for now. Once the 32-bit ARM code and the IPMMU driver is able to move over to CONFIG_IOMMU_DMA=y then coverting to fwspec via ->of_xlate() will be easy. For now fwspec ids and num_ids are not used to allow code sharing between 32-bit and 64-bit ARM code inside the driver. Signed-off-by: Magnus Damm Signed-off-by: Joerg Roedel --- drivers/iommu/ipmmu-vmsa.c | 97 +++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 39 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 8b648f6e0e4d..a04babbd7de9 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -56,7 +56,7 @@ struct ipmmu_vmsa_domain { spinlock_t lock; /* Protects mappings */ }; -struct ipmmu_vmsa_archdata { +struct ipmmu_vmsa_iommu_priv { struct ipmmu_vmsa_device *mmu; unsigned int *utlbs; unsigned int num_utlbs; @@ -72,6 +72,24 @@ static struct ipmmu_vmsa_domain *to_vmsa_domain(struct iommu_domain *dom) return container_of(dom, struct ipmmu_vmsa_domain, io_domain); } + +static struct ipmmu_vmsa_iommu_priv *to_priv(struct device *dev) +{ +#if defined(CONFIG_ARM) + return dev->archdata.iommu; +#else + return dev->iommu_fwspec->iommu_priv; +#endif +} +static void set_priv(struct device *dev, struct ipmmu_vmsa_iommu_priv *p) +{ +#if defined(CONFIG_ARM) + dev->archdata.iommu = p; +#else + dev->iommu_fwspec->iommu_priv = p; +#endif +} + #define TLB_LOOP_TIMEOUT 100 /* 100us */ /* ----------------------------------------------------------------------------- @@ -543,8 +561,8 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain) static int ipmmu_attach_device(struct iommu_domain *io_domain, struct device *dev) { - struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu; - struct ipmmu_vmsa_device *mmu = archdata->mmu; + struct ipmmu_vmsa_iommu_priv *priv = to_priv(dev); + struct ipmmu_vmsa_device *mmu = priv->mmu; struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); unsigned long flags; unsigned int i; @@ -577,8 +595,8 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, if (ret < 0) return ret; - for (i = 0; i < archdata->num_utlbs; ++i) - ipmmu_utlb_enable(domain, archdata->utlbs[i]); + for (i = 0; i < priv->num_utlbs; ++i) + ipmmu_utlb_enable(domain, priv->utlbs[i]); return 0; } @@ -586,12 +604,12 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, static void ipmmu_detach_device(struct iommu_domain *io_domain, struct device *dev) { - struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu; + struct ipmmu_vmsa_iommu_priv *priv = to_priv(dev); struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); unsigned int i; - for (i = 0; i < archdata->num_utlbs; ++i) - ipmmu_utlb_disable(domain, archdata->utlbs[i]); + for (i = 0; i < priv->num_utlbs; ++i) + ipmmu_utlb_disable(domain, priv->utlbs[i]); /* * TODO: Optimize by disabling the context when no device is attached. @@ -654,7 +672,7 @@ static int ipmmu_find_utlbs(struct ipmmu_vmsa_device *mmu, struct device *dev, static int ipmmu_init_platform_device(struct device *dev) { - struct ipmmu_vmsa_archdata *archdata; + struct ipmmu_vmsa_iommu_priv *priv; struct ipmmu_vmsa_device *mmu; unsigned int *utlbs; unsigned int i; @@ -697,17 +715,17 @@ static int ipmmu_init_platform_device(struct device *dev) } } - archdata = kzalloc(sizeof(*archdata), GFP_KERNEL); - if (!archdata) { + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { ret = -ENOMEM; goto error; } - archdata->mmu = mmu; - archdata->utlbs = utlbs; - archdata->num_utlbs = num_utlbs; - archdata->dev = dev; - dev->archdata.iommu = archdata; + priv->mmu = mmu; + priv->utlbs = utlbs; + priv->num_utlbs = num_utlbs; + priv->dev = dev; + set_priv(dev, priv); return 0; error: @@ -727,12 +745,11 @@ static struct iommu_domain *ipmmu_domain_alloc(unsigned type) static int ipmmu_add_device(struct device *dev) { - struct ipmmu_vmsa_archdata *archdata; struct ipmmu_vmsa_device *mmu = NULL; struct iommu_group *group; int ret; - if (dev->archdata.iommu) { + if (to_priv(dev)) { dev_warn(dev, "IOMMU driver already assigned to device %s\n", dev_name(dev)); return -EINVAL; @@ -768,8 +785,7 @@ static int ipmmu_add_device(struct device *dev) * - Make the mapping size configurable ? We currently use a 2GB mapping * at a 1GB offset to ensure that NULL VAs will fault. */ - archdata = dev->archdata.iommu; - mmu = archdata->mmu; + mmu = to_priv(dev)->mmu; if (!mmu->mapping) { struct dma_iommu_mapping *mapping; @@ -800,24 +816,24 @@ error: if (!IS_ERR_OR_NULL(group)) iommu_group_remove_device(dev); - kfree(archdata->utlbs); - kfree(archdata); - dev->archdata.iommu = NULL; + kfree(to_priv(dev)->utlbs); + kfree(to_priv(dev)); + set_priv(dev, NULL); return ret; } static void ipmmu_remove_device(struct device *dev) { - struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu; + struct ipmmu_vmsa_iommu_priv *priv = to_priv(dev); arm_iommu_detach_device(dev); iommu_group_remove_device(dev); - kfree(archdata->utlbs); - kfree(archdata); + kfree(priv->utlbs); + kfree(priv); - dev->archdata.iommu = NULL; + set_priv(dev, NULL); } static const struct iommu_ops ipmmu_ops = { @@ -874,11 +890,14 @@ static void ipmmu_domain_free_dma(struct iommu_domain *io_domain) static int ipmmu_add_device_dma(struct device *dev) { - struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu; + struct iommu_fwspec *fwspec = dev->iommu_fwspec; struct iommu_group *group; - /* The device has been verified in xlate() */ - if (!archdata) + /* + * Only let through devices that have been verified in xlate() + * We may get called with dev->iommu_fwspec set to NULL. + */ + if (!fwspec || !fwspec->iommu_priv) return -ENODEV; group = iommu_group_get_for_dev(dev); @@ -886,17 +905,17 @@ static int ipmmu_add_device_dma(struct device *dev) return PTR_ERR(group); spin_lock(&ipmmu_slave_devices_lock); - list_add(&archdata->list, &ipmmu_slave_devices); + list_add(&to_priv(dev)->list, &ipmmu_slave_devices); spin_unlock(&ipmmu_slave_devices_lock); return 0; } static void ipmmu_remove_device_dma(struct device *dev) { - struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu; + struct ipmmu_vmsa_iommu_priv *priv = to_priv(dev); spin_lock(&ipmmu_slave_devices_lock); - list_del(&archdata->list); + list_del(&priv->list); spin_unlock(&ipmmu_slave_devices_lock); iommu_group_remove_device(dev); @@ -904,16 +923,16 @@ static void ipmmu_remove_device_dma(struct device *dev) static struct device *ipmmu_find_sibling_device(struct device *dev) { - struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu; - struct ipmmu_vmsa_archdata *sibling_archdata = NULL; + struct ipmmu_vmsa_iommu_priv *priv = to_priv(dev); + struct ipmmu_vmsa_iommu_priv *sibling_priv = NULL; bool found = false; spin_lock(&ipmmu_slave_devices_lock); - list_for_each_entry(sibling_archdata, &ipmmu_slave_devices, list) { - if (archdata == sibling_archdata) + list_for_each_entry(sibling_priv, &ipmmu_slave_devices, list) { + if (priv == sibling_priv) continue; - if (sibling_archdata->mmu == archdata->mmu) { + if (sibling_priv->mmu == priv->mmu) { found = true; break; } @@ -921,7 +940,7 @@ static struct device *ipmmu_find_sibling_device(struct device *dev) spin_unlock(&ipmmu_slave_devices_lock); - return found ? sibling_archdata->dev : NULL; + return found ? sibling_priv->dev : NULL; } static struct iommu_group *ipmmu_find_group_dma(struct device *dev) From d74c67d46b8e03883326163587de783adf61ef6a Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 17 May 2017 19:07:30 +0900 Subject: [PATCH 09/60] iommu/ipmmu-vmsa: Drop LPAE Kconfig dependency Neither the ARM page table code enabled by IOMMU_IO_PGTABLE_LPAE nor the IPMMU_VMSA driver actually depends on ARM_LPAE, so get rid of the dependency. Tested with ipmmu-vmsa on r8a7794 ALT and a kernel config using: # CONFIG_ARM_LPAE is not set Signed-off-by: Magnus Damm Acked-by: Laurent Pinchart Reviewed-by: Joerg Roedel Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 504ba025a54c..45cb1905b9bc 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -275,7 +275,6 @@ config EXYNOS_IOMMU_DEBUG config IPMMU_VMSA bool "Renesas VMSA-compatible IPMMU" depends on ARM || IOMMU_DMA - depends on ARM_LPAE depends on ARCH_RENESAS || COMPILE_TEST select IOMMU_API select IOMMU_IO_PGTABLE_LPAE From 26b6aec6e726597149b4bcd4cc8583f402f3da14 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 17 May 2017 19:07:41 +0900 Subject: [PATCH 10/60] iommu/ipmmu-vmsa: Fix pgsize_bitmap semicolon typo Fix comma-instead-of-semicolon typo error present in the latest version of the IPMMU driver. Signed-off-by: Magnus Damm Reviewed-by: Geert Uytterhoeven Signed-off-by: Joerg Roedel --- drivers/iommu/ipmmu-vmsa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index a04babbd7de9..2a38aa15be17 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -358,7 +358,7 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain) * non-secure mode. */ domain->cfg.quirks = IO_PGTABLE_QUIRK_ARM_NS; - domain->cfg.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K, + domain->cfg.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K; domain->cfg.ias = 32; domain->cfg.oas = 40; domain->cfg.tlb = &ipmmu_gather_ops; From 757c370f036e1f9f9a816cd481a13cdbcb346eb9 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 16 May 2017 12:26:48 +0100 Subject: [PATCH 11/60] iommu/iova: Sort out rbtree limit_pfn handling When walking the rbtree, the fact that iovad->start_pfn and limit_pfn are both inclusive limits creates an ambiguity once limit_pfn reaches the bottom of the address space and they overlap. Commit 5016bdb796b3 ("iommu/iova: Fix underflow bug in __alloc_and_insert_iova_range") fixed the worst side-effect of this, that of underflow wraparound leading to bogus allocations, but the remaining fallout is that any attempt to allocate start_pfn itself erroneously fails. The cleanest way to resolve the ambiguity is to simply make limit_pfn an exclusive limit when inside the guts of the rbtree. Since we're working with PFNs, representing one past the top of the address space is always possible without fear of overflow, and elsewhere it just makes life a little more straightforward. Reported-by: Aaron Sierra Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 2 +- drivers/iommu/iova.c | 21 +++++++++------------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 8348f366ddd1..aaf6fa304240 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -314,7 +314,7 @@ int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, * If we have devices with different DMA masks, move the free * area cache limit down for the benefit of the smaller one. */ - iovad->dma_32bit_pfn = min(end_pfn, iovad->dma_32bit_pfn); + iovad->dma_32bit_pfn = min(end_pfn + 1, iovad->dma_32bit_pfn); return 0; } diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 5c88ba70e4e0..3f24c9a831c9 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -48,7 +48,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, iovad->cached32_node = NULL; iovad->granule = granule; iovad->start_pfn = start_pfn; - iovad->dma_32bit_pfn = pfn_32bit; + iovad->dma_32bit_pfn = pfn_32bit + 1; init_iova_rcaches(iovad); } EXPORT_SYMBOL_GPL(init_iova_domain); @@ -63,7 +63,7 @@ __get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn) struct rb_node *prev_node = rb_prev(iovad->cached32_node); struct iova *curr_iova = rb_entry(iovad->cached32_node, struct iova, node); - *limit_pfn = curr_iova->pfn_lo - 1; + *limit_pfn = curr_iova->pfn_lo; return prev_node; } } @@ -135,7 +135,7 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova, static unsigned int iova_get_pad_size(unsigned int size, unsigned int limit_pfn) { - return (limit_pfn + 1 - size) & (__roundup_pow_of_two(size) - 1); + return (limit_pfn - size) & (__roundup_pow_of_two(size) - 1); } static int __alloc_and_insert_iova_range(struct iova_domain *iovad, @@ -155,18 +155,15 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, while (curr) { struct iova *curr_iova = rb_entry(curr, struct iova, node); - if (limit_pfn < curr_iova->pfn_lo) + if (limit_pfn <= curr_iova->pfn_lo) { goto move_left; - else if (limit_pfn < curr_iova->pfn_hi) - goto adjust_limit_pfn; - else { + } else if (limit_pfn > curr_iova->pfn_hi) { if (size_aligned) pad_size = iova_get_pad_size(size, limit_pfn); - if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn) + if ((curr_iova->pfn_hi + size + pad_size) < limit_pfn) break; /* found a free slot */ } -adjust_limit_pfn: - limit_pfn = curr_iova->pfn_lo ? (curr_iova->pfn_lo - 1) : 0; + limit_pfn = curr_iova->pfn_lo; move_left: prev = curr; curr = rb_prev(curr); @@ -182,7 +179,7 @@ move_left: } /* pfn_lo will point to size aligned address if size_aligned is set */ - new->pfn_lo = limit_pfn - (size + pad_size) + 1; + new->pfn_lo = limit_pfn - (size + pad_size); new->pfn_hi = new->pfn_lo + size - 1; /* If we have 'prev', it's a valid place to start the insertion. */ @@ -269,7 +266,7 @@ alloc_iova(struct iova_domain *iovad, unsigned long size, if (!new_iova) return NULL; - ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn, + ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1, new_iova, size_aligned); if (ret) { From b316d02a13c3a8ebe681b7232defbdf3fa28c41b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 22 May 2017 18:28:51 +0800 Subject: [PATCH 12/60] iommu/vt-d: Unwrap __get_valid_domain_for_dev() We do find_domain() in __get_valid_domain_for_dev(), while we do the same thing in get_valid_domain_for_dev(). No need to do it twice. Signed-off-by: Peter Xu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 90ab0115d78e..ee9c258d3ae0 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2387,7 +2387,7 @@ static struct dmar_domain *find_domain(struct device *dev) /* No lock here, assumes no domain exit in normal case */ info = dev->archdata.iommu; - if (info) + if (likely(info)) return info->domain; return NULL; } @@ -3475,7 +3475,7 @@ static unsigned long intel_alloc_iova(struct device *dev, return iova_pfn; } -static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev) +static struct dmar_domain *get_valid_domain_for_dev(struct device *dev) { struct dmar_domain *domain, *tmp; struct dmar_rmrr_unit *rmrr; @@ -3522,18 +3522,6 @@ out: return domain; } -static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev) -{ - struct device_domain_info *info; - - /* No lock here, assumes no domain exit in normal case */ - info = dev->archdata.iommu; - if (likely(info)) - return info->domain; - - return __get_valid_domain_for_dev(dev); -} - /* Check if the dev needs to go through non-identity map and unmap process.*/ static int iommu_no_mapping(struct device *dev) { From 71bb620df634b22a08efd62a0f93c3f2aceaa8e2 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 24 May 2017 16:31:23 +0200 Subject: [PATCH 13/60] iommu/vt-d: Constify irq_domain_ops struct irq_domain_ops is not modified, so it can be made const. Signed-off-by: Tobias Klauser Signed-off-by: Joerg Roedel --- drivers/iommu/intel_irq_remapping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index a190cbd76ef7..f7ef4a5d4785 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -76,7 +76,7 @@ static struct hpet_scope ir_hpet[MAX_HPET_TBS]; * the dmar_global_lock. */ static DEFINE_RAW_SPINLOCK(irq_2_ir_lock); -static struct irq_domain_ops intel_ir_domain_ops; +static const struct irq_domain_ops intel_ir_domain_ops; static void iommu_disable_irq_remapping(struct intel_iommu *iommu); static int __init parse_ioapics_under_ir(void); @@ -1396,7 +1396,7 @@ static void intel_irq_remapping_deactivate(struct irq_domain *domain, modify_irte(&data->irq_2_iommu, &entry); } -static struct irq_domain_ops intel_ir_domain_ops = { +static const struct irq_domain_ops intel_ir_domain_ops = { .alloc = intel_irq_remapping_alloc, .free = intel_irq_remapping_free, .activate = intel_irq_remapping_activate, From 30bf2df65b0d18f53ee9c92131239f66c8f55d63 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 15 May 2017 16:25:03 +0200 Subject: [PATCH 14/60] iommu/amd: Ratelimit io-page-faults per device Misbehaving devices can cause an endless chain of io-page-faults, flooding dmesg and making the system-log unusable or even prevent the system from booting. So ratelimit the error messages about io-page-faults on a per-device basis. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 40 ++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 63cacf5d6cf2..d3ff766a8ca6 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -138,6 +138,8 @@ struct iommu_dev_data { PPR completions */ u32 errata; /* Bitmap for errata to apply */ bool use_vapic; /* Enable device to use vapic mode */ + + struct ratelimit_state rs; /* Ratelimit IOPF messages */ }; /* @@ -253,6 +255,8 @@ static struct iommu_dev_data *alloc_dev_data(u16 devid) list_add_tail(&dev_data->dev_data_list, &dev_data_list); spin_unlock_irqrestore(&dev_data_list_lock, flags); + ratelimit_default_init(&dev_data->rs); + return dev_data; } @@ -551,6 +555,29 @@ static void dump_command(unsigned long phys_addr) pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]); } +static void amd_iommu_report_page_fault(u16 devid, u16 domain_id, + u64 address, int flags) +{ + struct iommu_dev_data *dev_data = NULL; + struct pci_dev *pdev; + + pdev = pci_get_bus_and_slot(PCI_BUS_NUM(devid), devid & 0xff); + if (pdev) + dev_data = get_dev_data(&pdev->dev); + + if (dev_data && __ratelimit(&dev_data->rs)) { + dev_err(&pdev->dev, "AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%016llx flags=0x%04x]\n", + domain_id, address, flags); + } else if (printk_ratelimit()) { + pr_err("AMD-Vi: Event logged [IO_PAGE_FAULT device=%02x:%02x.%x domain=0x%04x address=0x%016llx flags=0x%04x]\n", + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), + domain_id, address, flags); + } + + if (pdev) + pci_dev_put(pdev); +} + static void iommu_print_event(struct amd_iommu *iommu, void *__evt) { int type, devid, domid, flags; @@ -575,7 +602,12 @@ retry: goto retry; } - printk(KERN_ERR "AMD-Vi: Event logged ["); + if (type == EVENT_TYPE_IO_FAULT) { + amd_iommu_report_page_fault(devid, domid, address, flags); + return; + } else { + printk(KERN_ERR "AMD-Vi: Event logged ["); + } switch (type) { case EVENT_TYPE_ILL_DEV: @@ -585,12 +617,6 @@ retry: address, flags); dump_dte_entry(devid); break; - case EVENT_TYPE_IO_FAULT: - printk("IO_PAGE_FAULT device=%02x:%02x.%x " - "domain=0x%04x address=0x%016llx flags=0x%04x]\n", - PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), - domid, address, flags); - break; case EVENT_TYPE_DEV_TAB_ERR: printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " "address=0x%016llx flags=0x%04x]\n", From e2f9d45fb452c60c9aaab55ce6f40601f9ec6516 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 24 May 2017 16:31:16 +0200 Subject: [PATCH 15/60] iommu/amd: Constify irq_domain_ops struct irq_domain_ops is not modified, so it can be made const. Signed-off-by: Tobias Klauser Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index d3ff766a8ca6..d7748955184b 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4299,7 +4299,7 @@ static void irq_remapping_deactivate(struct irq_domain *domain, irte_info->index); } -static struct irq_domain_ops amd_ir_domain_ops = { +static const struct irq_domain_ops amd_ir_domain_ops = { .alloc = irq_remapping_alloc, .free = irq_remapping_free, .activate = irq_remapping_activate, From d334a5637dfb53f7d07017afc1e491903b482ef8 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 5 Jun 2017 14:52:12 -0500 Subject: [PATCH 16/60] iommu/amd: Reduce amount of MMIO when submitting commands As newer, higher speed devices are developed, perf data shows that the amount of MMIO that is performed when submitting commands to the IOMMU causes performance issues. Currently, the command submission path reads the command buffer head and tail pointers and then writes the tail pointer once the command is ready. The tail pointer is only ever updated by the driver so it can be tracked by the driver without having to read it from the hardware. The head pointer is updated by the hardware, but can be read opportunistically. Reading the head pointer only when it appears that there might not be room in the command buffer and then re-checking the available space reduces the number of times the head pointer has to be read. Signed-off-by: Tom Lendacky Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 35 +++++++++++++++++++++------------ drivers/iommu/amd_iommu_init.c | 2 ++ drivers/iommu/amd_iommu_types.h | 2 ++ 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index d7748955184b..d81c895ff4f4 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -874,19 +874,20 @@ static int wait_on_sem(volatile u64 *sem) } static void copy_cmd_to_buffer(struct amd_iommu *iommu, - struct iommu_cmd *cmd, - u32 tail) + struct iommu_cmd *cmd) { u8 *target; - target = iommu->cmd_buf + tail; - tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE; + target = iommu->cmd_buf + iommu->cmd_buf_tail; + + iommu->cmd_buf_tail += sizeof(*cmd); + iommu->cmd_buf_tail %= CMD_BUFFER_SIZE; /* Copy command to buffer */ memcpy(target, cmd, sizeof(*cmd)); /* Tell the IOMMU about it */ - writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + writel(iommu->cmd_buf_tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); } static void build_completion_wait(struct iommu_cmd *cmd, u64 address) @@ -1044,23 +1045,31 @@ static int __iommu_queue_command_sync(struct amd_iommu *iommu, struct iommu_cmd *cmd, bool sync) { - u32 left, tail, head, next_tail; + bool read_head = true; + u32 left, next_tail; + next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE; again: - - head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); - tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - next_tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE; - left = (head - next_tail) % CMD_BUFFER_SIZE; + left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE; if (left <= 0x20) { struct iommu_cmd sync_cmd; int ret; + if (read_head) { + /* Update head and recheck remaining space */ + iommu->cmd_buf_head = readl(iommu->mmio_base + + MMIO_CMD_HEAD_OFFSET); + read_head = false; + goto again; + } + + read_head = true; + iommu->cmd_sem = 0; build_completion_wait(&sync_cmd, (u64)&iommu->cmd_sem); - copy_cmd_to_buffer(iommu, &sync_cmd, tail); + copy_cmd_to_buffer(iommu, &sync_cmd); if ((ret = wait_on_sem(&iommu->cmd_sem)) != 0) return ret; @@ -1068,7 +1077,7 @@ again: goto again; } - copy_cmd_to_buffer(iommu, cmd, tail); + copy_cmd_to_buffer(iommu, cmd); /* We need to sync now to make sure all commands are processed */ iommu->need_sync = sync; diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 5a11328f4d98..3fa7e3b35507 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -588,6 +588,8 @@ void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu) writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + iommu->cmd_buf_head = 0; + iommu->cmd_buf_tail = 0; iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 4de8f4160bb8..6960d7db2fab 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -516,6 +516,8 @@ struct amd_iommu { /* command buffer virtual address */ u8 *cmd_buf; + u32 cmd_buf_head; + u32 cmd_buf_tail; /* event buffer virtual address */ u8 *evt_buf; From 23e967e17c58779b38f69f8d41d727f59440d36a Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 5 Jun 2017 14:52:26 -0500 Subject: [PATCH 17/60] iommu/amd: Reduce delay waiting for command buffer space Currently if there is no room to add a command to the command buffer, the driver performs a "completion wait" which only returns when all commands on the queue have been processed. There is no need to wait for the entire command queue to be executed before adding the next command. Update the driver to perform the same udelay() loop that the "completion wait" performs, but instead re-read the head pointer to determine if sufficient space is available. The very first time it is found that there is no space available, the udelay() will be skipped to immediately perform the opportunistic read of the head pointer. If it is still found that there is not sufficient space, then the udelay() will be performed. Signed-off-by: Leo Duran Signed-off-by: Tom Lendacky Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index d81c895ff4f4..1efbef7f3b61 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1045,7 +1045,7 @@ static int __iommu_queue_command_sync(struct amd_iommu *iommu, struct iommu_cmd *cmd, bool sync) { - bool read_head = true; + unsigned int count = 0; u32 left, next_tail; next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE; @@ -1053,33 +1053,26 @@ again: left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE; if (left <= 0x20) { - struct iommu_cmd sync_cmd; - int ret; + /* Skip udelay() the first time around */ + if (count++) { + if (count == LOOP_TIMEOUT) { + pr_err("AMD-Vi: Command buffer timeout\n"); + return -EIO; + } - if (read_head) { - /* Update head and recheck remaining space */ - iommu->cmd_buf_head = readl(iommu->mmio_base + - MMIO_CMD_HEAD_OFFSET); - read_head = false; - goto again; + udelay(1); } - read_head = true; - - iommu->cmd_sem = 0; - - build_completion_wait(&sync_cmd, (u64)&iommu->cmd_sem); - copy_cmd_to_buffer(iommu, &sync_cmd); - - if ((ret = wait_on_sem(&iommu->cmd_sem)) != 0) - return ret; + /* Update head and recheck remaining space */ + iommu->cmd_buf_head = readl(iommu->mmio_base + + MMIO_CMD_HEAD_OFFSET); goto again; } copy_cmd_to_buffer(iommu, cmd); - /* We need to sync now to make sure all commands are processed */ + /* Do we need to make sure all commands are processed? */ iommu->need_sync = sync; return 0; From 460c26d05bf7357a4d5b41b3d7a97727f5bdffe1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 2 Jun 2017 14:28:01 +0200 Subject: [PATCH 18/60] iommu/amd: Rip out old queue flushing code The queue flushing is pretty inefficient when it flushes the queues for all cpus at once. Further it flushes all domains from all IOMMUs for all CPUs, which is overkill as well. Rip it out to make room for something more efficient. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 143 ++------------------------------------ 1 file changed, 6 insertions(+), 137 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 1efbef7f3b61..ec9da25c06a1 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -89,25 +89,6 @@ LIST_HEAD(ioapic_map); LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); -#define FLUSH_QUEUE_SIZE 256 - -struct flush_queue_entry { - unsigned long iova_pfn; - unsigned long pages; - struct dma_ops_domain *dma_dom; -}; - -struct flush_queue { - spinlock_t lock; - unsigned next; - struct flush_queue_entry *entries; -}; - -static DEFINE_PER_CPU(struct flush_queue, flush_queue); - -static atomic_t queue_timer_on; -static struct timer_list queue_timer; - /* * Domain for untranslated devices - only allocated * if iommu=pt passed on kernel cmd line. @@ -2253,92 +2234,6 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev) * *****************************************************************************/ -static void __queue_flush(struct flush_queue *queue) -{ - struct protection_domain *domain; - unsigned long flags; - int idx; - - /* First flush TLB of all known domains */ - spin_lock_irqsave(&amd_iommu_pd_lock, flags); - list_for_each_entry(domain, &amd_iommu_pd_list, list) - domain_flush_tlb(domain); - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); - - /* Wait until flushes have completed */ - domain_flush_complete(NULL); - - for (idx = 0; idx < queue->next; ++idx) { - struct flush_queue_entry *entry; - - entry = queue->entries + idx; - - free_iova_fast(&entry->dma_dom->iovad, - entry->iova_pfn, - entry->pages); - - /* Not really necessary, just to make sure we catch any bugs */ - entry->dma_dom = NULL; - } - - queue->next = 0; -} - -static void queue_flush_all(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct flush_queue *queue; - unsigned long flags; - - queue = per_cpu_ptr(&flush_queue, cpu); - spin_lock_irqsave(&queue->lock, flags); - if (queue->next > 0) - __queue_flush(queue); - spin_unlock_irqrestore(&queue->lock, flags); - } -} - -static void queue_flush_timeout(unsigned long unsused) -{ - atomic_set(&queue_timer_on, 0); - queue_flush_all(); -} - -static void queue_add(struct dma_ops_domain *dma_dom, - unsigned long address, unsigned long pages) -{ - struct flush_queue_entry *entry; - struct flush_queue *queue; - unsigned long flags; - int idx; - - pages = __roundup_pow_of_two(pages); - address >>= PAGE_SHIFT; - - queue = get_cpu_ptr(&flush_queue); - spin_lock_irqsave(&queue->lock, flags); - - if (queue->next == FLUSH_QUEUE_SIZE) - __queue_flush(queue); - - idx = queue->next++; - entry = queue->entries + idx; - - entry->iova_pfn = address; - entry->pages = pages; - entry->dma_dom = dma_dom; - - spin_unlock_irqrestore(&queue->lock, flags); - - if (atomic_cmpxchg(&queue_timer_on, 0, 1) == 0) - mod_timer(&queue_timer, jiffies + msecs_to_jiffies(10)); - - put_cpu_ptr(&flush_queue); -} - - /* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the @@ -2490,7 +2385,10 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, domain_flush_tlb(&dma_dom->domain); domain_flush_complete(&dma_dom->domain); } else { - queue_add(dma_dom, dma_addr, pages); + /* Keep the if() around, we need it later again */ + dma_ops_free_iova(dma_dom, dma_addr, pages); + domain_flush_tlb(&dma_dom->domain); + domain_flush_complete(&dma_dom->domain); } } @@ -2825,7 +2723,7 @@ static int init_reserved_iova_ranges(void) int __init amd_iommu_init_api(void) { - int ret, cpu, err = 0; + int ret, err = 0; ret = iova_cache_get(); if (ret) @@ -2835,18 +2733,6 @@ int __init amd_iommu_init_api(void) if (ret) return ret; - for_each_possible_cpu(cpu) { - struct flush_queue *queue = per_cpu_ptr(&flush_queue, cpu); - - queue->entries = kzalloc(FLUSH_QUEUE_SIZE * - sizeof(*queue->entries), - GFP_KERNEL); - if (!queue->entries) - goto out_put_iova; - - spin_lock_init(&queue->lock); - } - err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops); if (err) return err; @@ -2858,23 +2744,12 @@ int __init amd_iommu_init_api(void) err = bus_set_iommu(&platform_bus_type, &amd_iommu_ops); if (err) return err; + return 0; - -out_put_iova: - for_each_possible_cpu(cpu) { - struct flush_queue *queue = per_cpu_ptr(&flush_queue, cpu); - - kfree(queue->entries); - } - - return -ENOMEM; } int __init amd_iommu_init_dma_ops(void) { - setup_timer(&queue_timer, queue_flush_timeout, 0); - atomic_set(&queue_timer_on, 0); - swiotlb = iommu_pass_through ? 1 : 0; iommu_detected = 1; @@ -3030,12 +2905,6 @@ static void amd_iommu_domain_free(struct iommu_domain *dom) switch (dom->type) { case IOMMU_DOMAIN_DMA: - /* - * First make sure the domain is no longer referenced from the - * flush queue - */ - queue_flush_all(); - /* Now release the domain */ dma_dom = to_dma_ops_domain(domain); dma_ops_domain_free(dma_dom); From d4241a276119bf404e6c0e23f06f84b84c4ecfc0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 2 Jun 2017 14:55:56 +0200 Subject: [PATCH 19/60] iommu/amd: Add per-domain flush-queue data structures Make the flush-queue per dma-ops domain and add code allocate and free the flush-queues; Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 69 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index ec9da25c06a1..2418fcc28fbe 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -136,6 +136,18 @@ static void update_domain(struct protection_domain *domain); static int protection_domain_init(struct protection_domain *domain); static void detach_device(struct device *dev); +#define FLUSH_QUEUE_SIZE 256 + +struct flush_queue_entry { + unsigned long iova_pfn; + unsigned long pages; +}; + +struct flush_queue { + struct flush_queue_entry *entries; + unsigned head, tail; +}; + /* * Data container for a dma_ops specific protection domain */ @@ -145,6 +157,8 @@ struct dma_ops_domain { /* IOVA RB-Tree */ struct iova_domain iovad; + + struct flush_queue __percpu *flush_queue; }; static struct iova_domain reserved_iova_ranges; @@ -1742,6 +1756,56 @@ static void free_gcr3_table(struct protection_domain *domain) free_page((unsigned long)domain->gcr3_tbl); } +static void dma_ops_domain_free_flush_queue(struct dma_ops_domain *dom) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct flush_queue *queue; + + queue = per_cpu_ptr(dom->flush_queue, cpu); + kfree(queue->entries); + } + + free_percpu(dom->flush_queue); + + dom->flush_queue = NULL; +} + +static int dma_ops_domain_alloc_flush_queue(struct dma_ops_domain *dom) +{ + int cpu; + + dom->flush_queue = alloc_percpu(struct flush_queue); + if (!dom->flush_queue) + return -ENOMEM; + + /* First make sure everything is cleared */ + for_each_possible_cpu(cpu) { + struct flush_queue *queue; + + queue = per_cpu_ptr(dom->flush_queue, cpu); + queue->head = 0; + queue->tail = 0; + queue->entries = NULL; + } + + /* Now start doing the allocation */ + for_each_possible_cpu(cpu) { + struct flush_queue *queue; + + queue = per_cpu_ptr(dom->flush_queue, cpu); + queue->entries = kzalloc(FLUSH_QUEUE_SIZE * sizeof(*queue->entries), + GFP_KERNEL); + if (!queue->entries) { + dma_ops_domain_free_flush_queue(dom); + return -ENOMEM; + } + } + + return 0; +} + /* * Free a domain, only used if something went wrong in the * allocation path and we need to free an already allocated page table @@ -1753,6 +1817,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) del_domain_from_list(&dom->domain); + dma_ops_domain_free_flush_queue(dom); + put_iova_domain(&dom->iovad); free_pagetable(&dom->domain); @@ -1791,6 +1857,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(void) /* Initialize reserved ranges */ copy_reserved_iova(&reserved_iova_ranges, &dma_dom->iovad); + if (dma_ops_domain_alloc_flush_queue(dma_dom)) + goto free_dma_dom; + add_domain_to_list(&dma_dom->domain); return dma_dom; From fd62190a67d6bdf9b93dea056adfcd7fd29b0f92 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 2 Jun 2017 15:37:26 +0200 Subject: [PATCH 20/60] iommu/amd: Make use of the per-domain flush queue Fill the flush-queue on unmap and only flush the IOMMU and device TLBs when a per-cpu queue gets full. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 60 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 2418fcc28fbe..9fafc3026865 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1806,6 +1806,61 @@ static int dma_ops_domain_alloc_flush_queue(struct dma_ops_domain *dom) return 0; } +static inline bool queue_ring_full(struct flush_queue *queue) +{ + return (((queue->tail + 1) % FLUSH_QUEUE_SIZE) == queue->head); +} + +#define queue_ring_for_each(i, q) \ + for (i = (q)->head; i != (q)->tail; i = (i + 1) % FLUSH_QUEUE_SIZE) + +static void queue_release(struct dma_ops_domain *dom, + struct flush_queue *queue) +{ + unsigned i; + + queue_ring_for_each(i, queue) + free_iova_fast(&dom->iovad, + queue->entries[i].iova_pfn, + queue->entries[i].pages); + + queue->head = queue->tail = 0; +} + +static inline unsigned queue_ring_add(struct flush_queue *queue) +{ + unsigned idx = queue->tail; + + queue->tail = (idx + 1) % FLUSH_QUEUE_SIZE; + + return idx; +} + +static void queue_add(struct dma_ops_domain *dom, + unsigned long address, unsigned long pages) +{ + struct flush_queue *queue; + int idx; + + pages = __roundup_pow_of_two(pages); + address >>= PAGE_SHIFT; + + queue = get_cpu_ptr(dom->flush_queue); + + if (queue_ring_full(queue)) { + domain_flush_tlb(&dom->domain); + domain_flush_complete(&dom->domain); + queue_release(dom, queue); + } + + idx = queue_ring_add(queue); + + queue->entries[idx].iova_pfn = address; + queue->entries[idx].pages = pages; + + put_cpu_ptr(dom->flush_queue); +} + /* * Free a domain, only used if something went wrong in the * allocation path and we need to free an already allocated page table @@ -2454,10 +2509,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, domain_flush_tlb(&dma_dom->domain); domain_flush_complete(&dma_dom->domain); } else { - /* Keep the if() around, we need it later again */ - dma_ops_free_iova(dma_dom, dma_addr, pages); - domain_flush_tlb(&dma_dom->domain); - domain_flush_complete(&dma_dom->domain); + queue_add(dma_dom, dma_addr, pages); } } From e241f8e76c152e000d481fc8334d41d22c013fe8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 2 Jun 2017 15:44:57 +0200 Subject: [PATCH 21/60] iommu/amd: Add locking to per-domain flush-queue With locking we can safely access the flush-queues of other cpus. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 9fafc3026865..9a06acc8cc9d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -146,6 +146,7 @@ struct flush_queue_entry { struct flush_queue { struct flush_queue_entry *entries; unsigned head, tail; + spinlock_t lock; }; /* @@ -1801,6 +1802,8 @@ static int dma_ops_domain_alloc_flush_queue(struct dma_ops_domain *dom) dma_ops_domain_free_flush_queue(dom); return -ENOMEM; } + + spin_lock_init(&queue->lock); } return 0; @@ -1808,6 +1811,8 @@ static int dma_ops_domain_alloc_flush_queue(struct dma_ops_domain *dom) static inline bool queue_ring_full(struct flush_queue *queue) { + assert_spin_locked(&queue->lock); + return (((queue->tail + 1) % FLUSH_QUEUE_SIZE) == queue->head); } @@ -1819,6 +1824,8 @@ static void queue_release(struct dma_ops_domain *dom, { unsigned i; + assert_spin_locked(&queue->lock); + queue_ring_for_each(i, queue) free_iova_fast(&dom->iovad, queue->entries[i].iova_pfn, @@ -1831,6 +1838,7 @@ static inline unsigned queue_ring_add(struct flush_queue *queue) { unsigned idx = queue->tail; + assert_spin_locked(&queue->lock); queue->tail = (idx + 1) % FLUSH_QUEUE_SIZE; return idx; @@ -1840,12 +1848,14 @@ static void queue_add(struct dma_ops_domain *dom, unsigned long address, unsigned long pages) { struct flush_queue *queue; + unsigned long flags; int idx; pages = __roundup_pow_of_two(pages); address >>= PAGE_SHIFT; queue = get_cpu_ptr(dom->flush_queue); + spin_lock_irqsave(&queue->lock, flags); if (queue_ring_full(queue)) { domain_flush_tlb(&dom->domain); @@ -1858,6 +1868,7 @@ static void queue_add(struct dma_ops_domain *dom, queue->entries[idx].iova_pfn = address; queue->entries[idx].pages = pages; + spin_unlock_irqrestore(&queue->lock, flags); put_cpu_ptr(dom->flush_queue); } From a6e3f6f030396c0576c729fd8ca4bfb654d35bfe Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 2 Jun 2017 16:01:53 +0200 Subject: [PATCH 22/60] iommu/amd: Add flush counters to struct dma_ops_domain The counters are increased every time the TLB for a given domain is flushed. We also store the current value of that counter into newly added entries of the flush-queue, so that we can tell whether this entry is already flushed. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 52 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 9a06acc8cc9d..795208bd39bd 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -141,6 +141,7 @@ static void detach_device(struct device *dev); struct flush_queue_entry { unsigned long iova_pfn; unsigned long pages; + u64 counter; /* Flush counter when this entry was added to the queue */ }; struct flush_queue { @@ -160,6 +161,27 @@ struct dma_ops_domain { struct iova_domain iovad; struct flush_queue __percpu *flush_queue; + + /* + * We need two counter here to be race-free wrt. IOTLB flushing and + * adding entries to the flush queue. + * + * The flush_start_cnt is incremented _before_ the IOTLB flush starts. + * New entries added to the flush ring-buffer get their 'counter' value + * from here. This way we can make sure that entries added to the queue + * (or other per-cpu queues of the same domain) while the TLB is about + * to be flushed are not considered to be flushed already. + */ + atomic64_t flush_start_cnt; + + /* + * The flush_finish_cnt is incremented when an IOTLB flush is complete. + * This value is always smaller than flush_start_cnt. The queue_add + * function frees all IOVAs that have a counter value smaller than + * flush_finish_cnt. This makes sure that we only free IOVAs that are + * flushed out of the IOTLB of the domain. + */ + atomic64_t flush_finish_cnt; }; static struct iova_domain reserved_iova_ranges; @@ -1777,6 +1799,9 @@ static int dma_ops_domain_alloc_flush_queue(struct dma_ops_domain *dom) { int cpu; + atomic64_set(&dom->flush_start_cnt, 0); + atomic64_set(&dom->flush_finish_cnt, 0); + dom->flush_queue = alloc_percpu(struct flush_queue); if (!dom->flush_queue) return -ENOMEM; @@ -1844,22 +1869,48 @@ static inline unsigned queue_ring_add(struct flush_queue *queue) return idx; } +static inline void queue_ring_remove_head(struct flush_queue *queue) +{ + assert_spin_locked(&queue->lock); + queue->head = (queue->head + 1) % FLUSH_QUEUE_SIZE; +} + static void queue_add(struct dma_ops_domain *dom, unsigned long address, unsigned long pages) { struct flush_queue *queue; unsigned long flags; + u64 counter; int idx; pages = __roundup_pow_of_two(pages); address >>= PAGE_SHIFT; + counter = atomic64_read(&dom->flush_finish_cnt); + queue = get_cpu_ptr(dom->flush_queue); spin_lock_irqsave(&queue->lock, flags); + queue_ring_for_each(idx, queue) { + /* + * This assumes that counter values in the ring-buffer are + * monotonously rising. + */ + if (queue->entries[idx].counter >= counter) + break; + + free_iova_fast(&dom->iovad, + queue->entries[idx].iova_pfn, + queue->entries[idx].pages); + + queue_ring_remove_head(queue); + } + if (queue_ring_full(queue)) { + atomic64_inc(&dom->flush_start_cnt); domain_flush_tlb(&dom->domain); domain_flush_complete(&dom->domain); + atomic64_inc(&dom->flush_finish_cnt); queue_release(dom, queue); } @@ -1867,6 +1918,7 @@ static void queue_add(struct dma_ops_domain *dom, queue->entries[idx].iova_pfn = address; queue->entries[idx].pages = pages; + queue->entries[idx].counter = atomic64_read(&dom->flush_start_cnt); spin_unlock_irqrestore(&queue->lock, flags); put_cpu_ptr(dom->flush_queue); From fca6af6a5976dfa00182232f666b4f789c98bd0c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 2 Jun 2017 18:13:37 +0200 Subject: [PATCH 23/60] iommu/amd: Add per-domain timer to flush per-cpu queues Add a timer to each dma_ops domain so that we flush unused IOTLB entries regularily, even if the queues don't get full all the time. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 84 +++++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 795208bd39bd..00c1796e07bf 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -182,6 +182,13 @@ struct dma_ops_domain { * flushed out of the IOTLB of the domain. */ atomic64_t flush_finish_cnt; + + /* + * Timer to make sure we don't keep IOVAs around unflushed + * for too long + */ + struct timer_list flush_timer; + atomic_t flush_timer_on; }; static struct iova_domain reserved_iova_ranges; @@ -1834,6 +1841,14 @@ static int dma_ops_domain_alloc_flush_queue(struct dma_ops_domain *dom) return 0; } +static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom) +{ + atomic64_inc(&dom->flush_start_cnt); + domain_flush_tlb(&dom->domain); + domain_flush_complete(&dom->domain); + atomic64_inc(&dom->flush_finish_cnt); +} + static inline bool queue_ring_full(struct flush_queue *queue) { assert_spin_locked(&queue->lock); @@ -1875,22 +1890,12 @@ static inline void queue_ring_remove_head(struct flush_queue *queue) queue->head = (queue->head + 1) % FLUSH_QUEUE_SIZE; } -static void queue_add(struct dma_ops_domain *dom, - unsigned long address, unsigned long pages) +static void queue_ring_free_flushed(struct dma_ops_domain *dom, + struct flush_queue *queue) { - struct flush_queue *queue; - unsigned long flags; - u64 counter; + u64 counter = atomic64_read(&dom->flush_finish_cnt); int idx; - pages = __roundup_pow_of_two(pages); - address >>= PAGE_SHIFT; - - counter = atomic64_read(&dom->flush_finish_cnt); - - queue = get_cpu_ptr(dom->flush_queue); - spin_lock_irqsave(&queue->lock, flags); - queue_ring_for_each(idx, queue) { /* * This assumes that counter values in the ring-buffer are @@ -1905,12 +1910,25 @@ static void queue_add(struct dma_ops_domain *dom, queue_ring_remove_head(queue); } +} + +static void queue_add(struct dma_ops_domain *dom, + unsigned long address, unsigned long pages) +{ + struct flush_queue *queue; + unsigned long flags; + int idx; + + pages = __roundup_pow_of_two(pages); + address >>= PAGE_SHIFT; + + queue = get_cpu_ptr(dom->flush_queue); + spin_lock_irqsave(&queue->lock, flags); + + queue_ring_free_flushed(dom, queue); if (queue_ring_full(queue)) { - atomic64_inc(&dom->flush_start_cnt); - domain_flush_tlb(&dom->domain); - domain_flush_complete(&dom->domain); - atomic64_inc(&dom->flush_finish_cnt); + dma_ops_domain_flush_tlb(dom); queue_release(dom, queue); } @@ -1921,9 +1939,33 @@ static void queue_add(struct dma_ops_domain *dom, queue->entries[idx].counter = atomic64_read(&dom->flush_start_cnt); spin_unlock_irqrestore(&queue->lock, flags); + + if (atomic_cmpxchg(&dom->flush_timer_on, 0, 1) == 0) + mod_timer(&dom->flush_timer, jiffies + msecs_to_jiffies(10)); + put_cpu_ptr(dom->flush_queue); } +static void queue_flush_timeout(unsigned long data) +{ + struct dma_ops_domain *dom = (struct dma_ops_domain *)data; + int cpu; + + atomic_set(&dom->flush_timer_on, 0); + + dma_ops_domain_flush_tlb(dom); + + for_each_possible_cpu(cpu) { + struct flush_queue *queue; + unsigned long flags; + + queue = per_cpu_ptr(dom->flush_queue, cpu); + spin_lock_irqsave(&queue->lock, flags); + queue_ring_free_flushed(dom, queue); + spin_unlock_irqrestore(&queue->lock, flags); + } +} + /* * Free a domain, only used if something went wrong in the * allocation path and we need to free an already allocated page table @@ -1935,6 +1977,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) del_domain_from_list(&dom->domain); + if (timer_pending(&dom->flush_timer)) + del_timer(&dom->flush_timer); + dma_ops_domain_free_flush_queue(dom); put_iova_domain(&dom->iovad); @@ -1978,6 +2023,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(void) if (dma_ops_domain_alloc_flush_queue(dma_dom)) goto free_dma_dom; + setup_timer(&dma_dom->flush_timer, queue_flush_timeout, + (unsigned long)dma_dom); + + atomic_set(&dma_dom->flush_timer_on, 0); + add_domain_to_list(&dma_dom->domain); return dma_dom; From ac3b708ad49f77b44faa104057783f161491e9e4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 7 Jun 2017 14:38:15 +0200 Subject: [PATCH 24/60] iommu/amd: Remove queue_release() function We can use queue_ring_free_flushed() instead, so remove this redundancy. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 00c1796e07bf..6498f5baa7ea 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1859,21 +1859,6 @@ static inline bool queue_ring_full(struct flush_queue *queue) #define queue_ring_for_each(i, q) \ for (i = (q)->head; i != (q)->tail; i = (i + 1) % FLUSH_QUEUE_SIZE) -static void queue_release(struct dma_ops_domain *dom, - struct flush_queue *queue) -{ - unsigned i; - - assert_spin_locked(&queue->lock); - - queue_ring_for_each(i, queue) - free_iova_fast(&dom->iovad, - queue->entries[i].iova_pfn, - queue->entries[i].pages); - - queue->head = queue->tail = 0; -} - static inline unsigned queue_ring_add(struct flush_queue *queue) { unsigned idx = queue->tail; @@ -1925,12 +1910,15 @@ static void queue_add(struct dma_ops_domain *dom, queue = get_cpu_ptr(dom->flush_queue); spin_lock_irqsave(&queue->lock, flags); - queue_ring_free_flushed(dom, queue); - - if (queue_ring_full(queue)) { + /* + * When ring-queue is full, flush the entries from the IOTLB so + * that we can free all entries with queue_ring_free_flushed() + * below. + */ + if (queue_ring_full(queue)) dma_ops_domain_flush_tlb(dom); - queue_release(dom, queue); - } + + queue_ring_free_flushed(dom, queue); idx = queue_ring_add(queue); From 54bd63570484167cb13edf81e31fff107b879981 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 15 Jun 2017 10:36:22 +0200 Subject: [PATCH 25/60] iommu/amd: Suppress IO_PAGE_FAULTs in kdump kernel When booting into a kdump kernel, suppress IO_PAGE_FAULTs by default for all devices. But allow the faults again when a domain is assigned to a device. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 3 ++- drivers/iommu/amd_iommu_init.c | 9 +++++++++ drivers/iommu/amd_iommu_types.h | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 6498f5baa7ea..95ee360a5199 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2078,7 +2078,8 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) flags |= tmp; } - flags &= ~(0xffffUL); + + flags &= ~(DTE_FLAG_SA | 0xffffULL); flags |= domain->id; amd_iommu_dev_table[devid].data[1] = flags; diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 3fa7e3b35507..cf7896550e75 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1900,6 +1901,14 @@ static void init_device_table_dma(void) for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { set_dev_entry_bit(devid, DEV_ENTRY_VALID); set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); + /* + * In kdump kernels in-flight DMA from the old kernel might + * cause IO_PAGE_FAULTs. There are no reports that a kdump + * actually failed because of that, so just disable fault + * reporting in the hardware to get rid of the messages + */ + if (is_kdump_kernel()) + set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT); } } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 6960d7db2fab..294a409e283b 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -322,6 +322,7 @@ #define IOMMU_PTE_IW (1ULL << 62) #define DTE_FLAG_IOTLB (1ULL << 32) +#define DTE_FLAG_SA (1ULL << 34) #define DTE_FLAG_GV (1ULL << 55) #define DTE_FLAG_MASK (0x3ffULL << 32) #define DTE_GLX_SHIFT (56) From 111237415393acdef9b4f700c747a2e92c9be62e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 16 Jun 2017 16:09:54 +0200 Subject: [PATCH 26/60] iommu/amd: Disable IOMMUs at boot if they are enabled When booting, make sure the IOMMUs are disabled. They could be previously enabled if we boot into a kexec or kdump kernel. So make sure they are off. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index cf7896550e75..f123fa5a7adb 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -2318,6 +2318,9 @@ static int __init early_amd_iommu_init(void) if (ret) goto out; + /* Disable any previously enabled IOMMUs */ + disable_iommus(); + if (amd_iommu_irq_remap) amd_iommu_irq_remap = check_ioapic_information(); From 90b3eb03e1b2b2ece299c485ae9da0cd221e700d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 16 Jun 2017 16:09:55 +0200 Subject: [PATCH 27/60] iommu/amd: Rename free_on_init_error() The function will also be used to free iommu resources when amd_iommu=off was specified on the kernel command line. So rename the function to reflect that. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index f123fa5a7adb..74f4eea6be77 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -2108,7 +2108,7 @@ static struct syscore_ops amd_iommu_syscore_ops = { .resume = amd_iommu_resume, }; -static void __init free_on_init_error(void) +static void __init free_iommu_resources(void) { kmemleak_free(irq_lookup_table); free_pages((unsigned long)irq_lookup_table, @@ -2536,7 +2536,7 @@ static int __init amd_iommu_init(void) free_dma_resources(); if (!irq_remapping_enabled) { disable_iommus(); - free_on_init_error(); + free_iommu_resources(); } else { struct amd_iommu *iommu; From 1b1e942e344d3d41de187c1275024e915cb15844 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 16 Jun 2017 16:09:56 +0200 Subject: [PATCH 28/60] iommu/amd: Add new init-state IOMMU_CMDLINE_DISABLED This will be used when during initialization we detect that the iommu should be disabled. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 74f4eea6be77..df9ec85271f5 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -237,6 +237,7 @@ enum iommu_init_state { IOMMU_INITIALIZED, IOMMU_NOT_FOUND, IOMMU_INIT_ERROR, + IOMMU_CMDLINE_DISABLED, }; /* Early ioapic and hpet maps from kernel command line */ @@ -2452,6 +2453,7 @@ static int __init state_next(void) break; case IOMMU_NOT_FOUND: case IOMMU_INIT_ERROR: + case IOMMU_CMDLINE_DISABLED: /* Error states => do nothing */ ret = -EINVAL; break; @@ -2469,8 +2471,9 @@ static int __init iommu_go_to_state(enum iommu_init_state state) while (init_state != state) { ret = state_next(); - if (init_state == IOMMU_NOT_FOUND || - init_state == IOMMU_INIT_ERROR) + if (init_state == IOMMU_NOT_FOUND || + init_state == IOMMU_INIT_ERROR || + init_state == IOMMU_CMDLINE_DISABLED) break; } From 151b09031a76ba6b6b83f94953074d6f10aa30b3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 16 Jun 2017 16:09:57 +0200 Subject: [PATCH 29/60] iommu/amd: Check for error states first in iommu_go_to_state() Check if we are in an error state already before calling into state_next(). Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index df9ec85271f5..a6b81a05a0d1 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -2467,14 +2467,14 @@ static int __init state_next(void) static int __init iommu_go_to_state(enum iommu_init_state state) { - int ret = 0; + int ret = -EINVAL; while (init_state != state) { - ret = state_next(); if (init_state == IOMMU_NOT_FOUND || init_state == IOMMU_INIT_ERROR || init_state == IOMMU_CMDLINE_DISABLED) break; + ret = state_next(); } return ret; From f601927136d69be49f0a14ae820b44c02fa591ba Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 16 Jun 2017 16:09:58 +0200 Subject: [PATCH 30/60] iommu/amd: Set global pointers to NULL after freeing them Avoid any tries to double-free these pointers. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index a6b81a05a0d1..8cc507f96f3a 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -2114,18 +2114,22 @@ static void __init free_iommu_resources(void) kmemleak_free(irq_lookup_table); free_pages((unsigned long)irq_lookup_table, get_order(rlookup_table_size)); + irq_lookup_table = NULL; kmem_cache_destroy(amd_iommu_irq_cache); amd_iommu_irq_cache = NULL; free_pages((unsigned long)amd_iommu_rlookup_table, get_order(rlookup_table_size)); + amd_iommu_rlookup_table = NULL; free_pages((unsigned long)amd_iommu_alias_table, get_order(alias_table_size)); + amd_iommu_alias_table = NULL; free_pages((unsigned long)amd_iommu_dev_table, get_order(dev_table_size)); + amd_iommu_dev_table = NULL; free_iommu_all(); @@ -2195,6 +2199,7 @@ static void __init free_dma_resources(void) { free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); + amd_iommu_pd_alloc_bitmap = NULL; free_unity_maps(); } From 7ad820e43330bbf974792c5ab4e2c2355e08d597 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 16 Jun 2017 16:09:59 +0200 Subject: [PATCH 31/60] iommu/amd: Free IOMMU resources when disabled on command line After we made sure that all IOMMUs have been disabled we need to make sure that all resources we allocated are released again. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 8cc507f96f3a..128f9665c326 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -2430,6 +2430,13 @@ static int __init state_next(void) case IOMMU_IVRS_DETECTED: ret = early_amd_iommu_init(); init_state = ret ? IOMMU_INIT_ERROR : IOMMU_ACPI_FINISHED; + if (init_state == IOMMU_ACPI_FINISHED && amd_iommu_disabled) { + pr_info("AMD-Vi: AMD IOMMU disabled on kernel command-line\n"); + free_dma_resources(); + free_iommu_resources(); + init_state = IOMMU_CMDLINE_DISABLED; + ret = -EINVAL; + } break; case IOMMU_ACPI_FINISHED: early_enable_iommus(); From ffa080ebb5405b27b0b84a3a75c9cdf4ed3d28da Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 16 Jun 2017 16:10:00 +0200 Subject: [PATCH 32/60] iommu/amd: Remove amd_iommu_disabled check from amd_iommu_detect() This check needs to happens later now, when all previously enabled IOMMUs have been disabled. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 128f9665c326..5cc597b383c7 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -2578,9 +2578,6 @@ int __init amd_iommu_detect(void) if (no_iommu || (iommu_detected && !gart_iommu_aperture)) return -ENODEV; - if (amd_iommu_disabled) - return -ENODEV; - ret = iommu_go_to_state(IOMMU_IVRS_DETECTED); if (ret) return ret; From 9ce3a72cd7f7e0b9ba1c5952e4461b363824bca9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 22 Jun 2017 12:16:33 +0200 Subject: [PATCH 33/60] iommu/amd: Free already flushed ring-buffer entries before full-check To benefit from IOTLB flushes on other CPUs we have to free the already flushed IOVAs from the ring-buffer before we do the queue_ring_full() check. Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 95ee360a5199..c618c26a3b33 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1910,15 +1910,21 @@ static void queue_add(struct dma_ops_domain *dom, queue = get_cpu_ptr(dom->flush_queue); spin_lock_irqsave(&queue->lock, flags); + /* + * First remove the enries from the ring-buffer that are already + * flushed to make the below queue_ring_full() check less likely + */ + queue_ring_free_flushed(dom, queue); + /* * When ring-queue is full, flush the entries from the IOTLB so * that we can free all entries with queue_ring_free_flushed() * below. */ - if (queue_ring_full(queue)) + if (queue_ring_full(queue)) { dma_ops_domain_flush_tlb(dom); - - queue_ring_free_flushed(dom, queue); + queue_ring_free_flushed(dom, queue); + } idx = queue_ring_add(queue); From b847de4e5087fc8577c38a697d14fd2a5ce93352 Mon Sep 17 00:00:00 2001 From: Sunil Goutham Date: Fri, 5 May 2017 16:47:46 +0530 Subject: [PATCH 34/60] iommu/arm-smmu-v3: Increase CMDQ drain timeout value Waiting for a CMD_SYNC to be processed involves waiting for the command queue to drain, which can take an awful lot longer than waiting for a single entry to become available. Consequently, the common timeout value of 100us has been observed to be too short on some platforms when a CMD_SYNC is issued into a queued full of TLBI commands. This patch resolves the issue by using a different (1s) timeout when waiting for the CMDQ to drain and using a simple back-off mechanism when polling the cons pointer in the absence of WFE support. Signed-off-by: Sunil Goutham [will: rewrote commit message and cosmetic changes] Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-v3.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 380969aa60d5..6a06be7626db 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -408,6 +408,7 @@ /* High-level queue structures */ #define ARM_SMMU_POLL_TIMEOUT_US 100 +#define ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US 1000000 /* 1s! */ #define MSI_IOVA_BASE 0x8000000 #define MSI_IOVA_LENGTH 0x100000 @@ -737,7 +738,13 @@ static void queue_inc_prod(struct arm_smmu_queue *q) */ static int queue_poll_cons(struct arm_smmu_queue *q, bool drain, bool wfe) { - ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US); + ktime_t timeout; + unsigned int delay = 1; + + /* Wait longer if it's queue drain */ + timeout = ktime_add_us(ktime_get(), drain ? + ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US : + ARM_SMMU_POLL_TIMEOUT_US); while (queue_sync_cons(q), (drain ? !queue_empty(q) : queue_full(q))) { if (ktime_compare(ktime_get(), timeout) > 0) @@ -747,7 +754,8 @@ static int queue_poll_cons(struct arm_smmu_queue *q, bool drain, bool wfe) wfe(); } else { cpu_relax(); - udelay(1); + udelay(delay); + delay *= 2; } } From 60ab7a75c8d83049b0e6189b4128247513880b19 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 13 Jun 2017 15:58:30 +0530 Subject: [PATCH 35/60] iommu/io-pgtable-arm-v7s: constify dummy_tlb_ops. File size before: text data bss dec hex filename 6146 56 9 6211 1843 drivers/iommu/io-pgtable-arm-v7s.o File size After adding 'const': text data bss dec hex filename 6170 24 9 6203 183b drivers/iommu/io-pgtable-arm-v7s.o Signed-off-by: Arvind Yadav Signed-off-by: Will Deacon --- drivers/iommu/io-pgtable-arm-v7s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 8d6ca28c3e1f..f8869951610c 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -749,7 +749,7 @@ static void dummy_tlb_sync(void *cookie) WARN_ON(cookie != cfg_cookie); } -static struct iommu_gather_ops dummy_tlb_ops = { +static const struct iommu_gather_ops dummy_tlb_ops = { .tlb_flush_all = dummy_tlb_flush_all, .tlb_add_flush = dummy_tlb_add_flush, .tlb_sync = dummy_tlb_sync, From 84c24379a783c514e5ff7c8fc8a21cf8d64fd05f Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 19 Jun 2017 16:41:56 +0100 Subject: [PATCH 36/60] iommu/arm-smmu: Plumb in new ACPI identifiers Revision C of IORT now allows us to identify ARM MMU-401 and the Cavium ThunderX implementation. Wire them up so that we can probe these models once firmware starts using the new codes in place of generic ones, and so that the appropriate features and quirks get enabled when we do. For the sake of backports and mitigating sychronisation problems with the ACPICA headers, we'll carry a backup copy of the new definitions locally for the short term to make life simpler. CC: stable@vger.kernel.org # 4.10 Acked-by: Robert Richter Tested-by: Robert Richter Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 7ec30b08b3bd..7ecd1a0b8419 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -312,6 +312,14 @@ enum arm_smmu_implementation { CAVIUM_SMMUV2, }; +/* Until ACPICA headers cover IORT rev. C */ +#ifndef ACPI_IORT_SMMU_CORELINK_MMU401 +#define ACPI_IORT_SMMU_CORELINK_MMU401 0x4 +#endif +#ifndef ACPI_IORT_SMMU_CAVIUM_THUNDERX +#define ACPI_IORT_SMMU_CAVIUM_THUNDERX 0x5 +#endif + struct arm_smmu_s2cr { struct iommu_group *group; int count; @@ -2073,6 +2081,10 @@ static int acpi_smmu_get_data(u32 model, struct arm_smmu_device *smmu) smmu->version = ARM_SMMU_V1; smmu->model = GENERIC_SMMU; break; + case ACPI_IORT_SMMU_CORELINK_MMU401: + smmu->version = ARM_SMMU_V1_64K; + smmu->model = GENERIC_SMMU; + break; case ACPI_IORT_SMMU_V2: smmu->version = ARM_SMMU_V2; smmu->model = GENERIC_SMMU; @@ -2081,6 +2093,10 @@ static int acpi_smmu_get_data(u32 model, struct arm_smmu_device *smmu) smmu->version = ARM_SMMU_V2; smmu->model = ARM_MMU500; break; + case ACPI_IORT_SMMU_CAVIUM_THUNDERX: + smmu->version = ARM_SMMU_V2; + smmu->model = CAVIUM_SMMUV2; + break; default: ret = -ENODEV; } From ebdd13c93f8e878afcdba642f48cd1bd85619e2a Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 22 Jun 2017 12:51:00 +0530 Subject: [PATCH 37/60] iommu: arm-smmu-v3: make of_device_ids const of_device_ids are not supposed to change at runtime. All functions working with of_device_ids provided by work with const of_device_ids. So mark the non-const structs as const. Signed-off-by: Arvind Yadav Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-v3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 6a06be7626db..0fd09745822f 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2776,7 +2776,7 @@ static int arm_smmu_device_remove(struct platform_device *pdev) return 0; } -static struct of_device_id arm_smmu_of_match[] = { +static const struct of_device_id arm_smmu_of_match[] = { { .compatible = "arm,smmu-v3", }, { }, }; From 5c2d0218290afa3c335f38583bf4f8e8adad4c76 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 22 Jun 2017 12:57:42 +0530 Subject: [PATCH 38/60] iommu: arm-smmu: Handle return of iommu_device_register. iommu_device_register returns an error code and, although it currently never fails, we should check its return value anyway. Signed-off-by: Arvind Yadav [will: adjusted to follow arm-smmu.c] Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-v3.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 0fd09745822f..029fe0cffee7 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2744,6 +2744,10 @@ static int arm_smmu_device_probe(struct platform_device *pdev) iommu_device_set_fwnode(&smmu->iommu, dev->fwnode); ret = iommu_device_register(&smmu->iommu); + if (ret) { + dev_err(dev, "Failed to register iommu\n"); + return ret; + } #ifdef CONFIG_PCI if (pci_bus_type.iommu_ops != &arm_smmu_ops) { From 9db829d2818501f07583542c05d01513b9161e14 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 22 Jun 2017 16:53:50 +0100 Subject: [PATCH 39/60] iommu/io-pgtable-arm-v7s: Check table PTEs more precisely Whilst we don't support the PXN bit at all, so should never encounter a level 1 section or supersection PTE with it set, it would still be wise to check both table type bits to resolve any theoretical ambiguity. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/iommu/io-pgtable-arm-v7s.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index f8869951610c..46da7aa7c7d0 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -92,7 +92,8 @@ #define ARM_V7S_PTE_TYPE_CONT_PAGE 0x1 #define ARM_V7S_PTE_IS_VALID(pte) (((pte) & 0x3) != 0) -#define ARM_V7S_PTE_IS_TABLE(pte, lvl) (lvl == 1 && ((pte) & ARM_V7S_PTE_TYPE_TABLE)) +#define ARM_V7S_PTE_IS_TABLE(pte, lvl) \ + ((lvl) == 1 && (((pte) & 0x3) == ARM_V7S_PTE_TYPE_TABLE)) /* Page table bits */ #define ARM_V7S_ATTR_XN(lvl) BIT(4 * (2 - (lvl))) From fb3a95795da53d05a4fc5fcdc0d3ec69e7163355 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 22 Jun 2017 16:53:51 +0100 Subject: [PATCH 40/60] iommu/io-pgtable-arm: Improve split_blk_unmap The current split_blk_unmap implementation suffers from some inscrutable pointer trickery for creating the tables to replace the block entry, but more than that it also suffers from hideous inefficiency. For example, the most pathological case of unmapping a level 3 page from a level 1 block will allocate 513 lower-level tables to remap the entire block at page granularity, when only 2 are actually needed (the rest can be covered by level 2 block entries). Also, we would like to be able to relax the spinlock requirement in future, for which the roll-back-and-try-again logic for race resolution would be pretty hideous under the current paradigm. Both issues can be resolved most neatly by turning things sideways: instead of repeatedly recursing into __arm_lpae_map() map to build up an entire new sub-table depth-first, we can directly replace the block entry with a next-level table of block/page entries, then repeat by unmapping at the next level if necessary. With a little refactoring of some helper functions, the code ends up not much bigger than before, but considerably easier to follow and to adapt in future. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/iommu/io-pgtable-arm.c | 118 +++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 49 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 6e5df5e0a3bd..dd7477010291 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -264,19 +264,38 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, unsigned long iova, size_t size, int lvl, arm_lpae_iopte *ptep); +static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, + phys_addr_t paddr, arm_lpae_iopte prot, + int lvl, arm_lpae_iopte *ptep) +{ + arm_lpae_iopte pte = prot; + + if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS) + pte |= ARM_LPAE_PTE_NS; + + if (lvl == ARM_LPAE_MAX_LEVELS - 1) + pte |= ARM_LPAE_PTE_TYPE_PAGE; + else + pte |= ARM_LPAE_PTE_TYPE_BLOCK; + + pte |= ARM_LPAE_PTE_AF | ARM_LPAE_PTE_SH_IS; + pte |= pfn_to_iopte(paddr >> data->pg_shift, data); + + __arm_lpae_set_pte(ptep, pte, &data->iop.cfg); +} + static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, unsigned long iova, phys_addr_t paddr, arm_lpae_iopte prot, int lvl, arm_lpae_iopte *ptep) { - arm_lpae_iopte pte = prot; - struct io_pgtable_cfg *cfg = &data->iop.cfg; + arm_lpae_iopte pte = *ptep; - if (iopte_leaf(*ptep, lvl)) { + if (iopte_leaf(pte, lvl)) { /* We require an unmap first */ WARN_ON(!selftest_running); return -EEXIST; - } else if (iopte_type(*ptep, lvl) == ARM_LPAE_PTE_TYPE_TABLE) { + } else if (iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_TABLE) { /* * We need to unmap and free the old table before * overwriting it with a block entry. @@ -289,21 +308,24 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, return -EINVAL; } - if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) - pte |= ARM_LPAE_PTE_NS; - - if (lvl == ARM_LPAE_MAX_LEVELS - 1) - pte |= ARM_LPAE_PTE_TYPE_PAGE; - else - pte |= ARM_LPAE_PTE_TYPE_BLOCK; - - pte |= ARM_LPAE_PTE_AF | ARM_LPAE_PTE_SH_IS; - pte |= pfn_to_iopte(paddr >> data->pg_shift, data); - - __arm_lpae_set_pte(ptep, pte, cfg); + __arm_lpae_init_pte(data, paddr, prot, lvl, ptep); return 0; } +static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table, + arm_lpae_iopte *ptep, + struct io_pgtable_cfg *cfg) +{ + arm_lpae_iopte new; + + new = __pa(table) | ARM_LPAE_PTE_TYPE_TABLE; + if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) + new |= ARM_LPAE_PTE_NSTABLE; + + __arm_lpae_set_pte(ptep, new, cfg); + return new; +} + static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, phys_addr_t paddr, size_t size, arm_lpae_iopte prot, int lvl, arm_lpae_iopte *ptep) @@ -331,10 +353,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, if (!cptep) return -ENOMEM; - pte = __pa(cptep) | ARM_LPAE_PTE_TYPE_TABLE; - if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) - pte |= ARM_LPAE_PTE_NSTABLE; - __arm_lpae_set_pte(ptep, pte, cfg); + arm_lpae_install_table(cptep, ptep, cfg); } else if (!iopte_leaf(pte, lvl)) { cptep = iopte_deref(pte, data); } else { @@ -452,40 +471,43 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop) static int arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, unsigned long iova, size_t size, - arm_lpae_iopte prot, int lvl, - arm_lpae_iopte *ptep, size_t blk_size) + arm_lpae_iopte blk_pte, int lvl, + arm_lpae_iopte *ptep) { - unsigned long blk_start, blk_end; + struct io_pgtable_cfg *cfg = &data->iop.cfg; + arm_lpae_iopte pte, *tablep; phys_addr_t blk_paddr; - arm_lpae_iopte table = 0; + size_t tablesz = ARM_LPAE_GRANULE(data); + size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data); + int i, unmap_idx = -1; - blk_start = iova & ~(blk_size - 1); - blk_end = blk_start + blk_size; - blk_paddr = iopte_to_pfn(*ptep, data) << data->pg_shift; + if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) + return 0; - for (; blk_start < blk_end; blk_start += size, blk_paddr += size) { - arm_lpae_iopte *tablep; + tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg); + if (!tablep) + return 0; /* Bytes unmapped */ + if (size == split_sz) + unmap_idx = ARM_LPAE_LVL_IDX(iova, lvl, data); + + blk_paddr = iopte_to_pfn(blk_pte, data) << data->pg_shift; + pte = iopte_prot(blk_pte); + + for (i = 0; i < tablesz / sizeof(pte); i++, blk_paddr += split_sz) { /* Unmap! */ - if (blk_start == iova) + if (i == unmap_idx) continue; - /* __arm_lpae_map expects a pointer to the start of the table */ - tablep = &table - ARM_LPAE_LVL_IDX(blk_start, lvl, data); - if (__arm_lpae_map(data, blk_start, blk_paddr, size, prot, lvl, - tablep) < 0) { - if (table) { - /* Free the table we allocated */ - tablep = iopte_deref(table, data); - __arm_lpae_free_pgtable(data, lvl + 1, tablep); - } - return 0; /* Bytes unmapped */ - } + __arm_lpae_init_pte(data, blk_paddr, pte, lvl, &tablep[i]); } - __arm_lpae_set_pte(ptep, table, &data->iop.cfg); - iova &= ~(blk_size - 1); - io_pgtable_tlb_add_flush(&data->iop, iova, blk_size, blk_size, true); + arm_lpae_install_table(tablep, ptep, cfg); + + if (unmap_idx < 0) + return __arm_lpae_unmap(data, iova, size, lvl, tablep); + + io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true); return size; } @@ -495,7 +517,6 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, { arm_lpae_iopte pte; struct io_pgtable *iop = &data->iop; - size_t blk_size = ARM_LPAE_BLOCK_SIZE(lvl, data); /* Something went horribly wrong and we ran out of page table */ if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) @@ -507,7 +528,7 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, return 0; /* If the size matches this level, we're in the right place */ - if (size == blk_size) { + if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { __arm_lpae_set_pte(ptep, 0, &iop->cfg); if (!iopte_leaf(pte, lvl)) { @@ -527,9 +548,8 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, * Insert a table at the next level to map the old region, * minus the part we want to unmap */ - return arm_lpae_split_blk_unmap(data, iova, size, - iopte_prot(pte), lvl, ptep, - blk_size); + return arm_lpae_split_blk_unmap(data, iova, size, pte, + lvl + 1, ptep); } /* Keep on walkin' */ From b9f1ef30ac2e9942b8628d551f4a21e8cec1415c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 22 Jun 2017 16:53:52 +0100 Subject: [PATCH 41/60] iommu/io-pgtable-arm-v7s: Refactor split_blk_unmap Whilst the short-descriptor format's split_blk_unmap implementation has no need to be recursive, it followed the pattern of the LPAE version anyway for the sake of consistency. With the latter now reworked for both efficiency and future scalability improvements, tweak the former similarly, not least to make it less obtuse. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/iommu/io-pgtable-arm-v7s.c | 85 ++++++++++++++++-------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 46da7aa7c7d0..dc74631322e4 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -281,6 +281,13 @@ static arm_v7s_iopte arm_v7s_prot_to_pte(int prot, int lvl, else if (prot & IOMMU_CACHE) pte |= ARM_V7S_ATTR_B | ARM_V7S_ATTR_C; + pte |= ARM_V7S_PTE_TYPE_PAGE; + if (lvl == 1 && (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)) + pte |= ARM_V7S_ATTR_NS_SECTION; + + if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_4GB) + pte |= ARM_V7S_ATTR_MTK_4GB; + return pte; } @@ -353,7 +360,7 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data, int lvl, int num_entries, arm_v7s_iopte *ptep) { struct io_pgtable_cfg *cfg = &data->iop.cfg; - arm_v7s_iopte pte = arm_v7s_prot_to_pte(prot, lvl, cfg); + arm_v7s_iopte pte; int i; for (i = 0; i < num_entries; i++) @@ -375,13 +382,7 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data, return -EEXIST; } - pte |= ARM_V7S_PTE_TYPE_PAGE; - if (lvl == 1 && (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)) - pte |= ARM_V7S_ATTR_NS_SECTION; - - if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_4GB) - pte |= ARM_V7S_ATTR_MTK_4GB; - + pte = arm_v7s_prot_to_pte(prot, lvl, cfg); if (num_entries > 1) pte = arm_v7s_pte_to_cont(pte, lvl); @@ -391,6 +392,20 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data, return 0; } +static arm_v7s_iopte arm_v7s_install_table(arm_v7s_iopte *table, + arm_v7s_iopte *ptep, + struct io_pgtable_cfg *cfg) +{ + arm_v7s_iopte new; + + new = virt_to_phys(table) | ARM_V7S_PTE_TYPE_TABLE; + if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) + new |= ARM_V7S_ATTR_NS_TABLE; + + __arm_v7s_set_pte(ptep, new, 1, cfg); + return new; +} + static int __arm_v7s_map(struct arm_v7s_io_pgtable *data, unsigned long iova, phys_addr_t paddr, size_t size, int prot, int lvl, arm_v7s_iopte *ptep) @@ -418,11 +433,7 @@ static int __arm_v7s_map(struct arm_v7s_io_pgtable *data, unsigned long iova, if (!cptep) return -ENOMEM; - pte = virt_to_phys(cptep) | ARM_V7S_PTE_TYPE_TABLE; - if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) - pte |= ARM_V7S_ATTR_NS_TABLE; - - __arm_v7s_set_pte(ptep, pte, 1, cfg); + arm_v7s_install_table(cptep, ptep, cfg); } else if (ARM_V7S_PTE_IS_TABLE(pte, lvl)) { cptep = iopte_deref(pte, lvl); } else { @@ -503,41 +514,35 @@ static void arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, static int arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, unsigned long iova, size_t size, - arm_v7s_iopte *ptep) + arm_v7s_iopte blk_pte, arm_v7s_iopte *ptep) { - unsigned long blk_start, blk_end, blk_size; - phys_addr_t blk_paddr; - arm_v7s_iopte table = 0; - int prot = arm_v7s_pte_to_prot(*ptep, 1); + struct io_pgtable_cfg *cfg = &data->iop.cfg; + arm_v7s_iopte pte, *tablep; + int i, unmap_idx, num_entries, num_ptes; - blk_size = ARM_V7S_BLOCK_SIZE(1); - blk_start = iova & ARM_V7S_LVL_MASK(1); - blk_end = blk_start + ARM_V7S_BLOCK_SIZE(1); - blk_paddr = *ptep & ARM_V7S_LVL_MASK(1); + tablep = __arm_v7s_alloc_table(2, GFP_ATOMIC, data); + if (!tablep) + return 0; /* Bytes unmapped */ - for (; blk_start < blk_end; blk_start += size, blk_paddr += size) { - arm_v7s_iopte *tablep; + num_ptes = ARM_V7S_PTES_PER_LVL(2); + num_entries = size >> ARM_V7S_LVL_SHIFT(2); + unmap_idx = ARM_V7S_LVL_IDX(iova, 2); + pte = arm_v7s_prot_to_pte(arm_v7s_pte_to_prot(blk_pte, 1), 2, cfg); + if (num_entries > 1) + pte = arm_v7s_pte_to_cont(pte, 2); + + for (i = 0; i < num_ptes; i += num_entries, pte += size) { /* Unmap! */ - if (blk_start == iova) + if (i == unmap_idx) continue; - /* __arm_v7s_map expects a pointer to the start of the table */ - tablep = &table - ARM_V7S_LVL_IDX(blk_start, 1); - if (__arm_v7s_map(data, blk_start, blk_paddr, size, prot, 1, - tablep) < 0) { - if (table) { - /* Free the table we allocated */ - tablep = iopte_deref(table, 1); - __arm_v7s_free_table(tablep, 2, data); - } - return 0; /* Bytes unmapped */ - } + __arm_v7s_set_pte(&tablep[i], pte, num_entries, cfg); } - __arm_v7s_set_pte(ptep, table, 1, &data->iop.cfg); - iova &= ~(blk_size - 1); - io_pgtable_tlb_add_flush(&data->iop, iova, blk_size, blk_size, true); + arm_v7s_install_table(tablep, ptep, cfg); + + io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true); return size; } @@ -594,7 +599,7 @@ static int __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, * Insert a table at the next level to map the old region, * minus the part we want to unmap */ - return arm_v7s_split_blk_unmap(data, iova, size, ptep); + return arm_v7s_split_blk_unmap(data, iova, size, pte[0], ptep); } /* Keep on walkin' */ From 81b3c25218447c65f93adf08b099a322b6803536 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 22 Jun 2017 16:53:53 +0100 Subject: [PATCH 42/60] iommu/io-pgtable: Introduce explicit coherency Once we remove the serialising spinlock, a potential race opens up for non-coherent IOMMUs whereby a caller of .map() can be sure that cache maintenance has been performed on their new PTE, but will have no guarantee that such maintenance for table entries above it has actually completed (e.g. if another CPU took an interrupt immediately after writing the table entry, but before initiating the DMA sync). Handling this race safely will add some potentially non-trivial overhead to installing a table entry, which we would much rather avoid on coherent systems where it will be unnecessary, and where we are stirivng to minimise latency by removing the locking in the first place. To that end, let's introduce an explicit notion of cache-coherency to io-pgtable, such that we will be able to avoid penalising IOMMUs which know enough to know when they are coherent. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-v3.c | 3 +++ drivers/iommu/arm-smmu.c | 3 +++ drivers/iommu/io-pgtable-arm-v7s.c | 17 ++++++++++------- drivers/iommu/io-pgtable-arm.c | 11 ++++++----- drivers/iommu/io-pgtable.h | 6 ++++++ 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 029fe0cffee7..d50c8d4b9af9 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1563,6 +1563,9 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) .iommu_dev = smmu->dev, }; + if (smmu->features & ARM_SMMU_FEAT_COHERENCY) + pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA; + pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); if (!pgtbl_ops) return -ENOMEM; diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 7ecd1a0b8419..2bc5df8a490f 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1018,6 +1018,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, .iommu_dev = smmu->dev, }; + if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) + pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA; + smmu_domain->smmu = smmu; pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); if (!pgtbl_ops) { diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index dc74631322e4..ec024c75a09e 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -187,7 +187,8 @@ static arm_v7s_iopte *iopte_deref(arm_v7s_iopte pte, int lvl) static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp, struct arm_v7s_io_pgtable *data) { - struct device *dev = data->iop.cfg.iommu_dev; + struct io_pgtable_cfg *cfg = &data->iop.cfg; + struct device *dev = cfg->iommu_dev; dma_addr_t dma; size_t size = ARM_V7S_TABLE_SIZE(lvl); void *table = NULL; @@ -196,7 +197,7 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp, table = (void *)__get_dma_pages(__GFP_ZERO, get_order(size)); else if (lvl == 2) table = kmem_cache_zalloc(data->l2_tables, gfp | GFP_DMA); - if (table && !selftest_running) { + if (table && !(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) { dma = dma_map_single(dev, table, size, DMA_TO_DEVICE); if (dma_mapping_error(dev, dma)) goto out_free; @@ -225,10 +226,11 @@ out_free: static void __arm_v7s_free_table(void *table, int lvl, struct arm_v7s_io_pgtable *data) { - struct device *dev = data->iop.cfg.iommu_dev; + struct io_pgtable_cfg *cfg = &data->iop.cfg; + struct device *dev = cfg->iommu_dev; size_t size = ARM_V7S_TABLE_SIZE(lvl); - if (!selftest_running) + if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) dma_unmap_single(dev, __arm_v7s_dma_addr(table), size, DMA_TO_DEVICE); if (lvl == 1) @@ -240,7 +242,7 @@ static void __arm_v7s_free_table(void *table, int lvl, static void __arm_v7s_pte_sync(arm_v7s_iopte *ptep, int num_entries, struct io_pgtable_cfg *cfg) { - if (selftest_running) + if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) return; dma_sync_single_for_device(cfg->iommu_dev, __arm_v7s_dma_addr(ptep), @@ -657,7 +659,8 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_PERMS | IO_PGTABLE_QUIRK_TLBI_ON_MAP | - IO_PGTABLE_QUIRK_ARM_MTK_4GB)) + IO_PGTABLE_QUIRK_ARM_MTK_4GB | + IO_PGTABLE_QUIRK_NO_DMA)) return NULL; /* If ARM_MTK_4GB is enabled, the NO_PERMS is also expected. */ @@ -774,7 +777,7 @@ static int __init arm_v7s_do_selftests(void) .tlb = &dummy_tlb_ops, .oas = 32, .ias = 32, - .quirks = IO_PGTABLE_QUIRK_ARM_NS, + .quirks = IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA, .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M, }; unsigned int iova, size, iova_start; diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index dd7477010291..6334f51912ea 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -217,7 +217,7 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp, if (!pages) return NULL; - if (!selftest_running) { + if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) { dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE); if (dma_mapping_error(dev, dma)) goto out_free; @@ -243,7 +243,7 @@ out_free: static void __arm_lpae_free_pages(void *pages, size_t size, struct io_pgtable_cfg *cfg) { - if (!selftest_running) + if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) dma_unmap_single(cfg->iommu_dev, __arm_lpae_dma_addr(pages), size, DMA_TO_DEVICE); free_pages_exact(pages, size); @@ -254,7 +254,7 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte, { *ptep = pte; - if (!selftest_running) + if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) dma_sync_single_for_device(cfg->iommu_dev, __arm_lpae_dma_addr(ptep), sizeof(pte), DMA_TO_DEVICE); @@ -693,7 +693,7 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) u64 reg; struct arm_lpae_io_pgtable *data; - if (cfg->quirks & ~IO_PGTABLE_QUIRK_ARM_NS) + if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA)) return NULL; data = arm_lpae_alloc_pgtable(cfg); @@ -782,7 +782,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) struct arm_lpae_io_pgtable *data; /* The NS quirk doesn't apply at stage 2 */ - if (cfg->quirks) + if (cfg->quirks & ~IO_PGTABLE_QUIRK_NO_DMA) return NULL; data = arm_lpae_alloc_pgtable(cfg); @@ -1086,6 +1086,7 @@ static int __init arm_lpae_do_selftests(void) struct io_pgtable_cfg cfg = { .tlb = &dummy_tlb_ops, .oas = 48, + .quirks = IO_PGTABLE_QUIRK_NO_DMA, }; for (i = 0; i < ARRAY_SIZE(pgsize); ++i) { diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h index 969d82cc92ca..524263a7ae6f 100644 --- a/drivers/iommu/io-pgtable.h +++ b/drivers/iommu/io-pgtable.h @@ -65,11 +65,17 @@ struct io_pgtable_cfg { * PTEs, for Mediatek IOMMUs which treat it as a 33rd address bit * when the SoC is in "4GB mode" and they can only access the high * remap of DRAM (0x1_00000000 to 0x1_ffffffff). + * + * IO_PGTABLE_QUIRK_NO_DMA: Guarantees that the tables will only ever + * be accessed by a fully cache-coherent IOMMU or CPU (e.g. for a + * software-emulated IOMMU), such that pagetable updates need not + * be treated as explicit DMA data. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) #define IO_PGTABLE_QUIRK_TLBI_ON_MAP BIT(2) #define IO_PGTABLE_QUIRK_ARM_MTK_4GB BIT(3) + #define IO_PGTABLE_QUIRK_NO_DMA BIT(4) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; From 2c3d273eabe8b1ed3b3cffe2c79643b1bf7e2d4a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 22 Jun 2017 16:53:54 +0100 Subject: [PATCH 43/60] iommu/io-pgtable-arm: Support lockless operation For parallel I/O with multiple concurrent threads servicing the same device (or devices, if several share a domain), serialising page table updates becomes a massive bottleneck. On reflection, though, we don't strictly need to do that - for valid IOMMU API usage, there are in fact only two races that we need to guard against: multiple map requests for different blocks within the same region, when the intermediate-level table for that region does not yet exist; and multiple unmaps of different parts of the same block entry. Both of those are fairly easily solved by using a cmpxchg to install the new table, such that if we then find that someone else's table got there first, we can simply free ours and continue. Make the requisite changes such that we can withstand being called without the caller maintaining a lock. In theory, this opens up a few corners in which wildly misbehaving callers making nonsensical overlapping requests might lead to crashes instead of just unpredictable results, but correct code really does not deserve to pay a significant performance cost for the sake of masking bugs in theoretical broken code. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/iommu/io-pgtable-arm.c | 72 +++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 6334f51912ea..52700fa958c2 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -20,6 +20,7 @@ #define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt +#include #include #include #include @@ -99,6 +100,8 @@ #define ARM_LPAE_PTE_ATTR_HI_MASK (((arm_lpae_iopte)6) << 52) #define ARM_LPAE_PTE_ATTR_MASK (ARM_LPAE_PTE_ATTR_LO_MASK | \ ARM_LPAE_PTE_ATTR_HI_MASK) +/* Software bit for solving coherency races */ +#define ARM_LPAE_PTE_SW_SYNC (((arm_lpae_iopte)1) << 55) /* Stage-1 PTE */ #define ARM_LPAE_PTE_AP_UNPRIV (((arm_lpae_iopte)1) << 6) @@ -249,15 +252,20 @@ static void __arm_lpae_free_pages(void *pages, size_t size, free_pages_exact(pages, size); } +static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, + struct io_pgtable_cfg *cfg) +{ + dma_sync_single_for_device(cfg->iommu_dev, __arm_lpae_dma_addr(ptep), + sizeof(*ptep), DMA_TO_DEVICE); +} + static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte, struct io_pgtable_cfg *cfg) { *ptep = pte; if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) - dma_sync_single_for_device(cfg->iommu_dev, - __arm_lpae_dma_addr(ptep), - sizeof(pte), DMA_TO_DEVICE); + __arm_lpae_sync_pte(ptep, cfg); } static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, @@ -314,16 +322,30 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table, arm_lpae_iopte *ptep, + arm_lpae_iopte curr, struct io_pgtable_cfg *cfg) { - arm_lpae_iopte new; + arm_lpae_iopte old, new; new = __pa(table) | ARM_LPAE_PTE_TYPE_TABLE; if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) new |= ARM_LPAE_PTE_NSTABLE; - __arm_lpae_set_pte(ptep, new, cfg); - return new; + /* Ensure the table itself is visible before its PTE can be */ + wmb(); + + old = cmpxchg64_relaxed(ptep, curr, new); + + if ((cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) || + (old & ARM_LPAE_PTE_SW_SYNC)) + return old; + + /* Even if it's not ours, there's no point waiting; just kick it */ + __arm_lpae_sync_pte(ptep, cfg); + if (old == curr) + WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC); + + return old; } static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, @@ -332,6 +354,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, { arm_lpae_iopte *cptep, pte; size_t block_size = ARM_LPAE_BLOCK_SIZE(lvl, data); + size_t tblsz = ARM_LPAE_GRANULE(data); struct io_pgtable_cfg *cfg = &data->iop.cfg; /* Find our entry at the current level */ @@ -346,17 +369,23 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, return -EINVAL; /* Grab a pointer to the next level */ - pte = *ptep; + pte = READ_ONCE(*ptep); if (!pte) { - cptep = __arm_lpae_alloc_pages(ARM_LPAE_GRANULE(data), - GFP_ATOMIC, cfg); + cptep = __arm_lpae_alloc_pages(tblsz, GFP_ATOMIC, cfg); if (!cptep) return -ENOMEM; - arm_lpae_install_table(cptep, ptep, cfg); - } else if (!iopte_leaf(pte, lvl)) { + pte = arm_lpae_install_table(cptep, ptep, 0, cfg); + if (pte) + __arm_lpae_free_pages(cptep, tblsz, cfg); + } else if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) && + !(pte & ARM_LPAE_PTE_SW_SYNC)) { + __arm_lpae_sync_pte(ptep, cfg); + } + + if (pte && !iopte_leaf(pte, lvl)) { cptep = iopte_deref(pte, data); - } else { + } else if (pte) { /* We require an unmap first */ WARN_ON(!selftest_running); return -EEXIST; @@ -502,7 +531,19 @@ static int arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, __arm_lpae_init_pte(data, blk_paddr, pte, lvl, &tablep[i]); } - arm_lpae_install_table(tablep, ptep, cfg); + pte = arm_lpae_install_table(tablep, ptep, blk_pte, cfg); + if (pte != blk_pte) { + __arm_lpae_free_pages(tablep, tablesz, cfg); + /* + * We may race against someone unmapping another part of this + * block, but anything else is invalid. We can't misinterpret + * a page entry here since we're never at the last level. + */ + if (iopte_type(pte, lvl - 1) != ARM_LPAE_PTE_TYPE_TABLE) + return 0; + + tablep = iopte_deref(pte, data); + } if (unmap_idx < 0) return __arm_lpae_unmap(data, iova, size, lvl, tablep); @@ -523,7 +564,7 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, return 0; ptep += ARM_LPAE_LVL_IDX(iova, lvl, data); - pte = *ptep; + pte = READ_ONCE(*ptep); if (WARN_ON(!pte)) return 0; @@ -585,7 +626,8 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, return 0; /* Grab the IOPTE we're interested in */ - pte = *(ptep + ARM_LPAE_LVL_IDX(iova, lvl, data)); + ptep += ARM_LPAE_LVL_IDX(iova, lvl, data); + pte = READ_ONCE(*ptep); /* Valid entry? */ if (!pte) From 119ff305b02793dc31eb1e921b85925ce10dc0a1 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 22 Jun 2017 16:53:55 +0100 Subject: [PATCH 44/60] iommu/io-pgtable-arm-v7s: Support lockless operation Mirroring the LPAE implementation, rework the v7s code to be robust against concurrent operations. The same two potential races exist, and are solved in the same manner, with the fixed 2-level structure making life ever so slightly simpler. What complicates matters compared to LPAE, however, is large page entries, since we can't update a block of 16 PTEs atomically, nor assume available software bits to do clever things with. As most users are never likely to do partial unmaps anyway (due to DMA API rules), it doesn't seem unreasonable for this case to remain behind a serialising lock; we just pull said lock down into the bowels of the implementation so it's well out of the way of the normal call paths. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/iommu/io-pgtable-arm-v7s.c | 84 ++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index ec024c75a09e..690629457565 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -32,6 +32,7 @@ #define pr_fmt(fmt) "arm-v7s io-pgtable: " fmt +#include #include #include #include @@ -39,6 +40,7 @@ #include #include #include +#include #include #include @@ -168,6 +170,7 @@ struct arm_v7s_io_pgtable { arm_v7s_iopte *pgd; struct kmem_cache *l2_tables; + spinlock_t split_lock; }; static dma_addr_t __arm_v7s_dma_addr(void *pages) @@ -396,16 +399,22 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data, static arm_v7s_iopte arm_v7s_install_table(arm_v7s_iopte *table, arm_v7s_iopte *ptep, + arm_v7s_iopte curr, struct io_pgtable_cfg *cfg) { - arm_v7s_iopte new; + arm_v7s_iopte old, new; new = virt_to_phys(table) | ARM_V7S_PTE_TYPE_TABLE; if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) new |= ARM_V7S_ATTR_NS_TABLE; - __arm_v7s_set_pte(ptep, new, 1, cfg); - return new; + /* Ensure the table itself is visible before its PTE can be */ + wmb(); + + old = cmpxchg_relaxed(ptep, curr, new); + __arm_v7s_pte_sync(ptep, 1, cfg); + + return old; } static int __arm_v7s_map(struct arm_v7s_io_pgtable *data, unsigned long iova, @@ -429,16 +438,23 @@ static int __arm_v7s_map(struct arm_v7s_io_pgtable *data, unsigned long iova, return -EINVAL; /* Grab a pointer to the next level */ - pte = *ptep; + pte = READ_ONCE(*ptep); if (!pte) { cptep = __arm_v7s_alloc_table(lvl + 1, GFP_ATOMIC, data); if (!cptep) return -ENOMEM; - arm_v7s_install_table(cptep, ptep, cfg); - } else if (ARM_V7S_PTE_IS_TABLE(pte, lvl)) { - cptep = iopte_deref(pte, lvl); + pte = arm_v7s_install_table(cptep, ptep, 0, cfg); + if (pte) + __arm_v7s_free_table(cptep, lvl + 1, data); } else { + /* We've no easy way of knowing if it's synced yet, so... */ + __arm_v7s_pte_sync(ptep, 1, cfg); + } + + if (ARM_V7S_PTE_IS_TABLE(pte, lvl)) { + cptep = iopte_deref(pte, lvl); + } else if (pte) { /* We require an unmap first */ WARN_ON(!selftest_running); return -EEXIST; @@ -491,27 +507,31 @@ static void arm_v7s_free_pgtable(struct io_pgtable *iop) kfree(data); } -static void arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, - unsigned long iova, int idx, int lvl, - arm_v7s_iopte *ptep) +static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, + unsigned long iova, int idx, int lvl, + arm_v7s_iopte *ptep) { struct io_pgtable *iop = &data->iop; arm_v7s_iopte pte; size_t size = ARM_V7S_BLOCK_SIZE(lvl); int i; + /* Check that we didn't lose a race to get the lock */ + pte = *ptep; + if (!arm_v7s_pte_is_cont(pte, lvl)) + return pte; + ptep -= idx & (ARM_V7S_CONT_PAGES - 1); - pte = arm_v7s_cont_to_pte(*ptep, lvl); - for (i = 0; i < ARM_V7S_CONT_PAGES; i++) { - ptep[i] = pte; - pte += size; - } + pte = arm_v7s_cont_to_pte(pte, lvl); + for (i = 0; i < ARM_V7S_CONT_PAGES; i++) + ptep[i] = pte + i * size; __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg); size *= ARM_V7S_CONT_PAGES; io_pgtable_tlb_add_flush(iop, iova, size, size, true); io_pgtable_tlb_sync(iop); + return pte; } static int arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, @@ -542,7 +562,16 @@ static int arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, __arm_v7s_set_pte(&tablep[i], pte, num_entries, cfg); } - arm_v7s_install_table(tablep, ptep, cfg); + pte = arm_v7s_install_table(tablep, ptep, blk_pte, cfg); + if (pte != blk_pte) { + __arm_v7s_free_table(tablep, 2, data); + + if (!ARM_V7S_PTE_IS_TABLE(pte, 1)) + return 0; + + tablep = iopte_deref(pte, 1); + return __arm_v7s_unmap(data, iova, size, 2, tablep); + } io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true); return size; @@ -563,17 +592,28 @@ static int __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, idx = ARM_V7S_LVL_IDX(iova, lvl); ptep += idx; do { - if (WARN_ON(!ARM_V7S_PTE_IS_VALID(ptep[i]))) + pte[i] = READ_ONCE(ptep[i]); + if (WARN_ON(!ARM_V7S_PTE_IS_VALID(pte[i]))) return 0; - pte[i] = ptep[i]; } while (++i < num_entries); /* * If we've hit a contiguous 'large page' entry at this level, it * needs splitting first, unless we're unmapping the whole lot. + * + * For splitting, we can't rewrite 16 PTEs atomically, and since we + * can't necessarily assume TEX remap we don't have a software bit to + * mark live entries being split. In practice (i.e. DMA API code), we + * will never be splitting large pages anyway, so just wrap this edge + * case in a lock for the sake of correctness and be done with it. */ - if (num_entries <= 1 && arm_v7s_pte_is_cont(pte[0], lvl)) - arm_v7s_split_cont(data, iova, idx, lvl, ptep); + if (num_entries <= 1 && arm_v7s_pte_is_cont(pte[0], lvl)) { + unsigned long flags; + + spin_lock_irqsave(&data->split_lock, flags); + pte[0] = arm_v7s_split_cont(data, iova, idx, lvl, ptep); + spin_unlock_irqrestore(&data->split_lock, flags); + } /* If the size matches this level, we're in the right place */ if (num_entries) { @@ -631,7 +671,8 @@ static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops, u32 mask; do { - pte = ptep[ARM_V7S_LVL_IDX(iova, ++lvl)]; + ptep += ARM_V7S_LVL_IDX(iova, ++lvl); + pte = READ_ONCE(*ptep); ptep = iopte_deref(pte, lvl); } while (ARM_V7S_PTE_IS_TABLE(pte, lvl)); @@ -672,6 +713,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, if (!data) return NULL; + spin_lock_init(&data->split_lock); data->l2_tables = kmem_cache_create("io-pgtable_armv7s_l2", ARM_V7S_TABLE_SIZE(2), ARM_V7S_TABLE_SIZE(2), From 523d7423e21bfe78452a640878d1de189ac45d91 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 22 Jun 2017 16:53:56 +0100 Subject: [PATCH 45/60] iommu/arm-smmu: Remove io-pgtable spinlock With the io-pgtable code now robust against (valid) races, we no longer need to serialise all operations with a lock. This might make broken callers who issue concurrent operations on overlapping addresses go even more wrong than before, but hey, they already had little hope of useful or deterministic results. We do however still have to keep a lock around to serialise the ATS1* translation ops, as parallel iova_to_phys() calls could lead to unpredictable hardware behaviour otherwise. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu.c | 45 +++++++++++++--------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 2bc5df8a490f..bc89b4d6c043 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -433,10 +433,10 @@ enum arm_smmu_domain_stage { struct arm_smmu_domain { struct arm_smmu_device *smmu; struct io_pgtable_ops *pgtbl_ops; - spinlock_t pgtbl_lock; struct arm_smmu_cfg cfg; enum arm_smmu_domain_stage stage; struct mutex init_mutex; /* Protects smmu pointer */ + spinlock_t cb_lock; /* Serialises ATS1* ops */ struct iommu_domain domain; }; @@ -1113,7 +1113,7 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) } mutex_init(&smmu_domain->init_mutex); - spin_lock_init(&smmu_domain->pgtbl_lock); + spin_lock_init(&smmu_domain->cb_lock); return &smmu_domain->domain; } @@ -1391,35 +1391,23 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { - int ret; - unsigned long flags; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct io_pgtable_ops *ops= smmu_domain->pgtbl_ops; + struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; if (!ops) return -ENODEV; - spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags); - ret = ops->map(ops, iova, paddr, size, prot); - spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags); - return ret; + return ops->map(ops, iova, paddr, size, prot); } static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { - size_t ret; - unsigned long flags; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct io_pgtable_ops *ops= smmu_domain->pgtbl_ops; + struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; if (!ops) return 0; - spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags); - ret = ops->unmap(ops, iova, size); - spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags); - return ret; + return ops->unmap(ops, iova, size); } static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain, @@ -1433,10 +1421,11 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain, void __iomem *cb_base; u32 tmp; u64 phys; - unsigned long va; + unsigned long va, flags; cb_base = ARM_SMMU_CB(smmu, cfg->cbndx); + spin_lock_irqsave(&smmu_domain->cb_lock, flags); /* ATS1 registers can only be written atomically */ va = iova & ~0xfffUL; if (smmu->version == ARM_SMMU_V2) @@ -1446,6 +1435,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain, if (readl_poll_timeout_atomic(cb_base + ARM_SMMU_CB_ATSR, tmp, !(tmp & ATSR_ACTIVE), 5, 50)) { + spin_unlock_irqrestore(&smmu_domain->cb_lock, flags); dev_err(dev, "iova to phys timed out on %pad. Falling back to software table walk.\n", &iova); @@ -1453,6 +1443,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain, } phys = readq_relaxed(cb_base + ARM_SMMU_CB_PAR); + spin_unlock_irqrestore(&smmu_domain->cb_lock, flags); if (phys & CB_PAR_F) { dev_err(dev, "translation fault!\n"); dev_err(dev, "PAR = 0x%llx\n", phys); @@ -1465,10 +1456,8 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain, static phys_addr_t arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { - phys_addr_t ret; - unsigned long flags; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct io_pgtable_ops *ops= smmu_domain->pgtbl_ops; + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; if (domain->type == IOMMU_DOMAIN_IDENTITY) return iova; @@ -1476,17 +1465,11 @@ static phys_addr_t arm_smmu_iova_to_phys(struct iommu_domain *domain, if (!ops) return 0; - spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags); if (smmu_domain->smmu->features & ARM_SMMU_FEAT_TRANS_OPS && - smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { - ret = arm_smmu_iova_to_phys_hard(domain, iova); - } else { - ret = ops->iova_to_phys(ops, iova); - } + smmu_domain->stage == ARM_SMMU_DOMAIN_S1) + return arm_smmu_iova_to_phys_hard(domain, iova); - spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags); - - return ret; + return ops->iova_to_phys(ops, iova); } static bool arm_smmu_capable(enum iommu_cap cap) From 58188afeb727e0f73706f1460707bd3ba6ccc221 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 22 Jun 2017 16:53:57 +0100 Subject: [PATCH 46/60] iommu/arm-smmu-v3: Remove io-pgtable spinlock As for SMMUv2, take advantage of io-pgtable's newfound tolerance for concurrency. Unfortunately in this case the command queue lock remains a point of serialisation for the unmap path, but there may be a little more we can do to ameliorate that in future. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-v3.c | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index d50c8d4b9af9..884b1a49a52a 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -646,7 +646,6 @@ struct arm_smmu_domain { struct mutex init_mutex; /* Protects smmu pointer */ struct io_pgtable_ops *pgtbl_ops; - spinlock_t pgtbl_lock; enum arm_smmu_domain_stage stage; union { @@ -1414,7 +1413,6 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) } mutex_init(&smmu_domain->init_mutex); - spin_lock_init(&smmu_domain->pgtbl_lock); return &smmu_domain->domain; } @@ -1686,44 +1684,29 @@ out_unlock: static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { - int ret; - unsigned long flags; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; if (!ops) return -ENODEV; - spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags); - ret = ops->map(ops, iova, paddr, size, prot); - spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags); - return ret; + return ops->map(ops, iova, paddr, size, prot); } static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { - size_t ret; - unsigned long flags; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; if (!ops) return 0; - spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags); - ret = ops->unmap(ops, iova, size); - spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags); - return ret; + return ops->unmap(ops, iova, size); } static phys_addr_t arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { - phys_addr_t ret; - unsigned long flags; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; if (domain->type == IOMMU_DOMAIN_IDENTITY) return iova; @@ -1731,11 +1714,7 @@ arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) if (!ops) return 0; - spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags); - ret = ops->iova_to_phys(ops, iova); - spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags); - - return ret; + return ops->iova_to_phys(ops, iova); } static struct platform_driver arm_smmu_driver; From c1004803b40596c1aabbbc78a6b1b33e4dfd96c6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 23 Jun 2017 11:45:57 +0100 Subject: [PATCH 47/60] iommu/io-pgtable: depend on !GENERIC_ATOMIC64 when using COMPILE_TEST with LPAE The LPAE/ARMv8 page table format relies on the ability to read and write 64-bit page table entries in an atomic fashion. With the move to a lockless implementation, we also need support for cmpxchg64 to resolve races when installing table entries concurrently. Unfortunately, not all architectures support cmpxchg64, so the code can fail to compiler when building for these architectures using COMPILE_TEST. Rather than disable COMPILE_TEST altogether, instead check that GENERIC_ATOMIC64 is not selected, which is a reasonable indication that the architecture has support for 64-bit cmpxchg. Reported-by: kbuild test robot Signed-off-by: Will Deacon --- drivers/iommu/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 6ee3a25ae731..c88cfa7522b2 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -23,7 +23,7 @@ config IOMMU_IO_PGTABLE config IOMMU_IO_PGTABLE_LPAE bool "ARMv7/v8 Long Descriptor Format" select IOMMU_IO_PGTABLE - depends on HAS_DMA && (ARM || ARM64 || COMPILE_TEST) + depends on HAS_DMA && (ARM || ARM64 || (COMPILE_TEST && !GENERIC_ATOMIC64)) help Enable support for the ARM long descriptor pagetable format. This allocator supports 4K/2M/1G, 16K/32M and 64K/512M page From 77f3445866c39d8866b31d8d9fa47c7c20938e05 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 23 Jun 2017 12:02:38 +0100 Subject: [PATCH 48/60] iommu/io-pgtable-arm: Use dma_wmb() instead of wmb() when publishing table When writing a new table entry, we must ensure that the contents of the table is made visible to the SMMU page table walker before the updated table entry itself. This is currently achieved using wmb(), which expands to an expensive and unnecessary DSB instruction. Ideally, we'd just use cmpxchg64_release when writing the table entry, but this doesn't have memory ordering semantics on !SMP systems. Instead, use dma_wmb(), which emits DMB OSHST. Strictly speaking, this does more than we require (since it targets the outer-shareable domain), but it's likely to be significantly faster than the DSB approach. Reported-by: Linu Cherian Suggested-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/iommu/io-pgtable-arm-v7s.c | 8 ++++++-- drivers/iommu/io-pgtable-arm.c | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 690629457565..af330f513653 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -408,8 +408,12 @@ static arm_v7s_iopte arm_v7s_install_table(arm_v7s_iopte *table, if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) new |= ARM_V7S_ATTR_NS_TABLE; - /* Ensure the table itself is visible before its PTE can be */ - wmb(); + /* + * Ensure the table itself is visible before its PTE can be. + * Whilst we could get away with cmpxchg64_release below, this + * doesn't have any ordering semantics when !CONFIG_SMP. + */ + dma_wmb(); old = cmpxchg_relaxed(ptep, curr, new); __arm_v7s_pte_sync(ptep, 1, cfg); diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 52700fa958c2..b182039862c5 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -331,8 +331,12 @@ static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table, if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) new |= ARM_LPAE_PTE_NSTABLE; - /* Ensure the table itself is visible before its PTE can be */ - wmb(); + /* + * Ensure the table itself is visible before its PTE can be. + * Whilst we could get away with cmpxchg64_release below, this + * doesn't have any ordering semantics when !CONFIG_SMP. + */ + dma_wmb(); old = cmpxchg64_relaxed(ptep, curr, new); From 12275bf0a4deb690a5dc9903d207060737b7bae6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 22 Jun 2017 21:20:54 +0200 Subject: [PATCH 49/60] iommu/arm-smmu-v3, acpi: Add temporary Cavium SMMU-V3 IORT model number definitions The model number is already defined in acpica and we are actually waiting for the acpi maintainers to include it: https://github.com/acpica/acpica/commit/d00a4eb86e64 Adding those temporary definitions until the change makes it into include/acpi/actbl2.h. Once that is done this patch can be reverted. Acked-by: Lorenzo Pieralisi Signed-off-by: Robert Richter Signed-off-by: Will Deacon --- drivers/acpi/arm64/iort.c | 5 +++++ drivers/iommu/arm-smmu-v3.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index c5fecf97ee2f..da225710b009 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -31,6 +31,11 @@ #define IORT_IOMMU_TYPE ((1 << ACPI_IORT_NODE_SMMU) | \ (1 << ACPI_IORT_NODE_SMMU_V3)) +/* Until ACPICA headers cover IORT rev. C */ +#ifndef ACPI_IORT_SMMU_V3_CAVIUM_CN99XX +#define ACPI_IORT_SMMU_V3_CAVIUM_CN99XX 0x2 +#endif + struct iort_its_msi_chip { struct list_head list; struct fwnode_handle *fw_node; diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 884b1a49a52a..da481d53e09b 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -413,6 +413,11 @@ #define MSI_IOVA_BASE 0x8000000 #define MSI_IOVA_LENGTH 0x100000 +/* Until ACPICA headers cover IORT rev. C */ +#ifndef ACPI_IORT_SMMU_V3_CAVIUM_CN99XX +#define ACPI_IORT_SMMU_V3_CAVIUM_CN99XX 0x2 +#endif + static bool disable_bypass; module_param_named(disable_bypass, disable_bypass, bool, S_IRUGO); MODULE_PARM_DESC(disable_bypass, From 403e8c7c5bcaff3291a2c7012fe80f707a854d10 Mon Sep 17 00:00:00 2001 From: Linu Cherian Date: Thu, 22 Jun 2017 17:35:36 +0530 Subject: [PATCH 50/60] ACPI/IORT: Fixup SMMUv3 resource size for Cavium ThunderX2 SMMUv3 model Cavium ThunderX2 implementation doesn't support second page in SMMU register space. Hence, resource size is set as 64k for this model. Signed-off-by: Linu Cherian Signed-off-by: Geetha Sowjanya Signed-off-by: Will Deacon --- drivers/acpi/arm64/iort.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index da225710b009..a8ebda9f7e97 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -833,6 +833,18 @@ static int __init arm_smmu_v3_count_resources(struct acpi_iort_node *node) return num_res; } +static unsigned long arm_smmu_v3_resource_size(struct acpi_iort_smmu_v3 *smmu) +{ + /* + * Override the size, for Cavium ThunderX2 implementation + * which doesn't support the page 1 SMMU register space. + */ + if (smmu->model == ACPI_IORT_SMMU_V3_CAVIUM_CN99XX) + return SZ_64K; + + return SZ_128K; +} + static void __init arm_smmu_v3_init_resources(struct resource *res, struct acpi_iort_node *node) { @@ -843,7 +855,8 @@ static void __init arm_smmu_v3_init_resources(struct resource *res, smmu = (struct acpi_iort_smmu_v3 *)node->node_data; res[num_res].start = smmu->base_address; - res[num_res].end = smmu->base_address + SZ_128K - 1; + res[num_res].end = smmu->base_address + + arm_smmu_v3_resource_size(smmu) - 1; res[num_res].flags = IORESOURCE_MEM; num_res++; From e5b829de053d9994dfc8652ce558e90e3406c578 Mon Sep 17 00:00:00 2001 From: Linu Cherian Date: Thu, 22 Jun 2017 17:35:37 +0530 Subject: [PATCH 51/60] iommu/arm-smmu-v3: Add workaround for Cavium ThunderX2 erratum #74 Cavium ThunderX2 SMMU implementation doesn't support page 1 register space and PAGE0_REGS_ONLY option is enabled as an errata workaround. This option when turned on, replaces all page 1 offsets used for EVTQ_PROD/CONS, PRIQ_PROD/CONS register access with page 0 offsets. SMMU resource size checks are now based on SMMU option PAGE0_REGS_ONLY, since resource size can be either 64k/128k. For this, arm_smmu_device_dt_probe/acpi_probe has been moved before platform_get_resource call, so that SMMU options are set beforehand. Signed-off-by: Linu Cherian Signed-off-by: Geetha Sowjanya Signed-off-by: Will Deacon --- Documentation/arm64/silicon-errata.txt | 1 + .../devicetree/bindings/iommu/arm,smmu-v3.txt | 6 ++ drivers/iommu/arm-smmu-v3.c | 68 ++++++++++++++----- 3 files changed, 57 insertions(+), 18 deletions(-) diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 10f2dddbf449..4693a328947a 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -62,6 +62,7 @@ stable kernels. | Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | | Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | | Cavium | ThunderX SMMUv2 | #27704 | N/A | +| Cavium | ThunderX2 SMMUv3| #74 | N/A | | | | | | | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | | | | | | diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt b/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt index be57550e14e4..e7855cf6038e 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt +++ b/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt @@ -49,6 +49,12 @@ the PCIe specification. - hisilicon,broken-prefetch-cmd : Avoid sending CMD_PREFETCH_* commands to the SMMU. +- cavium,cn9900-broken-page1-regspace + : Replaces all page 1 offsets used for EVTQ_PROD/CONS, + PRIQ_PROD/CONS register access with page 0 offsets. + Set for Cavium ThunderX2 silicon that doesn't support + SMMU page1 register space. + ** Example smmu@2b400000 { diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index da481d53e09b..2d5b48b4260a 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -603,6 +603,7 @@ struct arm_smmu_device { u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) +#define ARM_SMMU_OPT_PAGE0_REGS_ONLY (1 << 1) u32 options; struct arm_smmu_cmdq cmdq; @@ -668,9 +669,20 @@ struct arm_smmu_option_prop { static struct arm_smmu_option_prop arm_smmu_options[] = { { ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" }, + { ARM_SMMU_OPT_PAGE0_REGS_ONLY, "cavium,cn9900-broken-page1-regspace"}, { 0, NULL}, }; +static inline void __iomem *arm_smmu_page1_fixup(unsigned long offset, + struct arm_smmu_device *smmu) +{ + if ((offset > SZ_64K) && + (smmu->options & ARM_SMMU_OPT_PAGE0_REGS_ONLY)) + offset -= SZ_64K; + + return smmu->base + offset; +} + static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) { return container_of(dom, struct arm_smmu_domain, domain); @@ -1956,8 +1968,8 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, return -ENOMEM; } - q->prod_reg = smmu->base + prod_off; - q->cons_reg = smmu->base + cons_off; + q->prod_reg = arm_smmu_page1_fixup(prod_off, smmu); + q->cons_reg = arm_smmu_page1_fixup(cons_off, smmu); q->ent_dwords = dwords; q->q_base = Q_BASE_RWA; @@ -2358,8 +2370,10 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) /* Event queue */ writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE); - writel_relaxed(smmu->evtq.q.prod, smmu->base + ARM_SMMU_EVTQ_PROD); - writel_relaxed(smmu->evtq.q.cons, smmu->base + ARM_SMMU_EVTQ_CONS); + writel_relaxed(smmu->evtq.q.prod, + arm_smmu_page1_fixup(ARM_SMMU_EVTQ_PROD, smmu)); + writel_relaxed(smmu->evtq.q.cons, + arm_smmu_page1_fixup(ARM_SMMU_EVTQ_CONS, smmu)); enables |= CR0_EVTQEN; ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0, @@ -2374,9 +2388,9 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) writeq_relaxed(smmu->priq.q.q_base, smmu->base + ARM_SMMU_PRIQ_BASE); writel_relaxed(smmu->priq.q.prod, - smmu->base + ARM_SMMU_PRIQ_PROD); + arm_smmu_page1_fixup(ARM_SMMU_PRIQ_PROD, smmu)); writel_relaxed(smmu->priq.q.cons, - smmu->base + ARM_SMMU_PRIQ_CONS); + arm_smmu_page1_fixup(ARM_SMMU_PRIQ_CONS, smmu)); enables |= CR0_PRIQEN; ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0, @@ -2600,6 +2614,14 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) } #ifdef CONFIG_ACPI +static void acpi_smmu_get_options(u32 model, struct arm_smmu_device *smmu) +{ + if (model == ACPI_IORT_SMMU_V3_CAVIUM_CN99XX) + smmu->options |= ARM_SMMU_OPT_PAGE0_REGS_ONLY; + + dev_notice(smmu->dev, "option mask 0x%x\n", smmu->options); +} + static int arm_smmu_device_acpi_probe(struct platform_device *pdev, struct arm_smmu_device *smmu) { @@ -2612,6 +2634,8 @@ static int arm_smmu_device_acpi_probe(struct platform_device *pdev, /* Retrieve SMMUv3 specific data */ iort_smmu = (struct acpi_iort_smmu_v3 *)node->node_data; + acpi_smmu_get_options(iort_smmu->model, smmu); + if (iort_smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE) smmu->features |= ARM_SMMU_FEAT_COHERENCY; @@ -2647,6 +2671,14 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev, return ret; } +static unsigned long arm_smmu_resource_size(struct arm_smmu_device *smmu) +{ + if (smmu->options & ARM_SMMU_OPT_PAGE0_REGS_ONLY) + return SZ_64K; + else + return SZ_128K; +} + static int arm_smmu_device_probe(struct platform_device *pdev) { int irq, ret; @@ -2663,9 +2695,20 @@ static int arm_smmu_device_probe(struct platform_device *pdev) } smmu->dev = dev; + if (dev->of_node) { + ret = arm_smmu_device_dt_probe(pdev, smmu); + } else { + ret = arm_smmu_device_acpi_probe(pdev, smmu); + if (ret == -ENODEV) + return ret; + } + + /* Set bypass mode according to firmware probing result */ + bypass = !!ret; + /* Base address */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (resource_size(res) + 1 < SZ_128K) { + if (resource_size(res) + 1 < arm_smmu_resource_size(smmu)) { dev_err(dev, "MMIO region too small (%pr)\n", res); return -EINVAL; } @@ -2692,17 +2735,6 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (irq > 0) smmu->gerr_irq = irq; - if (dev->of_node) { - ret = arm_smmu_device_dt_probe(pdev, smmu); - } else { - ret = arm_smmu_device_acpi_probe(pdev, smmu); - if (ret == -ENODEV) - return ret; - } - - /* Set bypass mode according to firmware probing result */ - bypass = !!ret; - /* Probe the h/w */ ret = arm_smmu_device_hw_probe(smmu); if (ret) From 99caf177f6fd3e67575f6ce05b36e8e041bcef60 Mon Sep 17 00:00:00 2001 From: shameer Date: Wed, 17 May 2017 10:12:05 +0100 Subject: [PATCH 52/60] iommu/arm-smmu-v3: Enable ACPI based HiSilicon CMD_PREFETCH quirk(erratum 161010701) HiSilicon SMMUv3 on Hip06/Hip07 platforms doesn't support CMD_PREFETCH command. The dt based support for this quirk is already present in the driver(hisilicon,broken-prefetch-cmd). This adds ACPI support for the quirk using the IORT smmu model number. Signed-off-by: shameer Signed-off-by: hanjun [will: rewrote patch] Signed-off-by: Will Deacon --- Documentation/arm64/silicon-errata.txt | 1 + drivers/iommu/arm-smmu-v3.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 4693a328947a..ef4e43590685 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -67,6 +67,7 @@ stable kernels. | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | | | | | | | Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 | +| Hisilicon | Hip0{6,7} | #161010701 | N/A | | | | | | | Qualcomm Tech. | Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | | Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 | diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 2d5b48b4260a..81fc1b5c91ee 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -414,6 +414,10 @@ #define MSI_IOVA_LENGTH 0x100000 /* Until ACPICA headers cover IORT rev. C */ +#ifndef ACPI_IORT_SMMU_HISILICON_HI161X +#define ACPI_IORT_SMMU_HISILICON_HI161X 0x1 +#endif + #ifndef ACPI_IORT_SMMU_V3_CAVIUM_CN99XX #define ACPI_IORT_SMMU_V3_CAVIUM_CN99XX 0x2 #endif @@ -2616,8 +2620,14 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) #ifdef CONFIG_ACPI static void acpi_smmu_get_options(u32 model, struct arm_smmu_device *smmu) { - if (model == ACPI_IORT_SMMU_V3_CAVIUM_CN99XX) + switch (model) { + case ACPI_IORT_SMMU_V3_CAVIUM_CN99XX: smmu->options |= ARM_SMMU_OPT_PAGE0_REGS_ONLY; + break; + case ACPI_IORT_SMMU_HISILICON_HI161X: + smmu->options |= ARM_SMMU_OPT_SKIP_PREFETCH; + break; + } dev_notice(smmu->dev, "option mask 0x%x\n", smmu->options); } From f935448acf462c26142e8b04f1c8829b28d3b9d8 Mon Sep 17 00:00:00 2001 From: Geetha Sowjanya Date: Fri, 23 Jun 2017 19:04:36 +0530 Subject: [PATCH 53/60] iommu/arm-smmu-v3: Add workaround for Cavium ThunderX2 erratum #126 Cavium ThunderX2 SMMU doesn't support MSI and also doesn't have unique irq lines for gerror, eventq and cmdq-sync. New named irq "combined" is set as a errata workaround, which allows to share the irq line by register single irq handler for all the interrupts. Acked-by: Lorenzo Pieralisi Signed-off-by: Geetha sowjanya [will: reworked irq equality checking and added SPI check] Signed-off-by: Will Deacon --- Documentation/arm64/silicon-errata.txt | 1 + .../devicetree/bindings/iommu/arm,smmu-v3.txt | 6 ++ drivers/acpi/arm64/iort.c | 57 +++++++---- drivers/iommu/arm-smmu-v3.c | 94 ++++++++++++++----- 4 files changed, 118 insertions(+), 40 deletions(-) diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index ef4e43590685..856479525776 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -63,6 +63,7 @@ stable kernels. | Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | | Cavium | ThunderX SMMUv2 | #27704 | N/A | | Cavium | ThunderX2 SMMUv3| #74 | N/A | +| Cavium | ThunderX2 SMMUv3| #126 | N/A | | | | | | | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | | | | | | diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt b/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt index e7855cf6038e..c9abbf3e4f68 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt +++ b/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt @@ -26,6 +26,12 @@ the PCIe specification. * "priq" - PRI Queue not empty * "cmdq-sync" - CMD_SYNC complete * "gerror" - Global Error activated + * "combined" - The combined interrupt is optional, + and should only be provided if the + hardware supports just a single, + combined interrupt line. + If provided, then the combined interrupt + will be used in preference to any others. - #iommu-cells : See the generic IOMMU binding described in devicetree/bindings/pci/pci-iommu.txt diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index a8ebda9f7e97..83d65d96676f 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -833,6 +833,24 @@ static int __init arm_smmu_v3_count_resources(struct acpi_iort_node *node) return num_res; } +static bool arm_smmu_v3_is_combined_irq(struct acpi_iort_smmu_v3 *smmu) +{ + /* + * Cavium ThunderX2 implementation doesn't not support unique + * irq line. Use single irq line for all the SMMUv3 interrupts. + */ + if (smmu->model != ACPI_IORT_SMMU_V3_CAVIUM_CN99XX) + return false; + + /* + * ThunderX2 doesn't support MSIs from the SMMU, so we're checking + * SPI numbers here. + */ + return smmu->event_gsiv == smmu->pri_gsiv && + smmu->event_gsiv == smmu->gerr_gsiv && + smmu->event_gsiv == smmu->sync_gsiv; +} + static unsigned long arm_smmu_v3_resource_size(struct acpi_iort_smmu_v3 *smmu) { /* @@ -860,26 +878,33 @@ static void __init arm_smmu_v3_init_resources(struct resource *res, res[num_res].flags = IORESOURCE_MEM; num_res++; + if (arm_smmu_v3_is_combined_irq(smmu)) { + if (smmu->event_gsiv) + acpi_iort_register_irq(smmu->event_gsiv, "combined", + ACPI_EDGE_SENSITIVE, + &res[num_res++]); + } else { - if (smmu->event_gsiv) - acpi_iort_register_irq(smmu->event_gsiv, "eventq", - ACPI_EDGE_SENSITIVE, - &res[num_res++]); + if (smmu->event_gsiv) + acpi_iort_register_irq(smmu->event_gsiv, "eventq", + ACPI_EDGE_SENSITIVE, + &res[num_res++]); - if (smmu->pri_gsiv) - acpi_iort_register_irq(smmu->pri_gsiv, "priq", - ACPI_EDGE_SENSITIVE, - &res[num_res++]); + if (smmu->pri_gsiv) + acpi_iort_register_irq(smmu->pri_gsiv, "priq", + ACPI_EDGE_SENSITIVE, + &res[num_res++]); - if (smmu->gerr_gsiv) - acpi_iort_register_irq(smmu->gerr_gsiv, "gerror", - ACPI_EDGE_SENSITIVE, - &res[num_res++]); + if (smmu->gerr_gsiv) + acpi_iort_register_irq(smmu->gerr_gsiv, "gerror", + ACPI_EDGE_SENSITIVE, + &res[num_res++]); - if (smmu->sync_gsiv) - acpi_iort_register_irq(smmu->sync_gsiv, "cmdq-sync", - ACPI_EDGE_SENSITIVE, - &res[num_res++]); + if (smmu->sync_gsiv) + acpi_iort_register_irq(smmu->sync_gsiv, "cmdq-sync", + ACPI_EDGE_SENSITIVE, + &res[num_res++]); + } } static bool __init arm_smmu_v3_is_coherent(struct acpi_iort_node *node) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 81fc1b5c91ee..568c400eeaed 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -615,6 +615,7 @@ struct arm_smmu_device { struct arm_smmu_priq priq; int gerr_irq; + int combined_irq; unsigned long ias; /* IPA */ unsigned long oas; /* PA */ @@ -1330,6 +1331,24 @@ static irqreturn_t arm_smmu_gerror_handler(int irq, void *dev) return IRQ_HANDLED; } +static irqreturn_t arm_smmu_combined_irq_thread(int irq, void *dev) +{ + struct arm_smmu_device *smmu = dev; + + arm_smmu_evtq_thread(irq, dev); + if (smmu->features & ARM_SMMU_FEAT_PRI) + arm_smmu_priq_thread(irq, dev); + + return IRQ_HANDLED; +} + +static irqreturn_t arm_smmu_combined_irq_handler(int irq, void *dev) +{ + arm_smmu_gerror_handler(irq, dev); + arm_smmu_cmdq_sync_handler(irq, dev); + return IRQ_WAKE_THREAD; +} + /* IO_PGTABLE API */ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu) { @@ -2229,18 +2248,9 @@ static void arm_smmu_setup_msis(struct arm_smmu_device *smmu) devm_add_action(dev, arm_smmu_free_msis, dev); } -static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu) +static void arm_smmu_setup_unique_irqs(struct arm_smmu_device *smmu) { - int ret, irq; - u32 irqen_flags = IRQ_CTRL_EVTQ_IRQEN | IRQ_CTRL_GERROR_IRQEN; - - /* Disable IRQs first */ - ret = arm_smmu_write_reg_sync(smmu, 0, ARM_SMMU_IRQ_CTRL, - ARM_SMMU_IRQ_CTRLACK); - if (ret) { - dev_err(smmu->dev, "failed to disable irqs\n"); - return ret; - } + int irq, ret; arm_smmu_setup_msis(smmu); @@ -2283,10 +2293,41 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu) if (ret < 0) dev_warn(smmu->dev, "failed to enable priq irq\n"); - else - irqen_flags |= IRQ_CTRL_PRIQ_IRQEN; } } +} + +static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu) +{ + int ret, irq; + u32 irqen_flags = IRQ_CTRL_EVTQ_IRQEN | IRQ_CTRL_GERROR_IRQEN; + + /* Disable IRQs first */ + ret = arm_smmu_write_reg_sync(smmu, 0, ARM_SMMU_IRQ_CTRL, + ARM_SMMU_IRQ_CTRLACK); + if (ret) { + dev_err(smmu->dev, "failed to disable irqs\n"); + return ret; + } + + irq = smmu->combined_irq; + if (irq) { + /* + * Cavium ThunderX2 implementation doesn't not support unique + * irq lines. Use single irq line for all the SMMUv3 interrupts. + */ + ret = devm_request_threaded_irq(smmu->dev, irq, + arm_smmu_combined_irq_handler, + arm_smmu_combined_irq_thread, + IRQF_ONESHOT, + "arm-smmu-v3-combined-irq", smmu); + if (ret < 0) + dev_warn(smmu->dev, "failed to enable combined irq\n"); + } else + arm_smmu_setup_unique_irqs(smmu); + + if (smmu->features & ARM_SMMU_FEAT_PRI) + irqen_flags |= IRQ_CTRL_PRIQ_IRQEN; /* Enable interrupt generation on the SMMU */ ret = arm_smmu_write_reg_sync(smmu, irqen_flags, @@ -2729,22 +2770,27 @@ static int arm_smmu_device_probe(struct platform_device *pdev) return PTR_ERR(smmu->base); /* Interrupt lines */ - irq = platform_get_irq_byname(pdev, "eventq"); - if (irq > 0) - smmu->evtq.q.irq = irq; - irq = platform_get_irq_byname(pdev, "priq"); + irq = platform_get_irq_byname(pdev, "combined"); if (irq > 0) - smmu->priq.q.irq = irq; + smmu->combined_irq = irq; + else { + irq = platform_get_irq_byname(pdev, "eventq"); + if (irq > 0) + smmu->evtq.q.irq = irq; - irq = platform_get_irq_byname(pdev, "cmdq-sync"); - if (irq > 0) - smmu->cmdq.q.irq = irq; + irq = platform_get_irq_byname(pdev, "priq"); + if (irq > 0) + smmu->priq.q.irq = irq; - irq = platform_get_irq_byname(pdev, "gerror"); - if (irq > 0) - smmu->gerr_irq = irq; + irq = platform_get_irq_byname(pdev, "cmdq-sync"); + if (irq > 0) + smmu->cmdq.q.irq = irq; + irq = platform_get_irq_byname(pdev, "gerror"); + if (irq > 0) + smmu->gerr_irq = irq; + } /* Probe the h/w */ ret = arm_smmu_device_hw_probe(smmu); if (ret) From aaffaa8a3b5950c47e5f7573c34bc47de8894a18 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 27 Jun 2017 18:16:47 +0200 Subject: [PATCH 54/60] iommu/iova: Don't disable preempt around this_cpu_ptr() Commit 583248e6620a ("iommu/iova: Disable preemption around use of this_cpu_ptr()") disables preemption while accessing a per-CPU variable. This does keep lockdep quiet. However I don't see the point why it is bad if we get migrated after its access to another CPU. __iova_rcache_insert() and __iova_rcache_get() immediately locks the variable after obtaining it - before accessing its members. _If_ we get migrated away after retrieving the address of cpu_rcache before taking the lock then the *other* task on the same CPU will retrieve the same address of cpu_rcache and will spin on the lock. alloc_iova_fast() disables preemption while invoking free_cpu_cached_iovas() on each CPU. The function itself uses per_cpu_ptr() which does not trigger a warning (like this_cpu_ptr() does). It _could_ make sense to use get_online_cpus() instead but the we have a hotplug notifier for CPU down (and none for up) so we are good. Cc: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: Andrew Morton Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Joerg Roedel --- drivers/iommu/iova.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 5c88ba70e4e0..f0ff0aa04081 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -22,6 +22,7 @@ #include #include #include +#include static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn, @@ -398,10 +399,8 @@ retry: /* Try replenishing IOVAs by flushing rcache. */ flushed_rcache = true; - preempt_disable(); for_each_online_cpu(cpu) free_cpu_cached_iovas(cpu, iovad); - preempt_enable(); goto retry; } @@ -729,7 +728,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, bool can_insert = false; unsigned long flags; - cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches); + cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches); spin_lock_irqsave(&cpu_rcache->lock, flags); if (!iova_magazine_full(cpu_rcache->loaded)) { @@ -759,7 +758,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, iova_magazine_push(cpu_rcache->loaded, iova_pfn); spin_unlock_irqrestore(&cpu_rcache->lock, flags); - put_cpu_ptr(rcache->cpu_rcaches); if (mag_to_free) { iova_magazine_free_pfns(mag_to_free, iovad); @@ -793,7 +791,7 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache, bool has_pfn = false; unsigned long flags; - cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches); + cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches); spin_lock_irqsave(&cpu_rcache->lock, flags); if (!iova_magazine_empty(cpu_rcache->loaded)) { @@ -815,7 +813,6 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache, iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn); spin_unlock_irqrestore(&cpu_rcache->lock, flags); - put_cpu_ptr(rcache->cpu_rcaches); return iova_pfn; } From 58c4a95f90839624b67f67acdb8a129f4383b569 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 27 Jun 2017 18:16:48 +0200 Subject: [PATCH 55/60] iommu/vt-d: Don't disable preemption while accessing deferred_flush() get_cpu() disables preemption and returns the current CPU number. The CPU number is only used once while retrieving the address of the local's CPU deferred_flush pointer. We can instead use raw_cpu_ptr() while we remain preemptible. The worst thing that can happen is that flush_unmaps_timeout() is invoked multiple times: once by taskA after seeing HIGH_WATER_MARK and then preempted to another CPU and then by taskB which saw HIGH_WATER_MARK on the same CPU as taskA. It is also likely that ->size got from HIGH_WATER_MARK to 0 right after its read because another CPU invoked flush_unmaps_timeout() for this CPU. The access to flush_data is protected by a spinlock so even if we get migrated to another CPU or preempted - the data structure is protected. While at it, I marked deferred_flush static since I can't find a reference to it outside of this file. Cc: David Woodhouse Cc: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: Andrew Morton Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ee9c258d3ae0..0ca985b418ec 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -481,7 +481,7 @@ struct deferred_flush_data { struct deferred_flush_table *tables; }; -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush); +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush); /* bitmap for indexing intel_iommus */ static int g_num_of_iommus; @@ -3710,10 +3710,8 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn, struct intel_iommu *iommu; struct deferred_flush_entry *entry; struct deferred_flush_data *flush_data; - unsigned int cpuid; - cpuid = get_cpu(); - flush_data = per_cpu_ptr(&deferred_flush, cpuid); + flush_data = raw_cpu_ptr(&deferred_flush); /* Flush all CPUs' entries to avoid deferring too much. If * this becomes a bottleneck, can just flush us, and rely on @@ -3746,8 +3744,6 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn, } flush_data->size++; spin_unlock_irqrestore(&flush_data->lock, flags); - - put_cpu(); } static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) From 0929deca40bbdd7e821aada2906ac67882cfbf28 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 15 Jun 2017 15:11:51 +0200 Subject: [PATCH 56/60] iommu/s390: Use iommu_group_get_for_dev() in s390_iommu_add_device() The iommu_group_get_for_dev() function also attaches the device to its group, so this code doesn't need to be in the iommu driver. Further by using this function the driver can make use of default domains in the future. Reviewed-by: Gerald Schaefer Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 179e636a4d91..8788640756a7 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -165,20 +165,14 @@ static void s390_iommu_detach_device(struct iommu_domain *domain, static int s390_iommu_add_device(struct device *dev) { - struct iommu_group *group; - int rc; + struct iommu_group *group = iommu_group_get_for_dev(dev); - group = iommu_group_get(dev); - if (!group) { - group = iommu_group_alloc(); - if (IS_ERR(group)) - return PTR_ERR(group); - } + if (IS_ERR(group)) + return PTR_ERR(group); - rc = iommu_group_add_device(group, dev); iommu_group_put(group); - return rc; + return 0; } static void s390_iommu_remove_device(struct device *dev) @@ -344,6 +338,7 @@ static struct iommu_ops s390_iommu_ops = { .iova_to_phys = s390_iommu_iova_to_phys, .add_device = s390_iommu_add_device, .remove_device = s390_iommu_remove_device, + .device_group = generic_device_group, .pgsize_bitmap = S390_IOMMU_PGSIZES, }; From 7f7a2304aabc4a8102bbbbeed2ec9eaee4a480c2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 28 Jun 2017 12:45:31 +0200 Subject: [PATCH 57/60] iommu: Return ERR_PTR() values from device_group call-backs The generic device_group call-backs in iommu.c return NULL in case of error. Since they are getting ERR_PTR values from iommu_group_alloc(), just pass them up instead. Reported-by: Gerald Schaefer Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index cf7ca7e70777..de09e1e35830 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -915,13 +915,7 @@ static int get_pci_alias_or_group(struct pci_dev *pdev, u16 alias, void *opaque) */ struct iommu_group *generic_device_group(struct device *dev) { - struct iommu_group *group; - - group = iommu_group_alloc(); - if (IS_ERR(group)) - return NULL; - - return group; + return iommu_group_alloc(); } /* @@ -988,11 +982,7 @@ struct iommu_group *pci_device_group(struct device *dev) return group; /* No shared group found, allocate new */ - group = iommu_group_alloc(); - if (IS_ERR(group)) - return NULL; - - return group; + return iommu_group_alloc(); } /** From 8faf5e5a12c511410de1590cf310ec331c5ec7b1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 28 Jun 2017 12:50:16 +0200 Subject: [PATCH 58/60] iommu/omap: Return ERR_PTR in device_group call-back Make sure that the device_group callback returns an ERR_PTR instead of NULL. Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 95dfca36ccb9..641e035cf866 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1309,7 +1309,7 @@ static void omap_iommu_remove_device(struct device *dev) static struct iommu_group *omap_iommu_device_group(struct device *dev) { struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; - struct iommu_group *group = NULL; + struct iommu_group *group = ERR_PTR(-EINVAL); if (arch_data->iommu_dev) group = arch_data->iommu_dev->group; From 72dcac633475a5b331cf21f3525467d0e123395a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 28 Jun 2017 12:52:48 +0200 Subject: [PATCH 59/60] iommu: Warn once when device_group callback returns NULL This callback should never return NULL. Print a warning if that happens so that we notice and can fix it. Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index de09e1e35830..3f6ea160afed 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1010,6 +1010,9 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) if (ops && ops->device_group) group = ops->device_group(dev); + if (WARN_ON_ONCE(group == NULL)) + return ERR_PTR(-EINVAL); + if (IS_ERR(group)) return group; From 01e1932a1748e20b69ba23d0a01db5eb3a525782 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 28 Jun 2017 16:39:32 +0530 Subject: [PATCH 60/60] iommu/vt-d: Constify intel_dma_ops Most dma_map_ops structures are never modified. Constify these structures such that these can be write-protected. Signed-off-by: Arvind Yadav Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 0ca985b418ec..1a79a4ec6f09 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3954,7 +3954,7 @@ static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr) return !dma_addr; } -struct dma_map_ops intel_dma_ops = { +const struct dma_map_ops intel_dma_ops = { .alloc = intel_alloc_coherent, .free = intel_free_coherent, .map_sg = intel_map_sg,