From ba7256030f1b04e56096e9796bdf478f12872403 Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Mon, 9 Apr 2018 17:29:36 +1000
Subject: [PATCH 1/9] powerpc/powernv/pci: Track largest available TCE order
 per PHB

Knowing the largest possible TCE size of a PHB is useful, so get it out
of the device tree. This relies on the property being added in OPAL.

Any PHB4 or later machine is assumed to be running firmware that
implements this property; anything else is assumed to be PHB3, which
has a maximum TCE order of 28 bits, i.e. 256MB TCEs.

This is used later in the series.
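As a rough sketch of what the new parse consumes (the values here are an
assumed PHB4 example, not taken from a real device tree):

	/* ibm,supported-tce-sizes lists supported TCE sizes as bit
	 * orders; e.g. 4K, 64K, 2M and 1G TCEs would appear as:
	 */
	static const u32 example_tce_sizes[] = { 12, 16, 21, 30 };

	/* the loop in this patch keeps the largest: max_tce_order = 30 */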

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++++++++++++++++
 arch/powerpc/platforms/powernv/pci.h      |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index cde710297a4e..9f40f235b39e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3751,11 +3751,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	struct resource r;
 	const __be64 *prop64;
 	const __be32 *prop32;
+	struct property *prop;
 	int len;
 	unsigned int segno;
 	u64 phb_id;
 	void *aux;
 	long rc;
+	u32 val;
 
 	if (!of_device_is_available(np))
 		return;
@@ -3894,6 +3896,20 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	}
 	phb->ioda.pe_array = aux + pemap_off;
 
+	phb->ioda.max_tce_order = 0;
+	// Get TCE order from the DT. If it's not present, assume P8
+	if (!of_get_property(np, "ibm,supported-tce-sizes", NULL)) {
+		phb->ioda.max_tce_order = 28; // assume P8 256mb TCEs
+	} else {
+		of_property_for_each_u32(np, "ibm,supported-tce-sizes", prop,
+					 prop32, val) {
+			if (val > phb->ioda.max_tce_order)
+				phb->ioda.max_tce_order = val;
+		}
+		pr_debug("PHB%llx Found max TCE order of %d bits\n",
+			 phb->opal_id, phb->ioda.max_tce_order);
+	}
+
 	/*
 	 * Choose PE number for root bus, which shouldn't have
 	 * M64 resources consumed by its child devices. To pick
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 8b37b28e3831..ca5414055972 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -172,6 +172,9 @@ struct pnv_phb {
 		struct list_head pe_list;
 		struct mutex pe_list_mutex;
 
+		/* Largest supported TCE order bits */
+		uint8_t max_tce_order;
+
 		/* Reverse map of PEs, indexed by {bus, devfn} */
 		unsigned int pe_rmap[0x10000];
 	} ioda;
--
2.17.1

From b3f3546c4b4225093f41a4caa25648718170a093 Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Mon, 9 Apr 2018 17:34:37 +1000
Subject: [PATCH 2/9] powerpc/powernv: DMA operations for discontiguous
 allocation

Cognitive DMA ("pseudo-bypass" in the code) is a new set of DMA
operations for devices that want to address more than 32 bits but can't
address the 59 bits required to enable direct DMA.

The previous implementation for POWER8/PHB3 worked around this by
configuring a bypass from the default 32-bit address space into 64-bit
address space. This approach does not work for POWER9/PHB4 because
regions of memory are discontiguous and many devices will be unable to
address memory beyond the first node.

Instead, implement a new set of DMA operations that allocate TCEs as DMA
mappings are requested, so that all memory is addressable even when a
one-to-one mapping between real addresses and DMA addresses isn't
possible. These TCEs are the maximum size available on the platform,
which is 256M on PHB3 and 1G on PHB4.

Devices can now map any region of memory up to the maximum amount they
can address according to the DMA mask set, in chunks of the largest
available TCE size.

This implementation replaces the need for the existing PHB3 solution and
should be compatible with future PHB versions.
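
The address arithmetic behind this is small enough to sketch here.
Assuming max_tce_order = 30 (1G TCEs, the PHB4 case), a DMA address is
just a TCE index glued to the untranslated low-order offset:

	u64 offset   = phys_addr & ((1ULL << max_tce_order) - 1);
	u64 dma_addr = (tce_index << max_tce_order) | offset;

	/* unmapping recovers the index: tce_index = dma_addr >> max_tce_order */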

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 arch/powerpc/include/asm/dma-mapping.h    |   1 +
 arch/powerpc/platforms/powernv/Makefile   |   2 +-
 arch/powerpc/platforms/powernv/pci-dma.c  | 319 ++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 102 +++----
 arch/powerpc/platforms/powernv/pci.h      |   7 +
 5 files changed, 381 insertions(+), 50 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/pci-dma.c

diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 8fa394520af6..354f435160f3 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -74,6 +74,7 @@ static inline unsigned long device_to_mask(struct device *dev)
 extern struct dma_map_ops dma_iommu_ops;
 #endif
 extern const struct dma_map_ops dma_nommu_ops;
+extern const struct dma_map_ops dma_pseudo_bypass_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index b540ce8eec55..7cfc821508c3 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
 
 obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
-obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
+obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o pci-dma.o
 obj-$(CONFIG_CXL_BASE) += pci-cxl.o
 obj-$(CONFIG_EEH) += eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c
new file mode 100644
index 000000000000..1d5409be343e
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-dma.c
@@ -0,0 +1,319 @@
+/*
+ * DMA operations supporting pseudo-bypass for PHB3+
+ *
+ * Author: Russell Currey <ruscur@russell.cc>
+ *
+ * Copyright 2018 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/memblock.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/hash.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/pnv-pci.h>
+#include <asm/tce.h>
+
+#include "pci.h"
+
+/* select and allocate a TCE using the bitmap */
+static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr)
+{
+	int tce;
+	__be64 old, new;
+
+	spin_lock(&pe->tce_alloc_lock);
+	tce = bitmap_find_next_zero_area(pe->tce_bitmap,
+					 pe->tce_count,
+					 0,
+					 1,
+					 0);
+	bitmap_set(pe->tce_bitmap, tce, 1);
+	old = pe->tces[tce];
+	new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
+	pe->tces[tce] = new;
+	pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n",
+		tce, new, old);
+	spin_unlock(&pe->tce_alloc_lock);
+
+	return tce;
+}
+
+/*
+ * The tracking table for assigning TCEs has two entries per TCE.
+ * - @entry1 contains the physical address and the smallest bit indicates
+ *   if it's currently valid.
+ * - @entry2 contains the DMA address returned in the upper 34 bits, and a
+ *   refcount in the lower 30 bits.
+ */
+static dma_addr_t dma_pseudo_bypass_get_address(struct device *dev,
+						phys_addr_t addr)
+{
+	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+	u64 i, entry1, entry2, dma_prefix, tce, ret;
+	u64 offset = addr & ((1 << phb->ioda.max_tce_order) - 1);
+
+	pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number];
+
+	/* look through the tracking table for a free entry */
+	for (i = 0; i < pe->tce_count; i++) {
+		entry1 = pe->tce_tracker[i * 2];
+		entry2 = pe->tce_tracker[i * 2 + 1];
+		dma_prefix = entry2 >> 34;
+
+		/* if the address is the same and the entry is valid */
+		if (entry1 == ((addr - offset) | 1)) {
+			/* all we need to do here is increment the refcount */
+			ret = cmpxchg(&pe->tce_tracker[i * 2 + 1],
+				      entry2, entry2 + 1);
+			if (ret != entry2) {
+				/* conflict, start looking again just in case */
+				i--;
+				continue;
+			}
+			return (dma_prefix << phb->ioda.max_tce_order) | offset;
+		/* if the entry is invalid then we want to replace it */
+		} else if (!(entry1 & 1)) {
+			/* set the real address, note that it isn't valid yet */
+			ret = cmpxchg(&pe->tce_tracker[i * 2],
+				      entry1, (addr - offset));
+			if (ret != entry1) {
+				/* conflict, start looking again */
+				i--;
+				continue;
+			}
+
+			/* now we can allocate a TCE */
+			tce = dma_pseudo_bypass_select_tce(pe, addr - offset);
+
+			/* set new value, including TCE index and new refcount */
+			ret = cmpxchg(&pe->tce_tracker[i * 2 + 1],
+				      entry2, tce << 34 | 1);
+			if (ret != entry2) {
+				/*
+				 * XXX In this case we need to throw out
+				 * everything, including the TCE we just
+				 * allocated. For now, just leave it.
+				 */
+				i--;
+				continue;
+			}
+
+			/* now set the valid bit */
+			ret = cmpxchg(&pe->tce_tracker[i * 2],
+				      (addr - offset), (addr - offset) | 1);
+			if (ret != (addr - offset)) {
+				/*
+				 * XXX Same situation as above. We'd probably
+				 * want to null out entry2 as well.
+				 */
+				i--;
+				continue;
+			}
+			return (tce << phb->ioda.max_tce_order) | offset;
+		/* it's a valid entry but not ours, keep looking */
+		} else {
+			continue;
+		}
+	}
+	/* If we get here, the table must be full, so error out. */
+	return -1ULL;
+}
+
+/*
+ * For the moment, unmapping just decrements the refcount and doesn't actually
+ * remove the TCE. This is because it's very likely that a previously allocated
+ * TCE will be used again, and this saves having to invalidate it.
+ *
+ * TODO implement some kind of garbage collection that clears unused TCE entries
+ * once the table reaches a certain size.
+ */
+static void dma_pseudo_bypass_unmap_address(struct device *dev, dma_addr_t dma_addr)
+{
+	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+	u64 i, entry1, entry2, dma_prefix, refcount;
+
+	pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number];
+
+	for (i = 0; i < pe->tce_count; i++) {
+		entry1 = pe->tce_tracker[i * 2];
+		entry2 = pe->tce_tracker[i * 2 + 1];
+		dma_prefix = entry2 >> 34;
+		refcount = entry2 & ((1 << 30) - 1);
+
+		/* look through entry2 until we find our address */
+		if (dma_prefix == (dma_addr >> phb->ioda.max_tce_order)) {
+			refcount--;
+			cmpxchg(&pe->tce_tracker[i * 2 + 1], entry2, (dma_prefix << 34) | refcount);
+			if (!refcount) {
+				/*
+				 * Here is where we would remove the valid bit
+				 * from entry1, clear the entry in the TCE table
+				 * and invalidate the TCE - but we want to leave
+				 * them until the table fills up (for now).
+				 */
+			}
+			break;
+		}
+	}
+}
+
+static int dma_pseudo_bypass_dma_supported(struct device *dev, u64 mask)
+{
+	/*
+	 * Normally dma_supported() checks if the mask is capable of addressing
+	 * all of memory. Since we map physical memory in chunks that the
+	 * device can address, the device will be able to address whatever it
+	 * wants - just not all at once.
+	 */
+	return 1;
+}
+
+static void *dma_pseudo_bypass_alloc_coherent(struct device *dev,
+					      size_t size,
+					      dma_addr_t *dma_handle,
+					      gfp_t flag,
+					      unsigned long attrs)
+{
+	void *ret;
+	struct page *page;
+	int node = dev_to_node(dev);
+
+	/* ignore region specifiers */
+	flag &= ~(__GFP_HIGHMEM);
+
+	page = alloc_pages_node(node, flag, get_order(size));
+	if (page == NULL)
+		return NULL;
+	ret = page_address(page);
+	memset(ret, 0, size);
+	*dma_handle = dma_pseudo_bypass_get_address(dev, __pa(ret));
+
+	return ret;
+}
+
+static void dma_pseudo_bypass_free_coherent(struct device *dev,
+					    size_t size,
+					    void *vaddr,
+					    dma_addr_t dma_handle,
+					    unsigned long attrs)
+{
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static int dma_pseudo_bypass_mmap_coherent(struct device *dev,
+					   struct vm_area_struct *vma,
+					   void *cpu_addr,
+					   dma_addr_t handle,
+					   size_t size,
+					   unsigned long attrs)
+{
+	unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr));
+
+	return remap_pfn_range(vma, vma->vm_start,
+			       pfn + vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
+static inline dma_addr_t dma_pseudo_bypass_map_page(struct device *dev,
+						    struct page *page,
+						    unsigned long offset,
+						    size_t size,
+						    enum dma_data_direction dir,
+						    unsigned long attrs)
+{
+	BUG_ON(dir == DMA_NONE);
+
+	/* XXX I don't know if this is necessary (or even desired) */
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_page(page, offset, size, dir);
+
+	return dma_pseudo_bypass_get_address(dev, page_to_phys(page) + offset);
+}
+
+static inline void dma_pseudo_bypass_unmap_page(struct device *dev,
+						dma_addr_t dma_address,
+						size_t size,
+						enum dma_data_direction direction,
+						unsigned long attrs)
+{
+	dma_pseudo_bypass_unmap_address(dev, dma_address);
+}
+
+
+static int dma_pseudo_bypass_map_sg(struct device *dev, struct scatterlist *sgl,
+				    int nents, enum dma_data_direction direction,
+				    unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+
+	for_each_sg(sgl, sg, nents, i) {
+		sg->dma_address = dma_pseudo_bypass_get_address(dev, sg_phys(sg));
+		sg->dma_length = sg->length;
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
+		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
+	}
+
+	return nents;
+}
+
+static void dma_pseudo_bypass_unmap_sg(struct device *dev, struct scatterlist *sgl,
+				       int nents, enum dma_data_direction direction,
+				       unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i) {
+		dma_pseudo_bypass_unmap_address(dev, sg->dma_address);
+	}
+}
+
+static u64 dma_pseudo_bypass_get_required_mask(struct device *dev)
+{
+	/*
+	 * there's no limitation on our end, the driver should just call
+	 * set_mask() with as many bits as the device can address.
+	 */
+	return -1ULL;
+}
+
+static int dma_pseudo_bypass_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_addr == -1ULL;
+}
+
+
+const struct dma_map_ops dma_pseudo_bypass_ops = {
+	.alloc = dma_pseudo_bypass_alloc_coherent,
+	.free = dma_pseudo_bypass_free_coherent,
+	.mmap = dma_pseudo_bypass_mmap_coherent,
+	.map_sg = dma_pseudo_bypass_map_sg,
+	.unmap_sg = dma_pseudo_bypass_unmap_sg,
+	.dma_supported = dma_pseudo_bypass_dma_supported,
+	.map_page = dma_pseudo_bypass_map_page,
+	.unmap_page = dma_pseudo_bypass_unmap_page,
+	.get_required_mask = dma_pseudo_bypass_get_required_mask,
+	.mapping_error = dma_pseudo_bypass_mapping_error,
+};
+EXPORT_SYMBOL(dma_pseudo_bypass_ops);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9f40f235b39e..b982558a92ac 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -25,6 +25,7 @@
 #include <linux/iommu.h>
 #include <linux/rculist.h>
 #include <linux/sizes.h>
+#include <linux/vmalloc.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -1085,6 +1086,9 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 	pe->pbus = NULL;
 	pe->mve_number = -1;
 	pe->rid = dev->bus->number << 8 | pdn->devfn;
+	pe->tces = NULL;
+	pe->tce_tracker = NULL;
+	pe->tce_bitmap = NULL;
 
 	pe_info(pe, "Associated device to PE\n");
 
@@ -1566,6 +1570,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		pe->mve_number = -1;
 		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
 			   pci_iov_virtfn_devfn(pdev, vf_index);
+		pe->tces = NULL;
+		pe->tce_tracker = NULL;
+		pe->tce_bitmap = NULL;
 
 		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
 			hose->global_number, pdev->bus->number,
@@ -1771,43 +1778,40 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
 	return true;
 }
 
-/*
- * Reconfigure TVE#0 to be usable as 64-bit DMA space.
- *
- * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
- * Devices can only access more than that if bit 59 of the PCI address is set
- * by hardware, which indicates TVE#1 should be used instead of TVE#0.
- * Many PCI devices are not capable of addressing that many bits, and as a
- * result are limited to the 4GB of virtual memory made available to 32-bit
- * devices in TVE#0.
- *
- * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
- * devices by configuring the virtual memory past the first 4GB inaccessible
- * by 64-bit DMAs. This should only be used by devices that want more than
- * 4GB, and only on PEs that have no 32-bit devices.
- *
- * Currently this will only work on PHB3 (POWER8).
- */
-static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
+static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe)
 {
-	u64 window_size, table_size, tce_count, addr;
+	u64 tce_count, table_size, window_size;
+	struct pnv_phb *p = pe->phb;
 	struct page *table_pages;
-	u64 tce_order = 28; /* 256MB TCEs */
 	__be64 *tces;
-	s64 rc;
+	int rc = -ENOMEM;
+	int bitmap_size, tracker_entries;
+
+	/*
+	 * XXX These are factors for scaling the size of the TCE table, and
+	 * the table that tracks these allocations. These should eventually
+	 * be kernel command line options with defaults above 1, for situations
+	 * where your memory expands after the machine has booted.
+	 */
+	int tce_size_factor = 1;
+	int tracking_table_factor = 1;
 
 	/*
-	 * Window size needs to be a power of two, but needs to account for
-	 * shifting memory by the 4GB offset required to skip 32bit space.
+	 * The window size covers all of memory (and optionally more), with
+	 * enough tracker entries to cover them all being allocated. So we
+	 * create enough TCEs to cover all of memory at once.
 	 */
-	window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
-	tce_count = window_size >> tce_order;
+	window_size = roundup_pow_of_two(tce_size_factor * memory_hotplug_max());
+	tracker_entries = (tracking_table_factor * memory_hotplug_max()) >>
+			   p->ioda.max_tce_order;
+	tce_count = window_size >> p->ioda.max_tce_order;
+	bitmap_size = BITS_TO_LONGS(tce_count) * sizeof(unsigned long);
 	table_size = tce_count << 3;
 
 	if (table_size < PAGE_SIZE)
 		table_size = PAGE_SIZE;
 
-	table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
+	table_pages = alloc_pages_node(p->hose->node, GFP_KERNEL,
 				       get_order(table_size));
 	if (!table_pages)
 		goto err;
@@ -1818,26 +1822,33 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
 
 	memset(tces, 0, table_size);
 
-	for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
-		tces[(addr + (1ULL << 32)) >> tce_order] =
-			cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
-	}
+	pe->tces = tces;
+	pe->tce_count = tce_count;
+	pe->tce_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+	/* The tracking table has two u64s per TCE */
+	pe->tce_tracker = vzalloc(sizeof(u64) * 2 * tracker_entries);
+	spin_lock_init(&pe->tce_alloc_lock);
+
+	/* mark the first 4GB as reserved so this can still be used for 32bit */
+	bitmap_set(pe->tce_bitmap, 0, 1ULL << (32 - p->ioda.max_tce_order));
+
+	pe_info(pe, "pseudo-bypass sizes: tracker %d bitmap %d TCEs %lld\n",
+		tracker_entries, bitmap_size, tce_count);
 
 	rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
 					pe->pe_number,
-					/* reconfigure window 0 */
 					(pe->pe_number << 1) + 0,
 					1,
 					__pa(tces),
 					table_size,
-					1 << tce_order);
+					1 << p->ioda.max_tce_order);
 	if (rc == OPAL_SUCCESS) {
-		pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
+		pe_info(pe, "TCE tables configured for pseudo-bypass\n");
 		return 0;
 	}
 err:
-	pe_err(pe, "Error configuring 64-bit DMA bypass\n");
-	return -EIO;
+	pe_err(pe, "error configuring pseudo-bypass\n");
+	return rc;
 }
 
 static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
@@ -1848,7 +1859,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 	struct pnv_ioda_pe *pe;
 	uint64_t top;
 	bool bypass = false;
-	s64 rc;
 
 	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
 		return -ENODEV;
@@ -1865,21 +1875,15 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 	} else {
 		/*
 		 * If the device can't set the TCE bypass bit but still wants
-		 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
-		 * bypass the 32-bit region and be usable for 64-bit DMAs.
-		 * The device needs to be able to address all of this space.
+		 * to access 4GB or more, we need to use a different set of DMA
+		 * operations with an indirect mapping.
		 */
 		if (dma_mask >> 32 &&
-		    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
-		    pnv_pci_ioda_pe_single_vendor(pe) &&
-		    phb->model == PNV_PHB_MODEL_PHB3) {
-			/* Configure the bypass mode */
-			rc = pnv_pci_ioda_dma_64bit_bypass(pe);
-			if (rc)
-				return rc;
-			/* 4GB offset bypasses 32-bit space */
-			set_dma_offset(&pdev->dev, (1ULL << 32));
-			set_dma_ops(&pdev->dev, &dma_nommu_ops);
+		    phb->model != PNV_PHB_MODEL_P7IOC &&
+		    pnv_pci_ioda_pe_single_vendor(pe)) {
+			if (!pe->tces)
+				pnv_pci_pseudo_bypass_setup(pe);
+			set_dma_ops(&pdev->dev, &dma_pseudo_bypass_ops);
 		} else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
 			/*
			 * Fail the request if a DMA mask between 32 and 64 bits
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index ca5414055972..9418c6ea189b 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -70,6 +70,13 @@ struct pnv_ioda_pe {
 	bool tce_bypass_enabled;
 	uint64_t tce_bypass_base;
 
+	/* TCE tables for DMA pseudo-bypass */
+	__be64 *tces;
+	u64 tce_count;
+	unsigned long *tce_bitmap;
+	u64 *tce_tracker; // 2 u64s per TCE
+	spinlock_t tce_alloc_lock;
+
 	/* MSIs. MVE index is identical for for 32 and 64 bit MSI
 	 * and -1 if not supported. (It's actually identical to the
 	 * PE number)
--
2.17.1

From 2e6abf2b56d40a953eaa39e2bee064bfcc1da6d1 Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Wed, 6 Jun 2018 13:36:06 +1000
Subject: [PATCH 3/9] powerpc/powernv/pci: Track DMA and TCE tables in debugfs

Add a new debugfs entry to trigger dumping out the tracking table and
TCEs for a given PE, for example PE 0x4 of PHB 2:

echo 0x4 > /sys/kernel/debug/powerpc/PCI0002/sketchy

This will result in the table being dumped out in dmesg.

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 43 +++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b982558a92ac..3598ca8daa7c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3203,6 +3203,47 @@ static int pnv_pci_diag_data_set(void *data, u64 val)
 DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL,
			pnv_pci_diag_data_set, "%llu\n");
 
+static int pnv_pci_sketchy_set(void *data, u64 val)
+{
+	struct pci_controller *hose;
+	struct pnv_ioda_pe *pe;
+	struct pnv_phb *phb;
+	u64 entry1, entry2;
+	int i;
+
+	hose = (struct pci_controller *)data;
+	if (!hose || !hose->private_data)
+		return -ENODEV;
+
+	phb = hose->private_data;
+	pe = &phb->ioda.pe_array[val];
+
+	if (!pe)
+		return -EINVAL;
+
+	if (!pe->tces || !pe->tce_tracker)
+		return -EIO;
+
+	for (i = 0; i < pe->tce_count; i++) {
+		if (i > 16 && pe->tces[i] == 0)
+			break;
+		pr_info("%3d: %016llx\n", i, be64_to_cpu(pe->tces[i]));
+	}
+
+	for (i = 0; i < pe->tce_count; i++) {
+		entry1 = pe->tce_tracker[i * 2];
+		entry2 = pe->tce_tracker[i * 2 + 1];
+		if (!entry1)
+			break;
+		pr_info("%3d: %016llx %016llx\n", i, entry1, entry2);
+	}
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_sketchy_fops, NULL,
+			pnv_pci_sketchy_set, "%llu\n");
+
+
 #endif /* CONFIG_DEBUG_FS */
 
 static void pnv_pci_ioda_create_dbgfs(void)
@@ -3228,6 +3269,8 @@ static void pnv_pci_ioda_create_dbgfs(void)
 
 		debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose,
				    &pnv_pci_diag_data_fops);
+		debugfs_create_file("sketchy", 0200, phb->dbgfs, hose,
+				    &pnv_pci_sketchy_fops);
 	}
 #endif /* CONFIG_DEBUG_FS */
 }
--
2.17.1

From 8456e9247c21d9fc7838dd5a71435342f7b79f88 Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Tue, 19 Jun 2018 16:21:13 +1000
Subject: [PATCH 4/9] powerpc/powernv/pci: Safety fixes for pseudobypass TCE
 allocation

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 arch/powerpc/platforms/powernv/pci-dma.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c
index 1d5409be343e..237940a2a052 100644
--- a/arch/powerpc/platforms/powernv/pci-dma.c
+++ b/arch/powerpc/platforms/powernv/pci-dma.c
@@ -29,8 +29,9 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr
 {
 	int tce;
 	__be64 old, new;
+	unsigned long flags;
 
-	spin_lock(&pe->tce_alloc_lock);
+	spin_lock_irqsave(&pe->tce_alloc_lock, flags);
 	tce = bitmap_find_next_zero_area(pe->tce_bitmap,
					 pe->tce_count,
					 0,
@@ -40,9 +41,10 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr
 	old = pe->tces[tce];
 	new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
 	pe->tces[tce] = new;
+	mb();
 	pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n",
		tce, new, old);
-	spin_unlock(&pe->tce_alloc_lock);
+	spin_unlock_irqrestore(&pe->tce_alloc_lock, flags);
 
 	return tce;
 }
--
2.17.1

From b8949ef1b1bb5977ba9fb35f06d1466b6be475a5 Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineering.com>
Date: Sat, 23 Jun 2018 16:20:48 -0500
Subject: [PATCH 5/9] powerpc/powernv/pci: Export
 pnv_pci_ioda2_tce_invalidate_pe

Pseudo DMA support requires a method to invalidate the TCE cache.
Export pnv_pci_ioda2_tce_invalidate_pe for use by the pseudo DMA
mapper.

Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
 arch/powerpc/platforms/powernv/pci.h      | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3598ca8daa7c..83f9db17e711 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2100,7 +2100,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
 	}
 }
 
-static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
+void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
 {
 	struct pnv_phb *phb = pe->phb;
 
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 9418c6ea189b..bea565c3f302 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -244,6 +244,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 /* Nvlink functions */
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
 extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
 extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
			       struct iommu_table *tbl);
--
2.17.1

From 4d8211098c35d5b7556f756db18dba89c015600d Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineering.com>
Date: Sat, 23 Jun 2018 16:22:59 -0500
Subject: [PATCH 6/9] powerpc/powernv/pci: Invalidate TCE cache after DMA map
 setup

Per the IODA2 specification, TCEs must be invalidated after their
settings have been changed. Invalidate the cache after the address is
changed during TCE allocation when using pseudo DMA.
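
In sketch form, the resulting update sequence in
dma_pseudo_bypass_select_tce() becomes:

	pe->tces[tce] = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
	mb();					/* publish the new TCE first */
	pnv_pci_ioda2_tce_invalidate_pe(pe);	/* then flush the stale cached copy */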

Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
---
 arch/powerpc/platforms/powernv/pci-dma.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c
index 237940a2a052..060dbc168401 100644
--- a/arch/powerpc/platforms/powernv/pci-dma.c
+++ b/arch/powerpc/platforms/powernv/pci-dma.c
@@ -42,8 +42,7 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr
 	new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
 	pe->tces[tce] = new;
 	mb();
-	pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n",
-		tce, new, old);
+	pnv_pci_ioda2_tce_invalidate_pe(pe);
 	spin_unlock_irqrestore(&pe->tce_alloc_lock, flags);
 
 	return tce;
--
2.17.1

From a963913380c91a465509bae341da1e8aac40cdee Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineering.com>
Date: Sat, 23 Jun 2018 16:25:16 -0500
Subject: [PATCH 7/9] powerpc/powernv/pci: Don't use the lower 4G TCEs in
 pseudo-DMA mode

Four TCEs are reserved for legacy 32-bit DMA mappings in pseudo DMA
mode. Mark these with an invalid address to avoid their use by
the TCE cache mapper.
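
A sketch of why -1 works as the marker (assuming 1G TCEs, i.e.
max_tce_order = 30, so four TCEs span the reserved low 4GB):

	/* entry1 holds (phys | valid) and lookups match on
	 * entry1 == ((addr - offset) | 1); -1 can never equal a real
	 * (addr - offset), and its low bit is set, so the mapper never
	 * matches or reallocates these slots.
	 */
	for (i = 0; i < 4; i++)
		pe->tce_tracker[i * 2] = -1;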

Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 83f9db17e711..f4cd6a5c2bc7 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1780,7 +1780,7 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
 
 static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe)
 {
-	u64 tce_count, table_size, window_size;
+	u64 i, tce_count, table_size, window_size;
 	struct pnv_phb *p = pe->phb;
 	struct page *table_pages;
 	__be64 *tces;
@@ -1832,6 +1832,12 @@ static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe)
 	/* mark the first 4GB as reserved so this can still be used for 32bit */
 	bitmap_set(pe->tce_bitmap, 0, 1ULL << (32 - p->ioda.max_tce_order));
 
+	/* make sure reserved first 4GB TCEs are not used by the mapper
+	 * set each address to -1, which will never match an incoming request
+	 */
+	for (i = 0; i < 4; i++)
+		pe->tce_tracker[i * 2] = -1;
+
 	pe_info(pe, "pseudo-bypass sizes: tracker %d bitmap %d TCEs %lld\n",
 		tracker_entries, bitmap_size, tce_count);
 
--
2.17.1

From d397370596955d166c76a1f487ef53d8dbf52d9c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 11 Sep 2018 17:00:30 +1000
Subject: [PATCH 8/9] KVM: PPC: Book3S HV: Allocate a memory area exclusively
 for HPTs

Currently we allocate HPTs (hashed page tables) for guests using the
CMA (contiguous memory allocator) facility. However, there are
situations where the CMA region can get fragmented, notably when
lots of guest pages get pinned for PCI pass-through, which then causes
HPT allocations to fail even if there is sufficient CMA memory
available overall.

This commit adds the capability to reserve some memory at boot time
exclusively for HPTs for KVM guests. The amount is controlled with
the kvm_hpt_resv_ratio=N kernel command-line option, where N is the
percentage of system memory to reserve. This reserved memory will
be used first, and only when a guest HPT can't be allocated from this
reserved memory will the CMA region be used.
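
A worked sizing example (the host memory size is assumed here, not
taken from the patch): with the default kvm_hpt_resv_ratio=2 on a
512GiB host,

	size = memblock_phys_mem_size() * kvm_hpt_resv_ratio / 100;
	/* 512GiB * 2 / 100 ~= 10.2GiB, carved into 256KiB chunks
	 * (KVM_RESV_CHUNK_ORDER = 18), so the bitmap tracks ~42000 chunks
	 */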

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_host.h  |   2 +
 arch/powerpc/include/asm/kvm_ppc.h   |   7 ++
 arch/powerpc/kernel/setup-common.c   |   3 +
 arch/powerpc/kernel/setup.h          |   6 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |  25 +++++--
 arch/powerpc/kvm/book3s_hv_builtin.c | 105 ++++++++++++++++++++++++++-
 6 files changed, 136 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 906bcbdfd2a1..053ba320db49 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -258,6 +258,8 @@ struct kvm_hpt_info {
 	struct revmap_entry *rev;
 	/* Guest HPT size is 2**(order) bytes */
 	u32 order;
+	/* 1 if HPT allocated from reserved region, 0 otherwise */
+	int resv;
 	/* 1 if HPT allocated with CMA, 0 otherwise */
 	int cma;
 };
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e991821dd7fa..9625b0dd28cc 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -210,6 +210,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
			unsigned long tce_value, unsigned long npages);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
			     unsigned long ioba);
+extern unsigned long kvmhv_alloc_resv_hpt(u32 order);
+extern void kvmhv_release_resv_hpt(unsigned long hpt, u32 order);
 extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages);
 extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
@@ -436,6 +438,8 @@ struct openpic;
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 extern void kvm_cma_reserve(void) __init;
+extern void kvm_resv_hpt_init(void);
+
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
	paca_ptrs[cpu]->kvm_hstate.xics_phys = (void __iomem *)addr;
@@ -476,6 +480,9 @@ extern bool kvm_hv_mode_active(void);
 static inline void __init kvm_cma_reserve(void)
 {}
 
+static inline void kvm_resv_hpt_init(void)
+{}
+
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {}
 
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 93fa0c99681e..38e36c67ab2f 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -979,6 +979,9 @@ void __init setup_arch(char **cmdline_p)
 	/* Initialize the MMU context management stuff. */
 	mmu_context_init();
 
+	/* Reserve memory for KVM HPTs */
+	kvm_resv_hpt_init();
+
 #ifdef CONFIG_PPC64
 	/* Interrupt code needs to be 64K-aligned. */
 	if ((unsigned long)_stext & 0xffff)
diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
index c6a592b67386..6de1fac35774 100644
--- a/arch/powerpc/kernel/setup.h
+++ b/arch/powerpc/kernel/setup.h
@@ -53,13 +53,15 @@ extern unsigned long spr_default_dscr;
 #endif
 
 /*
- * Having this in kvm_ppc.h makes include dependencies too
- * tricky to solve for setup-common.c so have it here.
+ * Having these in kvm_ppc.h makes include dependencies too
+ * tricky to solve for setup-common.c so have them here.
 */
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 void kvm_cma_reserve(void);
+void kvm_resv_hpt_init(void);
 #else
 static inline void kvm_cma_reserve(void) { };
+static inline void kvm_resv_hpt_init(void) { }
 #endif
 
 #ifdef CONFIG_TAU
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 68e14afecac8..9e607014f4c7 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -81,7 +81,7 @@ struct kvm_resize_hpt {
 int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
 {
 	unsigned long hpt = 0;
-	int cma = 0;
+	int resv = 0, cma = 0;
 	struct page *page = NULL;
 	struct revmap_entry *rev;
 	unsigned long npte;
@@ -89,11 +89,17 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
 	if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
 		return -EINVAL;
 
-	page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
-	if (page) {
-		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+	hpt = kvmhv_alloc_resv_hpt(order);
+	if (hpt) {
 		memset((void *)hpt, 0, (1ul << order));
-		cma = 1;
+		resv = 1;
+	} else {
+		page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
+		if (page) {
+			hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+			memset((void *)hpt, 0, (1ul << order));
+			cma = 1;
+		}
 	}
 
 	if (!hpt)
@@ -109,7 +115,9 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
 	/* Allocate reverse map array */
 	rev = vmalloc(array_size(npte, sizeof(struct revmap_entry)));
 	if (!rev) {
-		if (cma)
+		if (resv)
+			kvmhv_release_resv_hpt(hpt, order);
+		else if (cma)
 			kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
 		else
 			free_pages(hpt, order - PAGE_SHIFT);
@@ -118,6 +126,7 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
 
 	info->order = order;
 	info->virt = hpt;
+	info->resv = resv;
 	info->cma = cma;
 	info->rev = rev;
 
@@ -191,7 +200,9 @@ void kvmppc_free_hpt(struct kvm_hpt_info *info)
 {
 	vfree(info->rev);
 	info->rev = NULL;
-	if (info->cma)
+	if (info->resv)
+		kvmhv_release_resv_hpt(info->virt, info->order);
+	else if (info->cma)
 		kvm_free_hpt_cma(virt_to_page(info->virt),
				 1 << (info->order - PAGE_SHIFT));
 	else if (info->virt)
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fc6bb9630a9c..3f36b99fb46b 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -53,11 +53,109 @@ EXPORT_SYMBOL_GPL(__xive_vm_h_eoi);
 
 /*
  * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
- * should be power of 2.
+ * only needs to be 256kB.
 */
-#define HPT_ALIGN_PAGES		((1 << 18) >> PAGE_SHIFT) /* 256k */
+#define HPT_ALIGN_ORDER		18 /* 256k */
+#define HPT_ALIGN_PAGES		((1 << HPT_ALIGN_ORDER) >> PAGE_SHIFT)
+
+#define KVM_RESV_CHUNK_ORDER	HPT_ALIGN_ORDER
+
 /*
- * By default we reserve 5% of memory for hash pagetable allocation.
+ * By default we reserve 2% of memory exclusively for guest HPT
+ * allocations, plus another 3% in the CMA zone which can be used
+ * either for HPTs or for movable page allocations.
+ * Each guest's HPT will be sized at between 1/128 and 1/64 of its
+ * memory, i.e. up to 1.56%, and allowing for about a 3x memory
+ * overcommit factor gets us to about 5%.
+ */
+static unsigned long kvm_hpt_resv_ratio = 2;
+
+static int __init early_parse_kvm_hpt_resv(char *p)
+{
+	pr_debug("%s(%s)\n", __func__, p);
+	if (!p)
+		return -EINVAL;
+	return kstrtoul(p, 0, &kvm_hpt_resv_ratio);
+}
+early_param("kvm_hpt_resv_ratio", early_parse_kvm_hpt_resv);
+
+static unsigned long kvm_resv_addr;
+static unsigned long *kvm_resv_bitmap;
+static unsigned long kvm_resv_chunks;
+static DEFINE_MUTEX(kvm_resv_lock);
+
+void kvm_resv_hpt_init(void)
+{
+	unsigned long align = 1ul << KVM_RESV_CHUNK_ORDER;
+	unsigned long size, bm_size;
+	unsigned long addr, bm;
+	unsigned long *bmp;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return;
+
+	size = memblock_phys_mem_size() * kvm_hpt_resv_ratio / 100;
+	size = ALIGN(size, align);
+	if (!size)
+		return;
+
+	pr_info("KVM: Allocating %lu MiB for hashed page tables\n",
+		size >> 20);
+
+	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+	if (!addr) {
+		pr_err("KVM: Allocation of reserved memory for HPTs failed\n");
+		return;
+	}
+	pr_info("KVM: %lu MiB reserved for HPTs at %lx\n", size >> 20, addr);
+
+	bm_size = BITS_TO_LONGS(size >> KVM_RESV_CHUNK_ORDER) * sizeof(long);
+	bm = __memblock_alloc_base(bm_size, sizeof(long),
+				   MEMBLOCK_ALLOC_ACCESSIBLE);
+	if (!bm) {
+		pr_err("KVM: Allocation of reserved memory bitmap failed\n");
+		return;
+	}
+	bmp = __va(bm);
+	memset(bmp, 0, bm_size);
+
+	kvm_resv_addr = (unsigned long) __va(addr);
+	kvm_resv_chunks = size >> KVM_RESV_CHUNK_ORDER;
+	kvm_resv_bitmap = bmp;
+}
+
+unsigned long kvmhv_alloc_resv_hpt(u32 order)
+{
+	unsigned long nr_chunks = 1ul << (order - KVM_RESV_CHUNK_ORDER);
+	unsigned long chunk;
+
+	mutex_lock(&kvm_resv_lock);
+	chunk = bitmap_find_next_zero_area(kvm_resv_bitmap, kvm_resv_chunks,
+					   0, nr_chunks, 0);
+	if (chunk < kvm_resv_chunks)
+		bitmap_set(kvm_resv_bitmap, chunk, nr_chunks);
+	mutex_unlock(&kvm_resv_lock);
+
+	if (chunk < kvm_resv_chunks)
+		return kvm_resv_addr + (chunk << KVM_RESV_CHUNK_ORDER);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmhv_alloc_resv_hpt);
+
+void kvmhv_release_resv_hpt(unsigned long addr, u32 order)
+{
+	unsigned long nr_chunks = 1ul << (order - KVM_RESV_CHUNK_ORDER);
+	unsigned long chunk = (addr - kvm_resv_addr) >> KVM_RESV_CHUNK_ORDER;
+
+	mutex_lock(&kvm_resv_lock);
+	if (chunk + nr_chunks <= kvm_resv_chunks)
+		bitmap_clear(kvm_resv_bitmap, chunk, nr_chunks);
+	mutex_unlock(&kvm_resv_lock);
+}
+EXPORT_SYMBOL_GPL(kvmhv_release_resv_hpt);
+
+/*
+ * By default we reserve 3% of memory for the CMA zone.
 */
 static unsigned long kvm_cma_resv_ratio = 5;
 
@@ -106,6 +204,7 @@ void __init kvm_cma_reserve(void)
 	*/
 	if (!cpu_has_feature(CPU_FTR_HVMODE))
 		return;
+
 	/*
	 * We cannot use memblock_phys_mem_size() here, because
	 * memblock_analyze() has not been called yet.
--
2.17.1

From 57a21f640a2b6d2e225cac7df35ed0dde7c6293f Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineering.com>
Date: Sun, 17 Jun 2018 23:59:51 -0500
Subject: [PATCH 9/9] Fix undefined behaviour from signed integer overflow

Caught by UBSAN.
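
A minimal sketch of the bug (types as in the file, value assumed):

	uint8_t b = 0x80;
	int bad = b * 0x1000000;	/* b promotes to int; the product
					 * exceeds INT_MAX -> signed
					 * overflow, undefined behaviour */
	uint32_t ok = b * 0x1000000U;	/* unsigned arithmetic wraps and
					 * is well-defined */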

Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
---
 drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c b/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c
index a029e47c2319..f7e56bec2dd7 100644
--- a/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c
+++ b/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c
@@ -98,7 +98,7 @@ int smu7_copy_bytes_to_smc(struct pp_hwmgr *hwmgr, uint32_t smc_start_address,
 
 	while (byte_count >= 4) {
		/* Bytes are written into the SMC addres space with the MSB first. */
-		data = src[0] * 0x1000000 + src[1] * 0x10000 + src[2] * 0x100 + src[3];
+		data = src[0] * 0x1000000U + src[1] * 0x10000U + src[2] * 0x100U + src[3];
 
		result = smu7_set_smc_sram_address(hwmgr, addr, limit);
 
--
2.17.1