kernel/ppc64-talos.patch

From ba7256030f1b04e56096e9796bdf478f12872403 Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Mon, 9 Apr 2018 17:29:36 +1000
Subject: [PATCH 1/9] powerpc/powernv/pci: Track largest available TCE order
per PHB
Knowing the largest possible TCE size of a PHB is useful, so get it out
of the device tree. This relies on the property being added in OPAL.
It is assumed that any PHB4 or later machine is running firmware that
implements this property; anything without it is assumed to be PHB3,
which has a maximum TCE order of 28 bits, i.e. 256MB TCEs.
This is used later in the series.
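
As a rough standalone illustration (not part of the patch), the selection
logic amounts to taking the maximum of the orders listed in
"ibm,supported-tce-sizes", with 28 (256MB TCEs) as the PHB3 fallback.
The property values below are assumed, not taken from real firmware:

#include <stdio.h>

int main(void)
{
	/* Orders as they might appear in "ibm,supported-tce-sizes" (assumed). */
	unsigned int sizes[] = { 12, 16, 21, 30 };
	unsigned int max_tce_order = 0;
	int have_property = 1;		/* pretend the DT property exists */

	if (!have_property) {
		max_tce_order = 28;	/* PHB3 fallback: 256MB TCEs */
	} else {
		for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
			if (sizes[i] > max_tce_order)
				max_tce_order = sizes[i];
	}

	printf("max TCE order %u -> %llu-byte TCEs\n",
	       max_tce_order, 1ULL << max_tce_order);
	return 0;
}
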
Signed-off-by: Russell Currey <ruscur@russell.cc>
---
arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++++++++++++++++
arch/powerpc/platforms/powernv/pci.h | 3 +++
2 files changed, 19 insertions(+)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index cde710297a4e..9f40f235b39e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3751,11 +3751,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
struct resource r;
const __be64 *prop64;
const __be32 *prop32;
+ struct property *prop;
int len;
unsigned int segno;
u64 phb_id;
void *aux;
long rc;
+ u32 val;
if (!of_device_is_available(np))
return;
@@ -3894,6 +3896,20 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
}
phb->ioda.pe_array = aux + pemap_off;
+ phb->ioda.max_tce_order = 0;
+ // Get TCE order from the DT. If it's not present, assume P8
+ if (!of_get_property(np, "ibm,supported-tce-sizes", NULL)) {
+ phb->ioda.max_tce_order = 28; // assume P8 256mb TCEs
+ } else {
+ of_property_for_each_u32(np, "ibm,supported-tce-sizes", prop,
+ prop32, val) {
+ if (val > phb->ioda.max_tce_order)
+ phb->ioda.max_tce_order = val;
+ }
+ pr_debug("PHB%llx Found max TCE order of %d bits\n",
+ phb->opal_id, phb->ioda.max_tce_order);
+ }
+
/*
* Choose PE number for root bus, which shouldn't have
* M64 resources consumed by its child devices. To pick
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 8b37b28e3831..ca5414055972 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -172,6 +172,9 @@ struct pnv_phb {
struct list_head pe_list;
struct mutex pe_list_mutex;
+ /* Largest supported TCE order bits */
+ uint8_t max_tce_order;
+
/* Reverse map of PEs, indexed by {bus, devfn} */
unsigned int pe_rmap[0x10000];
} ioda;
--
2.17.1
From b3f3546c4b4225093f41a4caa25648718170a093 Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Mon, 9 Apr 2018 17:34:37 +1000
Subject: [PATCH 2/9] powerpc/powernv: DMA operations for discontiguous
allocation
Cognitive DMA is a new set of DMA operations that solve some issues for
devices that want to address more than 32 bits but can't address the 59
bits required to enable direct DMA.
The previous implementation for POWER8/PHB3 worked around this by
configuring a bypass from the default 32-bit address space into 64-bit
address space. This approach does not work for POWER9/PHB4 because
regions of memory are discontiguous and many devices will be unable to
address memory beyond the first node.
Instead, implement a new set of DMA operations that allocate TCEs as DMA
mappings are requested so that all memory is addressable even when a
one-to-one mapping between real addresses and DMA addresses isn't
possible. These TCEs are the maximum size available on the platform,
which is 256M on PHB3 and 1G on PHB4.
Devices can now map any region of memory up to the maximum amount they can
address according to the DMA mask set, in chunks of the largest available
TCE size.
This implementation replaces the need for the existing PHB3 solution and
should be compatible with future PHB versions.
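
A minimal sketch of the address math the new operations rely on (illustrative
only; tce_order stands in for phb->ioda.max_tce_order, and the physical
address below is a made-up example):

#include <stdint.h>
#include <stdio.h>

/* Compose a pseudo-bypass DMA address from the index of the TCE that maps
 * the chunk and the offset within that chunk; tce_order is 28 (256M) on
 * PHB3 and 30 (1G) on PHB4. */
static uint64_t pseudo_bypass_dma_addr(uint64_t tce_index, uint64_t phys,
				       unsigned int tce_order)
{
	uint64_t offset = phys & ((1ULL << tce_order) - 1);

	return (tce_index << tce_order) | offset;
}

int main(void)
{
	/* hypothetical: TCE 7 was allocated for the 1G chunk holding this page */
	printf("dma addr 0x%llx\n", (unsigned long long)
	       pseudo_bypass_dma_addr(7, 0x142345678ULL, 30));
	return 0;
}
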
Signed-off-by: Russell Currey <ruscur@russell.cc>
---
arch/powerpc/include/asm/dma-mapping.h | 1 +
arch/powerpc/platforms/powernv/Makefile | 2 +-
arch/powerpc/platforms/powernv/pci-dma.c | 319 ++++++++++++++++++++++
arch/powerpc/platforms/powernv/pci-ioda.c | 102 +++----
arch/powerpc/platforms/powernv/pci.h | 7 +
5 files changed, 381 insertions(+), 50 deletions(-)
create mode 100644 arch/powerpc/platforms/powernv/pci-dma.c
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 8fa394520af6..354f435160f3 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -74,6 +74,7 @@ static inline unsigned long device_to_mask(struct device *dev)
extern struct dma_map_ops dma_iommu_ops;
#endif
extern const struct dma_map_ops dma_nommu_ops;
+extern const struct dma_map_ops dma_pseudo_bypass_ops;
static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
{
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index b540ce8eec55..7cfc821508c3 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
-obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
+obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o pci-dma.o
obj-$(CONFIG_CXL_BASE) += pci-cxl.o
obj-$(CONFIG_EEH) += eeh-powernv.o
obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c
new file mode 100644
index 000000000000..1d5409be343e
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-dma.c
@@ -0,0 +1,319 @@
+/*
+ * DMA operations supporting pseudo-bypass for PHB3+
+ *
+ * Author: Russell Currey <ruscur@russell.cc>
+ *
+ * Copyright 2018 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/memblock.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/hash.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/pnv-pci.h>
+#include <asm/tce.h>
+
+#include "pci.h"
+
+/* select and allocate a TCE using the bitmap */
+static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr)
+{
+ int tce;
+ __be64 old, new;
+
+ spin_lock(&pe->tce_alloc_lock);
+ tce = bitmap_find_next_zero_area(pe->tce_bitmap,
+ pe->tce_count,
+ 0,
+ 1,
+ 0);
+ bitmap_set(pe->tce_bitmap, tce, 1);
+ old = pe->tces[tce];
+ new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
+ pe->tces[tce] = new;
+ pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n",
+ tce, new, old);
+ spin_unlock(&pe->tce_alloc_lock);
+
+ return tce;
+}
+
+/*
+ * The tracking table for assigning TCEs has two entries per TCE.
+ * - @entry1 contains the physical address and the smallest bit indicates
+ * if it's currently valid.
+ * - @entry2 contains the DMA address returned in the upper 34 bits, and a
+ * refcount in the lower 30 bits.
+ */
+static dma_addr_t dma_pseudo_bypass_get_address(struct device *dev,
+ phys_addr_t addr)
+{
+ struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pnv_ioda_pe *pe;
+ u64 i, entry1, entry2, dma_prefix, tce, ret;
+ u64 offset = addr & ((1 << phb->ioda.max_tce_order) - 1);
+
+ pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number];
+
+ /* look through the tracking table for a free entry */
+ for (i = 0; i < pe->tce_count; i++) {
+ entry1 = pe->tce_tracker[i * 2];
+ entry2 = pe->tce_tracker[i * 2 + 1];
+ dma_prefix = entry2 >> 34;
+
+ /* if the address is the same and the entry is valid */
+ if (entry1 == ((addr - offset) | 1)) {
+ /* all we need to do here is increment the refcount */
+ ret = cmpxchg(&pe->tce_tracker[i * 2 + 1],
+ entry2, entry2 + 1);
+ if (ret != entry2) {
+ /* conflict, start looking again just in case */
+ i--;
+ continue;
+ }
+ return (dma_prefix << phb->ioda.max_tce_order) | offset;
+ /* if the entry is invalid then we want to replace it */
+ } else if (!(entry1 & 1)) {
+ /* set the real address, note that it isn't valid yet */
+ ret = cmpxchg(&pe->tce_tracker[i * 2],
+ entry1, (addr - offset));
+ if (ret != entry1) {
+ /* conflict, start looking again */
+ i--;
+ continue;
+ }
+
+ /* now we can allocate a TCE */
+ tce = dma_pseudo_bypass_select_tce(pe, addr - offset);
+
+ /* set new value, including TCE index and new refcount */
+ ret = cmpxchg(&pe->tce_tracker[i * 2 + 1],
+ entry2, tce << 34 | 1);
+ if (ret != entry2) {
+ /*
+ * XXX In this case we need to throw out
+ * everything, including the TCE we just
+ * allocated. For now, just leave it.
+ */
+ i--;
+ continue;
+ }
+
+ /* now set the valid bit */
+ ret = cmpxchg(&pe->tce_tracker[i * 2],
+ (addr - offset), (addr - offset) | 1);
+ if (ret != (addr - offset)) {
+ /*
+ * XXX Same situation as above. We'd probably
+ * want to null out entry2 as well.
+ */
+ i--;
+ continue;
+ }
+ return (tce << phb->ioda.max_tce_order) | offset;
+ /* it's a valid entry but not ours, keep looking */
+ } else {
+ continue;
+ }
+ }
+ /* If we get here, the table must be full, so error out. */
+ return -1ULL;
+}
+
+/*
+ * For the moment, unmapping just decrements the refcount and doesn't actually
+ * remove the TCE. This is because it's very likely that a previously allocated
+ * TCE will be used again, and this saves having to invalidate it.
+ *
+ * TODO implement some kind of garbage collection that clears unused TCE entries
+ * once the table reaches a certain size.
+ */
+static void dma_pseudo_bypass_unmap_address(struct device *dev, dma_addr_t dma_addr)
+{
+ struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pnv_ioda_pe *pe;
+ u64 i, entry1, entry2, dma_prefix, refcount;
+
+ pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number];
+
+ for (i = 0; i < pe->tce_count; i++) {
+ entry1 = pe->tce_tracker[i * 2];
+ entry2 = pe->tce_tracker[i * 2 + 1];
+ dma_prefix = entry2 >> 34;
+ refcount = entry2 & ((1 << 30) - 1);
+
+ /* look through entry2 until we find our address */
+ if (dma_prefix == (dma_addr >> phb->ioda.max_tce_order)) {
+ refcount--;
+ cmpxchg(&pe->tce_tracker[i * 2 + 1], entry2, (dma_prefix << 34) | refcount);
+ if (!refcount) {
+ /*
+ * Here is where we would remove the valid bit
+ * from entry1, clear the entry in the TCE table
+ * and invalidate the TCE - but we want to leave
+ * them until the table fills up (for now).
+ */
+ }
+ break;
+ }
+ }
+}
+
+static int dma_pseudo_bypass_dma_supported(struct device *dev, u64 mask)
+{
+ /*
+ * Normally dma_supported() checks if the mask is capable of addressing
+ * all of memory. Since we map physical memory in chunks that the
+ * device can address, the device will be able to address whatever it
+ * wants - just not all at once.
+ */
+ return 1;
+}
+
+static void *dma_pseudo_bypass_alloc_coherent(struct device *dev,
+ size_t size,
+ dma_addr_t *dma_handle,
+ gfp_t flag,
+ unsigned long attrs)
+{
+ void *ret;
+ struct page *page;
+ int node = dev_to_node(dev);
+
+ /* ignore region specifiers */
+ flag &= ~(__GFP_HIGHMEM);
+
+ page = alloc_pages_node(node, flag, get_order(size));
+ if (page == NULL)
+ return NULL;
+ ret = page_address(page);
+ memset(ret, 0, size);
+ *dma_handle = dma_pseudo_bypass_get_address(dev, __pa(ret));
+
+ return ret;
+}
+
+static void dma_pseudo_bypass_free_coherent(struct device *dev,
+ size_t size,
+ void *vaddr,
+ dma_addr_t dma_handle,
+ unsigned long attrs)
+{
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static int dma_pseudo_bypass_mmap_coherent(struct device *dev,
+ struct vm_area_struct *vma,
+ void *cpu_addr,
+ dma_addr_t handle,
+ size_t size,
+ unsigned long attrs)
+{
+ unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr));
+
+ return remap_pfn_range(vma, vma->vm_start,
+ pfn + vma->vm_pgoff,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+}
+
+static inline dma_addr_t dma_pseudo_bypass_map_page(struct device *dev,
+ struct page *page,
+ unsigned long offset,
+ size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ BUG_ON(dir == DMA_NONE);
+
+ /* XXX I don't know if this is necessary (or even desired) */
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ __dma_sync_page(page, offset, size, dir);
+
+ return dma_pseudo_bypass_get_address(dev, page_to_phys(page) + offset);
+}
+
+static inline void dma_pseudo_bypass_unmap_page(struct device *dev,
+ dma_addr_t dma_address,
+ size_t size,
+ enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ dma_pseudo_bypass_unmap_address(dev, dma_address);
+}
+
+
+static int dma_pseudo_bypass_map_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+
+ for_each_sg(sgl, sg, nents, i) {
+ sg->dma_address = dma_pseudo_bypass_get_address(dev, sg_phys(sg));
+ sg->dma_length = sg->length;
+
+ if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+ continue;
+
+ __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
+ }
+
+ return nents;
+}
+
+static void dma_pseudo_bypass_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction,
+ unsigned long attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i) {
+ dma_pseudo_bypass_unmap_address(dev, sg->dma_address);
+ }
+}
+
+static u64 dma_pseudo_bypass_get_required_mask(struct device *dev)
+{
+ /*
+ * there's no limitation on our end, the driver should just call
+ * set_mask() with as many bits as the device can address.
+ */
+ return -1ULL;
+}
+
+static int dma_pseudo_bypass_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return dma_addr == -1ULL;
+}
+
+
+const struct dma_map_ops dma_pseudo_bypass_ops = {
+ .alloc = dma_pseudo_bypass_alloc_coherent,
+ .free = dma_pseudo_bypass_free_coherent,
+ .mmap = dma_pseudo_bypass_mmap_coherent,
+ .map_sg = dma_pseudo_bypass_map_sg,
+ .unmap_sg = dma_pseudo_bypass_unmap_sg,
+ .dma_supported = dma_pseudo_bypass_dma_supported,
+ .map_page = dma_pseudo_bypass_map_page,
+ .unmap_page = dma_pseudo_bypass_unmap_page,
+ .get_required_mask = dma_pseudo_bypass_get_required_mask,
+ .mapping_error = dma_pseudo_bypass_mapping_error,
+};
+EXPORT_SYMBOL(dma_pseudo_bypass_ops);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9f40f235b39e..b982558a92ac 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -25,6 +25,7 @@
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <linux/sizes.h>
+#include <linux/vmalloc.h>
#include <asm/sections.h>
#include <asm/io.h>
@@ -1085,6 +1086,9 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
pe->pbus = NULL;
pe->mve_number = -1;
pe->rid = dev->bus->number << 8 | pdn->devfn;
+ pe->tces = NULL;
+ pe->tce_tracker = NULL;
+ pe->tce_bitmap = NULL;
pe_info(pe, "Associated device to PE\n");
@@ -1566,6 +1570,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
pe->mve_number = -1;
pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
pci_iov_virtfn_devfn(pdev, vf_index);
+ pe->tces = NULL;
+ pe->tce_tracker = NULL;
+ pe->tce_bitmap = NULL;
pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
hose->global_number, pdev->bus->number,
@@ -1771,43 +1778,40 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
return true;
}
-/*
- * Reconfigure TVE#0 to be usable as 64-bit DMA space.
- *
- * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
- * Devices can only access more than that if bit 59 of the PCI address is set
- * by hardware, which indicates TVE#1 should be used instead of TVE#0.
- * Many PCI devices are not capable of addressing that many bits, and as a
- * result are limited to the 4GB of virtual memory made available to 32-bit
- * devices in TVE#0.
- *
- * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
- * devices by configuring the virtual memory past the first 4GB inaccessible
- * by 64-bit DMAs. This should only be used by devices that want more than
- * 4GB, and only on PEs that have no 32-bit devices.
- *
- * Currently this will only work on PHB3 (POWER8).
- */
-static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
+static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe)
{
- u64 window_size, table_size, tce_count, addr;
+ u64 tce_count, table_size, window_size;
+ struct pnv_phb *p = pe->phb;
struct page *table_pages;
- u64 tce_order = 28; /* 256MB TCEs */
__be64 *tces;
- s64 rc;
+ int rc = -ENOMEM;
+ int bitmap_size, tracker_entries;
+
+ /*
+ * XXX These are factors for scaling the size of the TCE table, and
+ * the table that tracks these allocations. These should eventually
+ * be kernel command line options with defaults above 1, for situations
+ * where your memory expands after the machine has booted.
+ */
+ int tce_size_factor = 1;
+ int tracking_table_factor = 1;
/*
- * Window size needs to be a power of two, but needs to account for
- * shifting memory by the 4GB offset required to skip 32bit space.
+ * The window size covers all of memory (and optionally more), with
+ * enough tracker entries to cover them all being allocated. So we
+ * create enough TCEs to cover all of memory at once.
*/
- window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
- tce_count = window_size >> tce_order;
+ window_size = roundup_pow_of_two(tce_size_factor * memory_hotplug_max());
+ tracker_entries = (tracking_table_factor * memory_hotplug_max()) >>
+ p->ioda.max_tce_order;
+ tce_count = window_size >> p->ioda.max_tce_order;
+ bitmap_size = BITS_TO_LONGS(tce_count) * sizeof(unsigned long);
table_size = tce_count << 3;
if (table_size < PAGE_SIZE)
table_size = PAGE_SIZE;
- table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
+ table_pages = alloc_pages_node(p->hose->node, GFP_KERNEL,
get_order(table_size));
if (!table_pages)
goto err;
@@ -1818,26 +1822,33 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
memset(tces, 0, table_size);
- for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
- tces[(addr + (1ULL << 32)) >> tce_order] =
- cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
- }
+ pe->tces = tces;
+ pe->tce_count = tce_count;
+ pe->tce_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ /* The tracking table has two u64s per TCE */
+ pe->tce_tracker = vzalloc(sizeof(u64) * 2 * tracker_entries);
+ spin_lock_init(&pe->tce_alloc_lock);
+
+ /* mark the first 4GB as reserved so this can still be used for 32bit */
+ bitmap_set(pe->tce_bitmap, 0, 1ULL << (32 - p->ioda.max_tce_order));
+
+ pe_info(pe, "pseudo-bypass sizes: tracker %d bitmap %d TCEs %lld\n",
+ tracker_entries, bitmap_size, tce_count);
rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
pe->pe_number,
- /* reconfigure window 0 */
(pe->pe_number << 1) + 0,
1,
__pa(tces),
table_size,
- 1 << tce_order);
+ 1 << p->ioda.max_tce_order);
if (rc == OPAL_SUCCESS) {
- pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
+ pe_info(pe, "TCE tables configured for pseudo-bypass\n");
return 0;
}
err:
- pe_err(pe, "Error configuring 64-bit DMA bypass\n");
- return -EIO;
+ pe_err(pe, "error configuring pseudo-bypass\n");
+ return rc;
}
static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
@@ -1848,7 +1859,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
struct pnv_ioda_pe *pe;
uint64_t top;
bool bypass = false;
- s64 rc;
if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
return -ENODEV;
@@ -1865,21 +1875,15 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
} else {
/*
* If the device can't set the TCE bypass bit but still wants
- * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
- * bypass the 32-bit region and be usable for 64-bit DMAs.
- * The device needs to be able to address all of this space.
+ * to access 4GB or more, we need to use a different set of DMA
+ * operations with an indirect mapping.
*/
if (dma_mask >> 32 &&
- dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
- pnv_pci_ioda_pe_single_vendor(pe) &&
- phb->model == PNV_PHB_MODEL_PHB3) {
- /* Configure the bypass mode */
- rc = pnv_pci_ioda_dma_64bit_bypass(pe);
- if (rc)
- return rc;
- /* 4GB offset bypasses 32-bit space */
- set_dma_offset(&pdev->dev, (1ULL << 32));
- set_dma_ops(&pdev->dev, &dma_nommu_ops);
+ phb->model != PNV_PHB_MODEL_P7IOC &&
+ pnv_pci_ioda_pe_single_vendor(pe)) {
+ if (!pe->tces)
+ pnv_pci_pseudo_bypass_setup(pe);
+ set_dma_ops(&pdev->dev, &dma_pseudo_bypass_ops);
} else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
/*
* Fail the request if a DMA mask between 32 and 64 bits
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index ca5414055972..9418c6ea189b 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -70,6 +70,13 @@ struct pnv_ioda_pe {
bool tce_bypass_enabled;
uint64_t tce_bypass_base;
+ /* TCE tables for DMA pseudo-bypass */
+ __be64 *tces;
+ u64 tce_count;
+ unsigned long *tce_bitmap;
+ u64 *tce_tracker; // 2 u64s per TCE
+ spinlock_t tce_alloc_lock;
+
/* MSIs. MVE index is identical for for 32 and 64 bit MSI
* and -1 if not supported. (It's actually identical to the
* PE number)
--
2.17.1
From 2e6abf2b56d40a953eaa39e2bee064bfcc1da6d1 Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Wed, 6 Jun 2018 13:36:06 +1000
Subject: [PATCH 3/9] powerpc/powernv/pci: Track DMA and TCE tables in debugfs
Add a new debugfs entry to trigger dumping out the tracking table and
TCEs for a given PE, for example PE 0x4 of PHB 2:
echo 0x4 > /sys/kernel/debug/powerpc/PCI0002/sketchy
This will result in the table being dumped out in dmesg.
Signed-off-by: Russell Currey <ruscur@russell.cc>
---
arch/powerpc/platforms/powernv/pci-ioda.c | 43 +++++++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b982558a92ac..3598ca8daa7c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3203,6 +3203,47 @@ static int pnv_pci_diag_data_set(void *data, u64 val)
DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL,
pnv_pci_diag_data_set, "%llu\n");
+static int pnv_pci_sketchy_set(void *data, u64 val)
+{
+ struct pci_controller *hose;
+ struct pnv_ioda_pe *pe;
+ struct pnv_phb *phb;
+ u64 entry1, entry2;
+ int i;
+
+ hose = (struct pci_controller *)data;
+ if (!hose || !hose->private_data)
+ return -ENODEV;
+
+ phb = hose->private_data;
+ pe = &phb->ioda.pe_array[val];
+
+ if (!pe)
+ return -EINVAL;
+
+ if (!pe->tces || !pe->tce_tracker)
+ return -EIO;
+
+ for (i = 0; i < pe->tce_count; i++) {
+ if (i > 16 && pe->tces[i] == 0)
+ break;
+ pr_info("%3d: %016llx\n", i, be64_to_cpu(pe->tces[i]));
+ }
+
+ for (i = 0; i < pe->tce_count; i++) {
+ entry1 = pe->tce_tracker[i * 2];
+ entry2 = pe->tce_tracker[i * 2 + 1];
+ if (!entry1)
+ break;
+ pr_info("%3d: %016llx %016llx\n", i, entry1, entry2);
+ }
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_sketchy_fops, NULL,
+ pnv_pci_sketchy_set, "%llu\n");
+
+
#endif /* CONFIG_DEBUG_FS */
static void pnv_pci_ioda_create_dbgfs(void)
@@ -3228,6 +3269,8 @@ static void pnv_pci_ioda_create_dbgfs(void)
debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose,
&pnv_pci_diag_data_fops);
+ debugfs_create_file("sketchy", 0200, phb->dbgfs, hose,
+ &pnv_pci_sketchy_fops);
}
#endif /* CONFIG_DEBUG_FS */
}
--
2.17.1
From 8456e9247c21d9fc7838dd5a71435342f7b79f88 Mon Sep 17 00:00:00 2001
From: Russell Currey <ruscur@russell.cc>
Date: Tue, 19 Jun 2018 16:21:13 +1000
Subject: [PATCH 4/9] powerpc/powernv/pci: Safety fixes for pseudobypass TCE
allocation
Signed-off-by: Russell Currey <ruscur@russell.cc>
---
arch/powerpc/platforms/powernv/pci-dma.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c
index 1d5409be343e..237940a2a052 100644
--- a/arch/powerpc/platforms/powernv/pci-dma.c
+++ b/arch/powerpc/platforms/powernv/pci-dma.c
@@ -29,8 +29,9 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr
{
int tce;
__be64 old, new;
+ unsigned long flags;
- spin_lock(&pe->tce_alloc_lock);
+ spin_lock_irqsave(&pe->tce_alloc_lock, flags);
tce = bitmap_find_next_zero_area(pe->tce_bitmap,
pe->tce_count,
0,
@@ -40,9 +41,10 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr
old = pe->tces[tce];
new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
pe->tces[tce] = new;
+ mb();
pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n",
tce, new, old);
- spin_unlock(&pe->tce_alloc_lock);
+ spin_unlock_irqrestore(&pe->tce_alloc_lock, flags);
return tce;
}
--
2.17.1
From b8949ef1b1bb5977ba9fb35f06d1466b6be475a5 Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineering.com>
Date: Sat, 23 Jun 2018 16:20:48 -0500
Subject: [PATCH 5/9] powerpc/powernv/pci: Export
pnv_pci_ioda2_tce_invalidate_pe
Pseudo DMA support requires a method to invalidate the TCE cache.
Export pnv_pci_ioda2_tce_invalidate_pe for use by the pseudo DMA
mapper.
Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
---
arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
arch/powerpc/platforms/powernv/pci.h | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3598ca8daa7c..83f9db17e711 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2100,7 +2100,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
}
}
-static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
+void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
{
struct pnv_phb *phb = pe->phb;
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 9418c6ea189b..bea565c3f302 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -244,6 +244,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
/* Nvlink functions */
extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe);
extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
struct iommu_table *tbl);
--
2.17.1
From 4d8211098c35d5b7556f756db18dba89c015600d Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineering.com>
Date: Sat, 23 Jun 2018 16:22:59 -0500
Subject: [PATCH 6/9] powerpc/powernv/pci: Invalidate TCE cache after DMA map
setup
Per the IODA2 specification, TCEs must be invalidated after their settings
have been changed. Invalidate the cache after the address
is changed during TCE allocation when using pseudo DMA.
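
The ordering this enforces, sketched in kernel style (not a literal excerpt
from the patch; pe->tces, mb() and pnv_pci_ioda2_tce_invalidate_pe() are the
symbols the series already uses):

/* Update a TCE, make the store visible, then flush the PHB's TCE cache. */
static void pseudo_dma_set_tce(struct pnv_ioda_pe *pe, int tce, __be64 val)
{
	pe->tces[tce] = val;			/* write the new mapping */
	mb();					/* order the store before the flush */
	pnv_pci_ioda2_tce_invalidate_pe(pe);	/* invalidate cached TCEs */
}
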
Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
---
arch/powerpc/platforms/powernv/pci-dma.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c
index 237940a2a052..060dbc168401 100644
--- a/arch/powerpc/platforms/powernv/pci-dma.c
+++ b/arch/powerpc/platforms/powernv/pci-dma.c
@@ -42,8 +42,7 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr
new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
pe->tces[tce] = new;
mb();
- pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n",
- tce, new, old);
+ pnv_pci_ioda2_tce_invalidate_pe(pe);
spin_unlock_irqrestore(&pe->tce_alloc_lock, flags);
return tce;
--
2.17.1
From a963913380c91a465509bae341da1e8aac40cdee Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineering.com>
Date: Sat, 23 Jun 2018 16:25:16 -0500
Subject: [PATCH 7/9] powerpc/powernv/pci: Don't use the lower 4G TCEs in
pseudo-DMA mode
Four TCEs are reserved for legacy 32-bit DMA mappings in pseudo DMA
mode. Mark these with an invalid address to avoid their use by
the TCE cache mapper.
Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
---
arch/powerpc/platforms/powernv/pci-ioda.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 83f9db17e711..f4cd6a5c2bc7 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1780,7 +1780,7 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe)
{
- u64 tce_count, table_size, window_size;
+ u64 i, tce_count, table_size, window_size;
struct pnv_phb *p = pe->phb;
struct page *table_pages;
__be64 *tces;
@@ -1832,6 +1832,12 @@ static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe)
/* mark the first 4GB as reserved so this can still be used for 32bit */
bitmap_set(pe->tce_bitmap, 0, 1ULL << (32 - p->ioda.max_tce_order));
+ /* make sure reserved first 4GB TCEs are not used by the mapper
+ * set each address to -1, which will never match an incoming request
+ */
+ for (i = 0; i < 4; i++)
+ pe->tce_tracker[i * 2] = -1;
+
pe_info(pe, "pseudo-bypass sizes: tracker %d bitmap %d TCEs %lld\n",
tracker_entries, bitmap_size, tce_count);
--
2.17.1
From d397370596955d166c76a1f487ef53d8dbf52d9c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 11 Sep 2018 17:00:30 +1000
Subject: [PATCH 8/9] KVM: PPC: Book3S HV: Allocate a memory area exclusively
for HPTs
Currently we allocate HPTs (hashed page tables) for guests using the
CMA (contiguous memory allocator) facility. However, there are
situations where the CMA region can get fragmented, notably when
lots of guest pages get pinned for PCI pass-through, which then causes
HPT allocations to fail even if there is sufficient CMA memory
available overall.
This commit adds the capability to reserve some memory at boot time
exclusively for HPTs for KVM guests. The amount is controlled with
the kvm_hpt_resv_ratio=N kernel command-line option, where N is the
percentage of system memory to reserve. This reserved memory will
be used first, and only when a guest HPT can't be allocated from this
reserved memory will the CMA region be used.
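
A back-of-the-envelope sketch of what the default ratio reserves (userspace
illustration; the 512GB memory size is an assumed example, and the 256KB
chunk size matches the HPT alignment used by the patch):

#include <stdio.h>

#define CHUNK_ORDER 18				/* 256KB chunks */

int main(void)
{
	unsigned long long mem = 512ULL << 30;	/* assume 512GB of system RAM */
	unsigned long long ratio = 2;		/* kvm_hpt_resv_ratio=2 */
	unsigned long long size = mem * ratio / 100;

	/* round up to whole chunks, as the reservation bitmap expects */
	size = (size + (1ULL << CHUNK_ORDER) - 1) & ~((1ULL << CHUNK_ORDER) - 1);

	printf("reserve %llu MiB in %llu chunks\n",
	       size >> 20, size >> CHUNK_ORDER);
	return 0;
}
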
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
arch/powerpc/include/asm/kvm_host.h | 2 +
arch/powerpc/include/asm/kvm_ppc.h | 7 ++
arch/powerpc/kernel/setup-common.c | 3 +
arch/powerpc/kernel/setup.h | 6 +-
arch/powerpc/kvm/book3s_64_mmu_hv.c | 25 +++++--
arch/powerpc/kvm/book3s_hv_builtin.c | 105 ++++++++++++++++++++++++++-
6 files changed, 136 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 906bcbdfd2a1..053ba320db49 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -258,6 +258,8 @@ struct kvm_hpt_info {
struct revmap_entry *rev;
/* Guest HPT size is 2**(order) bytes */
u32 order;
+ /* 1 if HPT allocated from reserved region, 0 otherwise */
+ int resv;
/* 1 if HPT allocated with CMA, 0 otherwise */
int cma;
};
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e991821dd7fa..9625b0dd28cc 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -210,6 +210,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
unsigned long tce_value, unsigned long npages);
extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
unsigned long ioba);
+extern unsigned long kvmhv_alloc_resv_hpt(u32 order);
+extern void kvmhv_release_resv_hpt(unsigned long hpt, u32 order);
extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages);
extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages);
extern int kvmppc_core_init_vm(struct kvm *kvm);
@@ -436,6 +438,8 @@ struct openpic;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
extern void kvm_cma_reserve(void) __init;
+extern void kvm_resv_hpt_init(void);
+
static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
{
paca_ptrs[cpu]->kvm_hstate.xics_phys = (void __iomem *)addr;
@@ -476,6 +480,9 @@ extern bool kvm_hv_mode_active(void);
static inline void __init kvm_cma_reserve(void)
{}
+static inline void kvm_resv_hpt_init(void)
+{}
+
static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
{}
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 93fa0c99681e..38e36c67ab2f 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -979,6 +979,9 @@ void __init setup_arch(char **cmdline_p)
/* Initialize the MMU context management stuff. */
mmu_context_init();
+ /* Reserve memory for KVM HPTs */
+ kvm_resv_hpt_init();
+
#ifdef CONFIG_PPC64
/* Interrupt code needs to be 64K-aligned. */
if ((unsigned long)_stext & 0xffff)
diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
index c6a592b67386..6de1fac35774 100644
--- a/arch/powerpc/kernel/setup.h
+++ b/arch/powerpc/kernel/setup.h
@@ -53,13 +53,15 @@ extern unsigned long spr_default_dscr;
#endif
/*
- * Having this in kvm_ppc.h makes include dependencies too
- * tricky to solve for setup-common.c so have it here.
+ * Having these in kvm_ppc.h makes include dependencies too
+ * tricky to solve for setup-common.c so have them here.
*/
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
void kvm_cma_reserve(void);
+void kvm_resv_hpt_init(void);
#else
static inline void kvm_cma_reserve(void) { };
+static inline void kvm_resv_hpt_init(void) { }
#endif
#ifdef CONFIG_TAU
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 68e14afecac8..9e607014f4c7 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -81,7 +81,7 @@ struct kvm_resize_hpt {
int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
{
unsigned long hpt = 0;
- int cma = 0;
+ int resv = 0, cma = 0;
struct page *page = NULL;
struct revmap_entry *rev;
unsigned long npte;
@@ -89,11 +89,17 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
return -EINVAL;
- page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
- if (page) {
- hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+ hpt = kvmhv_alloc_resv_hpt(order);
+ if (hpt) {
memset((void *)hpt, 0, (1ul << order));
- cma = 1;
+ resv = 1;
+ } else {
+ page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
+ if (page) {
+ hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+ memset((void *)hpt, 0, (1ul << order));
+ cma = 1;
+ }
}
if (!hpt)
@@ -109,7 +115,9 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
/* Allocate reverse map array */
rev = vmalloc(array_size(npte, sizeof(struct revmap_entry)));
if (!rev) {
- if (cma)
+ if (resv)
+ kvmhv_release_resv_hpt(hpt, order);
+ else if (cma)
kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
else
free_pages(hpt, order - PAGE_SHIFT);
@@ -118,6 +126,7 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
info->order = order;
info->virt = hpt;
+ info->resv = resv;
info->cma = cma;
info->rev = rev;
@@ -191,7 +200,9 @@ void kvmppc_free_hpt(struct kvm_hpt_info *info)
{
vfree(info->rev);
info->rev = NULL;
- if (info->cma)
+ if (info->resv)
+ kvmhv_release_resv_hpt(info->virt, info->order);
+ else if (info->cma)
kvm_free_hpt_cma(virt_to_page(info->virt),
1 << (info->order - PAGE_SHIFT));
else if (info->virt)
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fc6bb9630a9c..3f36b99fb46b 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -53,11 +53,109 @@ EXPORT_SYMBOL_GPL(__xive_vm_h_eoi);
/*
* Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
- * should be power of 2.
+ * only needs to be 256kB.
*/
-#define HPT_ALIGN_PAGES ((1 << 18) >> PAGE_SHIFT) /* 256k */
+#define HPT_ALIGN_ORDER 18 /* 256k */
+#define HPT_ALIGN_PAGES ((1 << HPT_ALIGN_ORDER) >> PAGE_SHIFT)
+
+#define KVM_RESV_CHUNK_ORDER HPT_ALIGN_ORDER
+
/*
- * By default we reserve 5% of memory for hash pagetable allocation.
+ * By default we reserve 2% of memory exclusively for guest HPT
+ * allocations, plus another 3% in the CMA zone which can be used
+ * either for HPTs or for movable page allocations.
+ * Each guest's HPT will be sized at between 1/128 and 1/64 of its
+ * memory, i.e. up to 1.56%, and allowing for about a 3x memory
+ * overcommit factor gets us to about 5%.
+ */
+static unsigned long kvm_hpt_resv_ratio = 2;
+
+static int __init early_parse_kvm_hpt_resv(char *p)
+{
+ pr_debug("%s(%s)\n", __func__, p);
+ if (!p)
+ return -EINVAL;
+ return kstrtoul(p, 0, &kvm_hpt_resv_ratio);
+}
+early_param("kvm_hpt_resv_ratio", early_parse_kvm_hpt_resv);
+
+static unsigned long kvm_resv_addr;
+static unsigned long *kvm_resv_bitmap;
+static unsigned long kvm_resv_chunks;
+static DEFINE_MUTEX(kvm_resv_lock);
+
+void kvm_resv_hpt_init(void)
+{
+ unsigned long align = 1ul << KVM_RESV_CHUNK_ORDER;
+ unsigned long size, bm_size;
+ unsigned long addr, bm;
+ unsigned long *bmp;
+
+ if (!cpu_has_feature(CPU_FTR_HVMODE))
+ return;
+
+ size = memblock_phys_mem_size() * kvm_hpt_resv_ratio / 100;
+ size = ALIGN(size, align);
+ if (!size)
+ return;
+
+ pr_info("KVM: Allocating %lu MiB for hashed page tables\n",
+ size >> 20);
+
+ addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (!addr) {
+ pr_err("KVM: Allocation of reserved memory for HPTs failed\n");
+ return;
+ }
+ pr_info("KVM: %lu MiB reserved for HPTs at %lx\n", size >> 20, addr);
+
+ bm_size = BITS_TO_LONGS(size >> KVM_RESV_CHUNK_ORDER) * sizeof(long);
+ bm = __memblock_alloc_base(bm_size, sizeof(long),
+ MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (!bm) {
+ pr_err("KVM: Allocation of reserved memory bitmap failed\n");
+ return;
+ }
+ bmp = __va(bm);
+ memset(bmp, 0, bm_size);
+
+ kvm_resv_addr = (unsigned long) __va(addr);
+ kvm_resv_chunks = size >> KVM_RESV_CHUNK_ORDER;
+ kvm_resv_bitmap = bmp;
+}
+
+unsigned long kvmhv_alloc_resv_hpt(u32 order)
+{
+ unsigned long nr_chunks = 1ul << (order - KVM_RESV_CHUNK_ORDER);
+ unsigned long chunk;
+
+ mutex_lock(&kvm_resv_lock);
+ chunk = bitmap_find_next_zero_area(kvm_resv_bitmap, kvm_resv_chunks,
+ 0, nr_chunks, 0);
+ if (chunk < kvm_resv_chunks)
+ bitmap_set(kvm_resv_bitmap, chunk, nr_chunks);
+ mutex_unlock(&kvm_resv_lock);
+
+ if (chunk < kvm_resv_chunks)
+ return kvm_resv_addr + (chunk << KVM_RESV_CHUNK_ORDER);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvmhv_alloc_resv_hpt);
+
+void kvmhv_release_resv_hpt(unsigned long addr, u32 order)
+{
+ unsigned long nr_chunks = 1ul << (order - KVM_RESV_CHUNK_ORDER);
+ unsigned long chunk = (addr - kvm_resv_addr) >> KVM_RESV_CHUNK_ORDER;
+
+ mutex_lock(&kvm_resv_lock);
+ if (chunk + nr_chunks <= kvm_resv_chunks)
+ bitmap_clear(kvm_resv_bitmap, chunk, nr_chunks);
+ mutex_unlock(&kvm_resv_lock);
+}
+EXPORT_SYMBOL_GPL(kvmhv_release_resv_hpt);
+
+/*
+ * By default we reserve 3% of memory for the CMA zone.
*/
static unsigned long kvm_cma_resv_ratio = 5;
@@ -106,6 +204,7 @@ void __init kvm_cma_reserve(void)
*/
if (!cpu_has_feature(CPU_FTR_HVMODE))
return;
+
/*
* We cannot use memblock_phys_mem_size() here, because
* memblock_analyze() has not been called yet.
--
2.17.1
From 57a21f640a2b6d2e225cac7df35ed0dde7c6293f Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineering.com>
Date: Sun, 17 Jun 2018 23:59:51 -0500
Subject: [PATCH 9/9] Fix undefined behaviour from signed integer overflow
Caught by UBSAN
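
For context: src is a uint8_t buffer, so each byte is promoted to signed int
before the multiply, and e.g. 0xde * 0x1000000 does not fit in a 32-bit int.
The unsigned constants keep the whole expression in unsigned arithmetic. A
standalone demonstration (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t src[4] = { 0xde, 0xad, 0xbe, 0xef };

	/* src[0] * 0x1000000 would promote to int and overflow (UB);
	 * the U suffixes keep the arithmetic in unsigned int instead. */
	uint32_t data = src[0] * 0x1000000U + src[1] * 0x10000U +
			src[2] * 0x100U + src[3];

	printf("0x%08x\n", (unsigned int)data);	/* prints 0xdeadbeef */
	return 0;
}
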
Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
---
drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c b/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c
index a029e47c2319..f7e56bec2dd7 100644
--- a/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c
+++ b/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c
@@ -98,7 +98,7 @@ int smu7_copy_bytes_to_smc(struct pp_hwmgr *hwmgr, uint32_t smc_start_address,
while (byte_count >= 4) {
/* Bytes are written into the SMC addres space with the MSB first. */
- data = src[0] * 0x1000000 + src[1] * 0x10000 + src[2] * 0x100 + src[3];
+ data = src[0] * 0x1000000U + src[1] * 0x10000U + src[2] * 0x100U + src[3];
result = smu7_set_smc_sram_address(hwmgr, addr, limit);
--
2.17.1