diff --git a/kernel.spec b/kernel.spec index a65c882f4..c496d0cc2 100644 --- a/kernel.spec +++ b/kernel.spec @@ -367,7 +367,7 @@ Name: kernel%{?variant} License: GPLv2 and Redistributable, no modification permitted URL: https://www.kernel.org/ Version: %{rpmversion} -Release: %{pkg_release} +Release: %{pkg_release}.op.1 # DO NOT CHANGE THE 'ExclusiveArch' LINE TO TEMPORARILY EXCLUDE AN ARCHITECTURE BUILD. # SET %%nobuildarches (ABOVE) INSTEAD ExclusiveArch: %{all_x86} x86_64 s390x %{arm} aarch64 ppc64le @@ -593,6 +593,7 @@ Patch350: arm64-arch_timer-Workaround-for-Allwinner-A64-timer-instability.patch Patch351: arm64-dts-allwinner-a64-Enable-A64-timer-workaround.patch # 400 - IBM (ppc/s390x) patches +Patch400: ppc64-talos.patch # 500 - Temp fixes/CVEs etc diff --git a/ppc64-talos.patch b/ppc64-talos.patch new file mode 100644 index 000000000..37e8fe9d5 --- /dev/null +++ b/ppc64-talos.patch @@ -0,0 +1,1248 @@ +From ba7256030f1b04e56096e9796bdf478f12872403 Mon Sep 17 00:00:00 2001 +From: Russell Currey +Date: Mon, 9 Apr 2018 17:29:36 +1000 +Subject: [PATCH 1/9] powerpc/powernv/pci: Track largest available TCE order + per PHB + +Knowing the largest possible TCE size of a PHB is useful, so get it out +of the device tree. This relies on the property being added in OPAL. + +It is assumed that any PHB4 or later machine would be running firmware +that implemented this property, and otherwise assumed to be PHB3, which +has a maximum TCE order of 28 bits or 256MB TCEs. + +This is used later in the series. + +Signed-off-by: Russell Currey +--- + arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++++++++++++++++ + arch/powerpc/platforms/powernv/pci.h | 3 +++ + 2 files changed, 19 insertions(+) + +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c +index cde710297a4e..9f40f235b39e 100644 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c +@@ -3751,11 +3751,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, + struct resource r; + const __be64 *prop64; + const __be32 *prop32; ++ struct property *prop; + int len; + unsigned int segno; + u64 phb_id; + void *aux; + long rc; ++ u32 val; + + if (!of_device_is_available(np)) + return; +@@ -3894,6 +3896,20 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, + } + phb->ioda.pe_array = aux + pemap_off; + ++ phb->ioda.max_tce_order = 0; ++ // Get TCE order from the DT. If it's not present, assume P8 ++ if (!of_get_property(np, "ibm,supported-tce-sizes", NULL)) { ++ phb->ioda.max_tce_order = 28; // assume P8 256mb TCEs ++ } else { ++ of_property_for_each_u32(np, "ibm,supported-tce-sizes", prop, ++ prop32, val) { ++ if (val > phb->ioda.max_tce_order) ++ phb->ioda.max_tce_order = val; ++ } ++ pr_debug("PHB%llx Found max TCE order of %d bits\n", ++ phb->opal_id, phb->ioda.max_tce_order); ++ } ++ + /* + * Choose PE number for root bus, which shouldn't have + * M64 resources consumed by its child devices. 
To pick +diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h +index 8b37b28e3831..ca5414055972 100644 +--- a/arch/powerpc/platforms/powernv/pci.h ++++ b/arch/powerpc/platforms/powernv/pci.h +@@ -172,6 +172,9 @@ struct pnv_phb { + struct list_head pe_list; + struct mutex pe_list_mutex; + ++ /* Largest supported TCE order bits */ ++ uint8_t max_tce_order; ++ + /* Reverse map of PEs, indexed by {bus, devfn} */ + unsigned int pe_rmap[0x10000]; + } ioda; +-- +2.17.1 + + +From b3f3546c4b4225093f41a4caa25648718170a093 Mon Sep 17 00:00:00 2001 +From: Russell Currey +Date: Mon, 9 Apr 2018 17:34:37 +1000 +Subject: [PATCH 2/9] powerpc/powernv: DMA operations for discontiguous + allocation + +Cognitive DMA is a new set of DMA operations that solve some issues for +devices that want to address more than 32 bits but can't address the 59 +bits required to enable direct DMA. + +The previous implementation for POWER8/PHB3 worked around this by +configuring a bypass from the default 32-bit address space into 64-bit +address space. This approach does not work for POWER9/PHB4 because +regions of memory are discontiguous and many devices will be unable to +address memory beyond the first node. + +Instead, implement a new set of DMA operations that allocate TCEs as DMA +mappings are requested so that all memory is addressable even when a +one-to-one mapping between real addresses and DMA addresses isn't +possible. These TCEs are the maximum size available on the platform, +which is 256M on PHB3 and 1G on PHB4. + +Devices can now map any region of memory up to the maximum amount they can +address according to the DMA mask set, in chunks of the largest available +TCE size. + +This implementation replaces the need for the existing PHB3 solution and +should be compatible with future PHB versions. 
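+
+As a rough sketch of the address translation this introduces (mirroring
+dma_pseudo_bypass_get_address() in pci-dma.c below; the helper name here is
+hypothetical and shown only for illustration), a returned DMA address is the
+index of the chosen huge TCE concatenated with the offset inside that TCE:
+
+	/* illustrative only - not part of this patch */
+	static dma_addr_t pseudo_bypass_example(u64 tce_index, phys_addr_t phys,
+						unsigned int max_tce_order)
+	{
+		/* offset of the physical address within its huge TCE */
+		u64 offset = phys & ((1ULL << max_tce_order) - 1);
+
+		/* DMA address = TCE index in the high bits, offset in the low bits */
+		return (tce_index << max_tce_order) | offset;
+	}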
+ +Signed-off-by: Russell Currey +--- + arch/powerpc/include/asm/dma-mapping.h | 1 + + arch/powerpc/platforms/powernv/Makefile | 2 +- + arch/powerpc/platforms/powernv/pci-dma.c | 319 ++++++++++++++++++++++ + arch/powerpc/platforms/powernv/pci-ioda.c | 102 +++---- + arch/powerpc/platforms/powernv/pci.h | 7 + + 5 files changed, 381 insertions(+), 50 deletions(-) + create mode 100644 arch/powerpc/platforms/powernv/pci-dma.c + +diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h +index 8fa394520af6..354f435160f3 100644 +--- a/arch/powerpc/include/asm/dma-mapping.h ++++ b/arch/powerpc/include/asm/dma-mapping.h +@@ -74,6 +74,7 @@ static inline unsigned long device_to_mask(struct device *dev) + extern struct dma_map_ops dma_iommu_ops; + #endif + extern const struct dma_map_ops dma_nommu_ops; ++extern const struct dma_map_ops dma_pseudo_bypass_ops; + + static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) + { +diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile +index b540ce8eec55..7cfc821508c3 100644 +--- a/arch/powerpc/platforms/powernv/Makefile ++++ b/arch/powerpc/platforms/powernv/Makefile +@@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o + obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o + + obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o +-obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o ++obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o pci-dma.o + obj-$(CONFIG_CXL_BASE) += pci-cxl.o + obj-$(CONFIG_EEH) += eeh-powernv.o + obj-$(CONFIG_PPC_SCOM) += opal-xscom.o +diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c +new file mode 100644 +index 000000000000..1d5409be343e +--- /dev/null ++++ b/arch/powerpc/platforms/powernv/pci-dma.c +@@ -0,0 +1,319 @@ ++/* ++ * DMA operations supporting pseudo-bypass for PHB3+ ++ * ++ * Author: Russell Currey ++ * ++ * Copyright 2018 IBM Corporation. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include "pci.h" ++ ++/* select and allocate a TCE using the bitmap */ ++static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr) ++{ ++ int tce; ++ __be64 old, new; ++ ++ spin_lock(&pe->tce_alloc_lock); ++ tce = bitmap_find_next_zero_area(pe->tce_bitmap, ++ pe->tce_count, ++ 0, ++ 1, ++ 0); ++ bitmap_set(pe->tce_bitmap, tce, 1); ++ old = pe->tces[tce]; ++ new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); ++ pe->tces[tce] = new; ++ pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n", ++ tce, new, old); ++ spin_unlock(&pe->tce_alloc_lock); ++ ++ return tce; ++} ++ ++/* ++ * The tracking table for assigning TCEs has two entries per TCE. ++ * - @entry1 contains the physical address and the smallest bit indicates ++ * if it's currently valid. ++ * - @entry2 contains the DMA address returned in the upper 34 bits, and a ++ * refcount in the lower 30 bits. 
++ */ ++static dma_addr_t dma_pseudo_bypass_get_address(struct device *dev, ++ phys_addr_t addr) ++{ ++ struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); ++ struct pci_controller *hose = pci_bus_to_host(pdev->bus); ++ struct pnv_phb *phb = hose->private_data; ++ struct pnv_ioda_pe *pe; ++ u64 i, entry1, entry2, dma_prefix, tce, ret; ++ u64 offset = addr & ((1 << phb->ioda.max_tce_order) - 1); ++ ++ pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number]; ++ ++ /* look through the tracking table for a free entry */ ++ for (i = 0; i < pe->tce_count; i++) { ++ entry1 = pe->tce_tracker[i * 2]; ++ entry2 = pe->tce_tracker[i * 2 + 1]; ++ dma_prefix = entry2 >> 34; ++ ++ /* if the address is the same and the entry is valid */ ++ if (entry1 == ((addr - offset) | 1)) { ++ /* all we need to do here is increment the refcount */ ++ ret = cmpxchg(&pe->tce_tracker[i * 2 + 1], ++ entry2, entry2 + 1); ++ if (ret != entry2) { ++ /* conflict, start looking again just in case */ ++ i--; ++ continue; ++ } ++ return (dma_prefix << phb->ioda.max_tce_order) | offset; ++ /* if the entry is invalid then we want to replace it */ ++ } else if (!(entry1 & 1)) { ++ /* set the real address, note that it isn't valid yet */ ++ ret = cmpxchg(&pe->tce_tracker[i * 2], ++ entry1, (addr - offset)); ++ if (ret != entry1) { ++ /* conflict, start looking again */ ++ i--; ++ continue; ++ } ++ ++ /* now we can allocate a TCE */ ++ tce = dma_pseudo_bypass_select_tce(pe, addr - offset); ++ ++ /* set new value, including TCE index and new refcount */ ++ ret = cmpxchg(&pe->tce_tracker[i * 2 + 1], ++ entry2, tce << 34 | 1); ++ if (ret != entry2) { ++ /* ++ * XXX In this case we need to throw out ++ * everything, including the TCE we just ++ * allocated. For now, just leave it. ++ */ ++ i--; ++ continue; ++ } ++ ++ /* now set the valid bit */ ++ ret = cmpxchg(&pe->tce_tracker[i * 2], ++ (addr - offset), (addr - offset) | 1); ++ if (ret != (addr - offset)) { ++ /* ++ * XXX Same situation as above. We'd probably ++ * want to null out entry2 as well. ++ */ ++ i--; ++ continue; ++ } ++ return (tce << phb->ioda.max_tce_order) | offset; ++ /* it's a valid entry but not ours, keep looking */ ++ } else { ++ continue; ++ } ++ } ++ /* If we get here, the table must be full, so error out. */ ++ return -1ULL; ++} ++ ++/* ++ * For the moment, unmapping just decrements the refcount and doesn't actually ++ * remove the TCE. This is because it's very likely that a previously allocated ++ * TCE will be used again, and this saves having to invalidate it. ++ * ++ * TODO implement some kind of garbage collection that clears unused TCE entries ++ * once the table reaches a certain size. 
++ */ ++static void dma_pseudo_bypass_unmap_address(struct device *dev, dma_addr_t dma_addr) ++{ ++ struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); ++ struct pci_controller *hose = pci_bus_to_host(pdev->bus); ++ struct pnv_phb *phb = hose->private_data; ++ struct pnv_ioda_pe *pe; ++ u64 i, entry1, entry2, dma_prefix, refcount; ++ ++ pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number]; ++ ++ for (i = 0; i < pe->tce_count; i++) { ++ entry1 = pe->tce_tracker[i * 2]; ++ entry2 = pe->tce_tracker[i * 2 + 1]; ++ dma_prefix = entry2 >> 34; ++ refcount = entry2 & ((1 << 30) - 1); ++ ++ /* look through entry2 until we find our address */ ++ if (dma_prefix == (dma_addr >> phb->ioda.max_tce_order)) { ++ refcount--; ++ cmpxchg(&pe->tce_tracker[i * 2 + 1], entry2, (dma_prefix << 34) | refcount); ++ if (!refcount) { ++ /* ++ * Here is where we would remove the valid bit ++ * from entry1, clear the entry in the TCE table ++ * and invalidate the TCE - but we want to leave ++ * them until the table fills up (for now). ++ */ ++ } ++ break; ++ } ++ } ++} ++ ++static int dma_pseudo_bypass_dma_supported(struct device *dev, u64 mask) ++{ ++ /* ++ * Normally dma_supported() checks if the mask is capable of addressing ++ * all of memory. Since we map physical memory in chunks that the ++ * device can address, the device will be able to address whatever it ++ * wants - just not all at once. ++ */ ++ return 1; ++} ++ ++static void *dma_pseudo_bypass_alloc_coherent(struct device *dev, ++ size_t size, ++ dma_addr_t *dma_handle, ++ gfp_t flag, ++ unsigned long attrs) ++{ ++ void *ret; ++ struct page *page; ++ int node = dev_to_node(dev); ++ ++ /* ignore region specifiers */ ++ flag &= ~(__GFP_HIGHMEM); ++ ++ page = alloc_pages_node(node, flag, get_order(size)); ++ if (page == NULL) ++ return NULL; ++ ret = page_address(page); ++ memset(ret, 0, size); ++ *dma_handle = dma_pseudo_bypass_get_address(dev, __pa(ret)); ++ ++ return ret; ++} ++ ++static void dma_pseudo_bypass_free_coherent(struct device *dev, ++ size_t size, ++ void *vaddr, ++ dma_addr_t dma_handle, ++ unsigned long attrs) ++{ ++ free_pages((unsigned long)vaddr, get_order(size)); ++} ++ ++static int dma_pseudo_bypass_mmap_coherent(struct device *dev, ++ struct vm_area_struct *vma, ++ void *cpu_addr, ++ dma_addr_t handle, ++ size_t size, ++ unsigned long attrs) ++{ ++ unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr)); ++ ++ return remap_pfn_range(vma, vma->vm_start, ++ pfn + vma->vm_pgoff, ++ vma->vm_end - vma->vm_start, ++ vma->vm_page_prot); ++} ++ ++static inline dma_addr_t dma_pseudo_bypass_map_page(struct device *dev, ++ struct page *page, ++ unsigned long offset, ++ size_t size, ++ enum dma_data_direction dir, ++ unsigned long attrs) ++{ ++ BUG_ON(dir == DMA_NONE); ++ ++ /* XXX I don't know if this is necessary (or even desired) */ ++ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) ++ __dma_sync_page(page, offset, size, dir); ++ ++ return dma_pseudo_bypass_get_address(dev, page_to_phys(page) + offset); ++} ++ ++static inline void dma_pseudo_bypass_unmap_page(struct device *dev, ++ dma_addr_t dma_address, ++ size_t size, ++ enum dma_data_direction direction, ++ unsigned long attrs) ++{ ++ dma_pseudo_bypass_unmap_address(dev, dma_address); ++} ++ ++ ++static int dma_pseudo_bypass_map_sg(struct device *dev, struct scatterlist *sgl, ++ int nents, enum dma_data_direction direction, ++ unsigned long attrs) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ ++ for_each_sg(sgl, sg, nents, i) { ++ sg->dma_address = dma_pseudo_bypass_get_address(dev, 
sg_phys(sg)); ++ sg->dma_length = sg->length; ++ ++ if (attrs & DMA_ATTR_SKIP_CPU_SYNC) ++ continue; ++ ++ __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); ++ } ++ ++ return nents; ++} ++ ++static void dma_pseudo_bypass_unmap_sg(struct device *dev, struct scatterlist *sgl, ++ int nents, enum dma_data_direction direction, ++ unsigned long attrs) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ for_each_sg(sgl, sg, nents, i) { ++ dma_pseudo_bypass_unmap_address(dev, sg->dma_address); ++ } ++} ++ ++static u64 dma_pseudo_bypass_get_required_mask(struct device *dev) ++{ ++ /* ++ * there's no limitation on our end, the driver should just call ++ * set_mask() with as many bits as the device can address. ++ */ ++ return -1ULL; ++} ++ ++static int dma_pseudo_bypass_mapping_error(struct device *dev, dma_addr_t dma_addr) ++{ ++ return dma_addr == -1ULL; ++} ++ ++ ++const struct dma_map_ops dma_pseudo_bypass_ops = { ++ .alloc = dma_pseudo_bypass_alloc_coherent, ++ .free = dma_pseudo_bypass_free_coherent, ++ .mmap = dma_pseudo_bypass_mmap_coherent, ++ .map_sg = dma_pseudo_bypass_map_sg, ++ .unmap_sg = dma_pseudo_bypass_unmap_sg, ++ .dma_supported = dma_pseudo_bypass_dma_supported, ++ .map_page = dma_pseudo_bypass_map_page, ++ .unmap_page = dma_pseudo_bypass_unmap_page, ++ .get_required_mask = dma_pseudo_bypass_get_required_mask, ++ .mapping_error = dma_pseudo_bypass_mapping_error, ++}; ++EXPORT_SYMBOL(dma_pseudo_bypass_ops); +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c +index 9f40f235b39e..b982558a92ac 100644 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1085,6 +1086,9 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) + pe->pbus = NULL; + pe->mve_number = -1; + pe->rid = dev->bus->number << 8 | pdn->devfn; ++ pe->tces = NULL; ++ pe->tce_tracker = NULL; ++ pe->tce_bitmap = NULL; + + pe_info(pe, "Associated device to PE\n"); + +@@ -1566,6 +1570,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) + pe->mve_number = -1; + pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | + pci_iov_virtfn_devfn(pdev, vf_index); ++ pe->tces = NULL; ++ pe->tce_tracker = NULL; ++ pe->tce_bitmap = NULL; + + pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n", + hose->global_number, pdev->bus->number, +@@ -1771,43 +1778,40 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) + return true; + } + +-/* +- * Reconfigure TVE#0 to be usable as 64-bit DMA space. +- * +- * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses. +- * Devices can only access more than that if bit 59 of the PCI address is set +- * by hardware, which indicates TVE#1 should be used instead of TVE#0. +- * Many PCI devices are not capable of addressing that many bits, and as a +- * result are limited to the 4GB of virtual memory made available to 32-bit +- * devices in TVE#0. +- * +- * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit +- * devices by configuring the virtual memory past the first 4GB inaccessible +- * by 64-bit DMAs. This should only be used by devices that want more than +- * 4GB, and only on PEs that have no 32-bit devices. +- * +- * Currently this will only work on PHB3 (POWER8). 
+- */ +-static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe) ++static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe) + { +- u64 window_size, table_size, tce_count, addr; ++ u64 tce_count, table_size, window_size; ++ struct pnv_phb *p = pe->phb; + struct page *table_pages; +- u64 tce_order = 28; /* 256MB TCEs */ + __be64 *tces; +- s64 rc; ++ int rc = -ENOMEM; ++ int bitmap_size, tracker_entries; ++ ++ /* ++ * XXX These are factors for scaling the size of the TCE table, and ++ * the table that tracks these allocations. These should eventually ++ * be kernel command line options with defaults above 1, for situations ++ * where your memory expands after the machine has booted. ++ */ ++ int tce_size_factor = 1; ++ int tracking_table_factor = 1; + + /* +- * Window size needs to be a power of two, but needs to account for +- * shifting memory by the 4GB offset required to skip 32bit space. ++ * The window size covers all of memory (and optionally more), with ++ * enough tracker entries to cover them all being allocated. So we ++ * create enough TCEs to cover all of memory at once. + */ +- window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32)); +- tce_count = window_size >> tce_order; ++ window_size = roundup_pow_of_two(tce_size_factor * memory_hotplug_max()); ++ tracker_entries = (tracking_table_factor * memory_hotplug_max()) >> ++ p->ioda.max_tce_order; ++ tce_count = window_size >> p->ioda.max_tce_order; ++ bitmap_size = BITS_TO_LONGS(tce_count) * sizeof(unsigned long); + table_size = tce_count << 3; + + if (table_size < PAGE_SIZE) + table_size = PAGE_SIZE; + +- table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL, ++ table_pages = alloc_pages_node(p->hose->node, GFP_KERNEL, + get_order(table_size)); + if (!table_pages) + goto err; +@@ -1818,26 +1822,33 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe) + + memset(tces, 0, table_size); + +- for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) { +- tces[(addr + (1ULL << 32)) >> tce_order] = +- cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); +- } ++ pe->tces = tces; ++ pe->tce_count = tce_count; ++ pe->tce_bitmap = kzalloc(bitmap_size, GFP_KERNEL); ++ /* The tracking table has two u64s per TCE */ ++ pe->tce_tracker = vzalloc(sizeof(u64) * 2 * tracker_entries); ++ spin_lock_init(&pe->tce_alloc_lock); ++ ++ /* mark the first 4GB as reserved so this can still be used for 32bit */ ++ bitmap_set(pe->tce_bitmap, 0, 1ULL << (32 - p->ioda.max_tce_order)); ++ ++ pe_info(pe, "pseudo-bypass sizes: tracker %d bitmap %d TCEs %lld\n", ++ tracker_entries, bitmap_size, tce_count); + + rc = opal_pci_map_pe_dma_window(pe->phb->opal_id, + pe->pe_number, +- /* reconfigure window 0 */ + (pe->pe_number << 1) + 0, + 1, + __pa(tces), + table_size, +- 1 << tce_order); ++ 1 << p->ioda.max_tce_order); + if (rc == OPAL_SUCCESS) { +- pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n"); ++ pe_info(pe, "TCE tables configured for pseudo-bypass\n"); + return 0; + } + err: +- pe_err(pe, "Error configuring 64-bit DMA bypass\n"); +- return -EIO; ++ pe_err(pe, "error configuring pseudo-bypass\n"); ++ return rc; + } + + static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +@@ -1848,7 +1859,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) + struct pnv_ioda_pe *pe; + uint64_t top; + bool bypass = false; +- s64 rc; + + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return -ENODEV; +@@ -1865,21 +1875,15 @@ static int 
pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) + } else { + /* + * If the device can't set the TCE bypass bit but still wants +- * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to +- * bypass the 32-bit region and be usable for 64-bit DMAs. +- * The device needs to be able to address all of this space. ++ * to access 4GB or more, we need to use a different set of DMA ++ * operations with an indirect mapping. + */ + if (dma_mask >> 32 && +- dma_mask > (memory_hotplug_max() + (1ULL << 32)) && +- pnv_pci_ioda_pe_single_vendor(pe) && +- phb->model == PNV_PHB_MODEL_PHB3) { +- /* Configure the bypass mode */ +- rc = pnv_pci_ioda_dma_64bit_bypass(pe); +- if (rc) +- return rc; +- /* 4GB offset bypasses 32-bit space */ +- set_dma_offset(&pdev->dev, (1ULL << 32)); +- set_dma_ops(&pdev->dev, &dma_nommu_ops); ++ phb->model != PNV_PHB_MODEL_P7IOC && ++ pnv_pci_ioda_pe_single_vendor(pe)) { ++ if (!pe->tces) ++ pnv_pci_pseudo_bypass_setup(pe); ++ set_dma_ops(&pdev->dev, &dma_pseudo_bypass_ops); + } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) { + /* + * Fail the request if a DMA mask between 32 and 64 bits +diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h +index ca5414055972..9418c6ea189b 100644 +--- a/arch/powerpc/platforms/powernv/pci.h ++++ b/arch/powerpc/platforms/powernv/pci.h +@@ -70,6 +70,13 @@ struct pnv_ioda_pe { + bool tce_bypass_enabled; + uint64_t tce_bypass_base; + ++ /* TCE tables for DMA pseudo-bypass */ ++ __be64 *tces; ++ u64 tce_count; ++ unsigned long *tce_bitmap; ++ u64 *tce_tracker; // 2 u64s per TCE ++ spinlock_t tce_alloc_lock; ++ + /* MSIs. MVE index is identical for for 32 and 64 bit MSI + * and -1 if not supported. (It's actually identical to the + * PE number) +-- +2.17.1 + + +From 2e6abf2b56d40a953eaa39e2bee064bfcc1da6d1 Mon Sep 17 00:00:00 2001 +From: Russell Currey +Date: Wed, 6 Jun 2018 13:36:06 +1000 +Subject: [PATCH 3/9] powerpc/powernv/pci: Track DMA and TCE tables in debugfs + +Add a new debugfs entry to trigger dumping out the tracking table and +TCEs for a given PE, for example PE 0x4 of PHB 2: + +echo 0x4 > /sys/kernel/debug/powerpc/PCI0002/sketchy + +This will result in the table being dumped out in dmesg. 
+ +Signed-off-by: Russell Currey +--- + arch/powerpc/platforms/powernv/pci-ioda.c | 43 +++++++++++++++++++++++ + 1 file changed, 43 insertions(+) + +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c +index b982558a92ac..3598ca8daa7c 100644 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c +@@ -3203,6 +3203,47 @@ static int pnv_pci_diag_data_set(void *data, u64 val) + DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, + pnv_pci_diag_data_set, "%llu\n"); + ++static int pnv_pci_sketchy_set(void *data, u64 val) ++{ ++ struct pci_controller *hose; ++ struct pnv_ioda_pe *pe; ++ struct pnv_phb *phb; ++ u64 entry1, entry2; ++ int i; ++ ++ hose = (struct pci_controller *)data; ++ if (!hose || !hose->private_data) ++ return -ENODEV; ++ ++ phb = hose->private_data; ++ pe = &phb->ioda.pe_array[val]; ++ ++ if (!pe) ++ return -EINVAL; ++ ++ if (!pe->tces || !pe->tce_tracker) ++ return -EIO; ++ ++ for (i = 0; i < pe->tce_count; i++) { ++ if (i > 16 && pe->tces[i] == 0) ++ break; ++ pr_info("%3d: %016llx\n", i, be64_to_cpu(pe->tces[i])); ++ } ++ ++ for (i = 0; i < pe->tce_count; i++) { ++ entry1 = pe->tce_tracker[i * 2]; ++ entry2 = pe->tce_tracker[i * 2 + 1]; ++ if (!entry1) ++ break; ++ pr_info("%3d: %016llx %016llx\n", i, entry1, entry2); ++ } ++ return 0; ++} ++ ++DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_sketchy_fops, NULL, ++ pnv_pci_sketchy_set, "%llu\n"); ++ ++ + #endif /* CONFIG_DEBUG_FS */ + + static void pnv_pci_ioda_create_dbgfs(void) +@@ -3228,6 +3269,8 @@ static void pnv_pci_ioda_create_dbgfs(void) + + debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose, + &pnv_pci_diag_data_fops); ++ debugfs_create_file("sketchy", 0200, phb->dbgfs, hose, ++ &pnv_pci_sketchy_fops); + } + #endif /* CONFIG_DEBUG_FS */ + } +-- +2.17.1 + + +From 8456e9247c21d9fc7838dd5a71435342f7b79f88 Mon Sep 17 00:00:00 2001 +From: Russell Currey +Date: Tue, 19 Jun 2018 16:21:13 +1000 +Subject: [PATCH 4/9] powerpc/powernv/pci: Safety fixes for pseudobypass TCE + allocation + +Signed-off-by: Russell Currey +--- + arch/powerpc/platforms/powernv/pci-dma.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c +index 1d5409be343e..237940a2a052 100644 +--- a/arch/powerpc/platforms/powernv/pci-dma.c ++++ b/arch/powerpc/platforms/powernv/pci-dma.c +@@ -29,8 +29,9 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr + { + int tce; + __be64 old, new; ++ unsigned long flags; + +- spin_lock(&pe->tce_alloc_lock); ++ spin_lock_irqsave(&pe->tce_alloc_lock, flags); + tce = bitmap_find_next_zero_area(pe->tce_bitmap, + pe->tce_count, + 0, +@@ -40,9 +41,10 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr + old = pe->tces[tce]; + new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); + pe->tces[tce] = new; ++ mb(); + pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n", + tce, new, old); +- spin_unlock(&pe->tce_alloc_lock); ++ spin_unlock_irqrestore(&pe->tce_alloc_lock, flags); + + return tce; + } +-- +2.17.1 + + +From b8949ef1b1bb5977ba9fb35f06d1466b6be475a5 Mon Sep 17 00:00:00 2001 +From: Timothy Pearson +Date: Sat, 23 Jun 2018 16:20:48 -0500 +Subject: [PATCH 5/9] powerpc/powernv/pci: Export + pnv_pci_ioda2_tce_invalidate_pe + +Pseudo DMA support requires a method to invalidate the TCE cache +Export pnv_pci_ioda2_tce_invalidate_pe for use by the pseudo DMA +mapper. 
+ +Signed-off-by: Timothy Pearson +--- + arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- + arch/powerpc/platforms/powernv/pci.h | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c +index 3598ca8daa7c..83f9db17e711 100644 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c +@@ -2100,7 +2100,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm, + } + } + +-static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) ++void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) + { + struct pnv_phb *phb = pe->phb; + +diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h +index 9418c6ea189b..bea565c3f302 100644 +--- a/arch/powerpc/platforms/powernv/pci.h ++++ b/arch/powerpc/platforms/powernv/pci.h +@@ -244,6 +244,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, + /* Nvlink functions */ + extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); + extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); ++extern void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe); + extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); + extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, + struct iommu_table *tbl); +-- +2.17.1 + + +From 4d8211098c35d5b7556f756db18dba89c015600d Mon Sep 17 00:00:00 2001 +From: Timothy Pearson +Date: Sat, 23 Jun 2018 16:22:59 -0500 +Subject: [PATCH 6/9] powerpc/powernv/pci: Invalidate TCE cache after DMA map + setup + +Per the IODA2, TCEs must be invalidated after their settings +have been changed. Invalidate the cache after the address +is changed during TCE allocation when using pseudo DMA. + +Signed-off-by: Timothy Pearson +--- + arch/powerpc/platforms/powernv/pci-dma.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c +index 237940a2a052..060dbc168401 100644 +--- a/arch/powerpc/platforms/powernv/pci-dma.c ++++ b/arch/powerpc/platforms/powernv/pci-dma.c +@@ -42,8 +42,7 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr + new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); + pe->tces[tce] = new; + mb(); +- pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n", +- tce, new, old); ++ pnv_pci_ioda2_tce_invalidate_pe(pe); + spin_unlock_irqrestore(&pe->tce_alloc_lock, flags); + + return tce; +-- +2.17.1 + + +From a963913380c91a465509bae341da1e8aac40cdee Mon Sep 17 00:00:00 2001 +From: Timothy Pearson +Date: Sat, 23 Jun 2018 16:25:16 -0500 +Subject: [PATCH 7/9] powerpc/powernv/pci: Don't use the lower 4G TCEs in + pseudo-DMA mode + +Four TCEs are reserved for legacy 32-bit DMA mappings in psuedo DMA +mode. Mark these with an invalid address to avoid their use by +the TCE cache mapper. 
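+
+For context (assuming the order-30, 1 GB TCEs that PHB4 provides, as described
+earlier in this series), the count of four falls out of the legacy 32-bit
+window size:
+
+	reserved entries = 1ULL << (32 - 30) = 4
+
+and an address of -1 can never collide with a live lookup, since
+dma_pseudo_bypass_get_address() compares tracker entries against the
+TCE-aligned physical address with only bit 0 set, which is never all ones.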
+ +Signed-off-by: Timothy Pearson +--- + arch/powerpc/platforms/powernv/pci-ioda.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c +index 83f9db17e711..f4cd6a5c2bc7 100644 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c +@@ -1780,7 +1780,7 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) + + static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe) + { +- u64 tce_count, table_size, window_size; ++ u64 i, tce_count, table_size, window_size; + struct pnv_phb *p = pe->phb; + struct page *table_pages; + __be64 *tces; +@@ -1832,6 +1832,12 @@ static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe) + /* mark the first 4GB as reserved so this can still be used for 32bit */ + bitmap_set(pe->tce_bitmap, 0, 1ULL << (32 - p->ioda.max_tce_order)); + ++ /* make sure reserved first 4GB TCEs are not used by the mapper ++ * set each address to -1, which will never match an incoming request ++ */ ++ for (i = 0; i < 4; i++) ++ pe->tce_tracker[i * 2] = -1; ++ + pe_info(pe, "pseudo-bypass sizes: tracker %d bitmap %d TCEs %lld\n", + tracker_entries, bitmap_size, tce_count); + +-- +2.17.1 + + +From d397370596955d166c76a1f487ef53d8dbf52d9c Mon Sep 17 00:00:00 2001 +From: Paul Mackerras +Date: Tue, 11 Sep 2018 17:00:30 +1000 +Subject: [PATCH 8/9] KVM: PPC: Book3S HV: Allocate a memory area exclusively + for HPTs + +Currently we allocate HPTs (hashed page tables) for guests using the +CMA (contiguous memory allocator) facility. However, there are +situations where the CMA region can get fragmented, notably when +lots of guest pages get pinned for PCI pass-through, which then causes +HPT allocations to fail even if there is sufficient CMA memory +available overall. + +This commit adds the capability to reserve some memory at boot time +exclusively for HPTs for KVM guests. The amount is controlled with +the kvm_hpt_resv_ratio=N kernel command-line option, where N is the +percentage of system memory to reserve. This reserved memory will +be used first, and only when a guest HPT can't be allocated from this +reserved memory will the CMA region be used. 
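+
+For example, to reserve 4% of system memory for guest HPTs instead of the
+default 2%, append the following to the kernel command line (the value is
+parsed by early_parse_kvm_hpt_resv(), added below):
+
+	kvm_hpt_resv_ratio=4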
+ +Signed-off-by: Paul Mackerras +--- + arch/powerpc/include/asm/kvm_host.h | 2 + + arch/powerpc/include/asm/kvm_ppc.h | 7 ++ + arch/powerpc/kernel/setup-common.c | 3 + + arch/powerpc/kernel/setup.h | 6 +- + arch/powerpc/kvm/book3s_64_mmu_hv.c | 25 +++++-- + arch/powerpc/kvm/book3s_hv_builtin.c | 105 ++++++++++++++++++++++++++- + 6 files changed, 136 insertions(+), 12 deletions(-) + +diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h +index 906bcbdfd2a1..053ba320db49 100644 +--- a/arch/powerpc/include/asm/kvm_host.h ++++ b/arch/powerpc/include/asm/kvm_host.h +@@ -258,6 +258,8 @@ struct kvm_hpt_info { + struct revmap_entry *rev; + /* Guest HPT size is 2**(order) bytes */ + u32 order; ++ /* 1 if HPT allocated from reserved region, 0 otherwise */ ++ int resv; + /* 1 if HPT allocated with CMA, 0 otherwise */ + int cma; + }; +diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h +index e991821dd7fa..9625b0dd28cc 100644 +--- a/arch/powerpc/include/asm/kvm_ppc.h ++++ b/arch/powerpc/include/asm/kvm_ppc.h +@@ -210,6 +210,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long tce_value, unsigned long npages); + extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba); ++extern unsigned long kvmhv_alloc_resv_hpt(u32 order); ++extern void kvmhv_release_resv_hpt(unsigned long hpt, u32 order); + extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages); + extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages); + extern int kvmppc_core_init_vm(struct kvm *kvm); +@@ -436,6 +438,8 @@ struct openpic; + + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + extern void kvm_cma_reserve(void) __init; ++extern void kvm_resv_hpt_init(void); ++ + static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) + { + paca_ptrs[cpu]->kvm_hstate.xics_phys = (void __iomem *)addr; +@@ -476,6 +480,9 @@ extern bool kvm_hv_mode_active(void); + static inline void __init kvm_cma_reserve(void) + {} + ++static inline void kvm_resv_hpt_init(void) ++{} ++ + static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) + {} + +diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c +index 93fa0c99681e..38e36c67ab2f 100644 +--- a/arch/powerpc/kernel/setup-common.c ++++ b/arch/powerpc/kernel/setup-common.c +@@ -979,6 +979,9 @@ void __init setup_arch(char **cmdline_p) + /* Initialize the MMU context management stuff. */ + mmu_context_init(); + ++ /* Reserve memory for KVM HPTs */ ++ kvm_resv_hpt_init(); ++ + #ifdef CONFIG_PPC64 + /* Interrupt code needs to be 64K-aligned. */ + if ((unsigned long)_stext & 0xffff) +diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h +index c6a592b67386..6de1fac35774 100644 +--- a/arch/powerpc/kernel/setup.h ++++ b/arch/powerpc/kernel/setup.h +@@ -53,13 +53,15 @@ extern unsigned long spr_default_dscr; + #endif + + /* +- * Having this in kvm_ppc.h makes include dependencies too +- * tricky to solve for setup-common.c so have it here. ++ * Having these in kvm_ppc.h makes include dependencies too ++ * tricky to solve for setup-common.c so have them here. 
+ */ + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + void kvm_cma_reserve(void); ++void kvm_resv_hpt_init(void); + #else + static inline void kvm_cma_reserve(void) { }; ++static inline void kvm_resv_hpt_init(void) { } + #endif + + #ifdef CONFIG_TAU +diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c +index 68e14afecac8..9e607014f4c7 100644 +--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c ++++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c +@@ -81,7 +81,7 @@ struct kvm_resize_hpt { + int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) + { + unsigned long hpt = 0; +- int cma = 0; ++ int resv = 0, cma = 0; + struct page *page = NULL; + struct revmap_entry *rev; + unsigned long npte; +@@ -89,11 +89,17 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) + if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER)) + return -EINVAL; + +- page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); +- if (page) { +- hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); ++ hpt = kvmhv_alloc_resv_hpt(order); ++ if (hpt) { + memset((void *)hpt, 0, (1ul << order)); +- cma = 1; ++ resv = 1; ++ } else { ++ page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); ++ if (page) { ++ hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); ++ memset((void *)hpt, 0, (1ul << order)); ++ cma = 1; ++ } + } + + if (!hpt) +@@ -109,7 +115,9 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) + /* Allocate reverse map array */ + rev = vmalloc(array_size(npte, sizeof(struct revmap_entry))); + if (!rev) { +- if (cma) ++ if (resv) ++ kvmhv_release_resv_hpt(hpt, order); ++ else if (cma) + kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); + else + free_pages(hpt, order - PAGE_SHIFT); +@@ -118,6 +126,7 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) + + info->order = order; + info->virt = hpt; ++ info->resv = resv; + info->cma = cma; + info->rev = rev; + +@@ -191,7 +200,9 @@ void kvmppc_free_hpt(struct kvm_hpt_info *info) + { + vfree(info->rev); + info->rev = NULL; +- if (info->cma) ++ if (info->resv) ++ kvmhv_release_resv_hpt(info->virt, info->order); ++ else if (info->cma) + kvm_free_hpt_cma(virt_to_page(info->virt), + 1 << (info->order - PAGE_SHIFT)); + else if (info->virt) +diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c +index fc6bb9630a9c..3f36b99fb46b 100644 +--- a/arch/powerpc/kvm/book3s_hv_builtin.c ++++ b/arch/powerpc/kvm/book3s_hv_builtin.c +@@ -53,11 +53,109 @@ EXPORT_SYMBOL_GPL(__xive_vm_h_eoi); + + /* + * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) +- * should be power of 2. ++ * only needs to be 256kB. + */ +-#define HPT_ALIGN_PAGES ((1 << 18) >> PAGE_SHIFT) /* 256k */ ++#define HPT_ALIGN_ORDER 18 /* 256k */ ++#define HPT_ALIGN_PAGES ((1 << HPT_ALIGN_ORDER) >> PAGE_SHIFT) ++ ++#define KVM_RESV_CHUNK_ORDER HPT_ALIGN_ORDER ++ + /* +- * By default we reserve 5% of memory for hash pagetable allocation. ++ * By default we reserve 2% of memory exclusively for guest HPT ++ * allocations, plus another 3% in the CMA zone which can be used ++ * either for HPTs or for movable page allocations. ++ * Each guest's HPT will be sized at between 1/128 and 1/64 of its ++ * memory, i.e. up to 1.56%, and allowing for about a 3x memory ++ * overcommit factor gets us to about 5%. 
++ */ ++static unsigned long kvm_hpt_resv_ratio = 2; ++ ++static int __init early_parse_kvm_hpt_resv(char *p) ++{ ++ pr_debug("%s(%s)\n", __func__, p); ++ if (!p) ++ return -EINVAL; ++ return kstrtoul(p, 0, &kvm_hpt_resv_ratio); ++} ++early_param("kvm_hpt_resv_ratio", early_parse_kvm_hpt_resv); ++ ++static unsigned long kvm_resv_addr; ++static unsigned long *kvm_resv_bitmap; ++static unsigned long kvm_resv_chunks; ++static DEFINE_MUTEX(kvm_resv_lock); ++ ++void kvm_resv_hpt_init(void) ++{ ++ unsigned long align = 1ul << KVM_RESV_CHUNK_ORDER; ++ unsigned long size, bm_size; ++ unsigned long addr, bm; ++ unsigned long *bmp; ++ ++ if (!cpu_has_feature(CPU_FTR_HVMODE)) ++ return; ++ ++ size = memblock_phys_mem_size() * kvm_hpt_resv_ratio / 100; ++ size = ALIGN(size, align); ++ if (!size) ++ return; ++ ++ pr_info("KVM: Allocating %lu MiB for hashed page tables\n", ++ size >> 20); ++ ++ addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); ++ if (!addr) { ++ pr_err("KVM: Allocation of reserved memory for HPTs failed\n"); ++ return; ++ } ++ pr_info("KVM: %lu MiB reserved for HPTs at %lx\n", size >> 20, addr); ++ ++ bm_size = BITS_TO_LONGS(size >> KVM_RESV_CHUNK_ORDER) * sizeof(long); ++ bm = __memblock_alloc_base(bm_size, sizeof(long), ++ MEMBLOCK_ALLOC_ACCESSIBLE); ++ if (!bm) { ++ pr_err("KVM: Allocation of reserved memory bitmap failed\n"); ++ return; ++ } ++ bmp = __va(bm); ++ memset(bmp, 0, bm_size); ++ ++ kvm_resv_addr = (unsigned long) __va(addr); ++ kvm_resv_chunks = size >> KVM_RESV_CHUNK_ORDER; ++ kvm_resv_bitmap = bmp; ++} ++ ++unsigned long kvmhv_alloc_resv_hpt(u32 order) ++{ ++ unsigned long nr_chunks = 1ul << (order - KVM_RESV_CHUNK_ORDER); ++ unsigned long chunk; ++ ++ mutex_lock(&kvm_resv_lock); ++ chunk = bitmap_find_next_zero_area(kvm_resv_bitmap, kvm_resv_chunks, ++ 0, nr_chunks, 0); ++ if (chunk < kvm_resv_chunks) ++ bitmap_set(kvm_resv_bitmap, chunk, nr_chunks); ++ mutex_unlock(&kvm_resv_lock); ++ ++ if (chunk < kvm_resv_chunks) ++ return kvm_resv_addr + (chunk << KVM_RESV_CHUNK_ORDER); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(kvmhv_alloc_resv_hpt); ++ ++void kvmhv_release_resv_hpt(unsigned long addr, u32 order) ++{ ++ unsigned long nr_chunks = 1ul << (order - KVM_RESV_CHUNK_ORDER); ++ unsigned long chunk = (addr - kvm_resv_addr) >> KVM_RESV_CHUNK_ORDER; ++ ++ mutex_lock(&kvm_resv_lock); ++ if (chunk + nr_chunks <= kvm_resv_chunks) ++ bitmap_clear(kvm_resv_bitmap, chunk, nr_chunks); ++ mutex_unlock(&kvm_resv_lock); ++} ++EXPORT_SYMBOL_GPL(kvmhv_release_resv_hpt); ++ ++/* ++ * By default we reserve 3% of memory for the CMA zone. + */ + static unsigned long kvm_cma_resv_ratio = 5; + +@@ -106,6 +204,7 @@ void __init kvm_cma_reserve(void) + */ + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return; ++ + /* + * We cannot use memblock_phys_mem_size() here, because + * memblock_analyze() has not been called yet. 
+-- +2.17.1 + + +From 57a21f640a2b6d2e225cac7df35ed0dde7c6293f Mon Sep 17 00:00:00 2001 +From: Timothy Pearson +Date: Sun, 17 Jun 2018 23:59:51 -0500 +Subject: [PATCH 9/9] Fix undefined behaviour from signed integer overflow + +Caught by UBSAN + +Signed-off-by: Timothy Pearson +--- + drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c b/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c +index a029e47c2319..f7e56bec2dd7 100644 +--- a/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c ++++ b/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c +@@ -98,7 +98,7 @@ int smu7_copy_bytes_to_smc(struct pp_hwmgr *hwmgr, uint32_t smc_start_address, + + while (byte_count >= 4) { + /* Bytes are written into the SMC addres space with the MSB first. */ +- data = src[0] * 0x1000000 + src[1] * 0x10000 + src[2] * 0x100 + src[3]; ++ data = src[0] * 0x1000000U + src[1] * 0x10000U + src[2] * 0x100U + src[3]; + + result = smu7_set_smc_sram_address(hwmgr, addr, limit); + +-- +2.17.1 +