diff --git a/kernel.spec b/kernel.spec index a65c882f4..c496d0cc2 100644 --- a/kernel.spec +++ b/kernel.spec @@ -367,7 +367,7 @@ Name: kernel%{?variant} License: GPLv2 and Redistributable, no modification permitted URL: https://www.kernel.org/ Version: %{rpmversion} -Release: %{pkg_release} +Release: %{pkg_release}.op.1 # DO NOT CHANGE THE 'ExclusiveArch' LINE TO TEMPORARILY EXCLUDE AN ARCHITECTURE BUILD. # SET %%nobuildarches (ABOVE) INSTEAD ExclusiveArch: %{all_x86} x86_64 s390x %{arm} aarch64 ppc64le @@ -593,6 +593,7 @@ Patch350: arm64-arch_timer-Workaround-for-Allwinner-A64-timer-instability.patch Patch351: arm64-dts-allwinner-a64-Enable-A64-timer-workaround.patch # 400 - IBM (ppc/s390x) patches +Patch400: ppc64-talos.patch # 500 - Temp fixes/CVEs etc diff --git a/ppc64-talos.patch b/ppc64-talos.patch new file mode 100644 index 000000000..37e8fe9d5 --- /dev/null +++ b/ppc64-talos.patch @@ -0,0 +1,1248 @@ +From ba7256030f1b04e56096e9796bdf478f12872403 Mon Sep 17 00:00:00 2001 +From: Russell Currey +Date: Mon, 9 Apr 2018 17:29:36 +1000 +Subject: [PATCH 1/9] powerpc/powernv/pci: Track largest available TCE order + per PHB + +Knowing the largest possible TCE size of a PHB is useful, so get it out +of the device tree. This relies on the property being added in OPAL. + +It is assumed that any PHB4 or later machine would be running firmware +that implemented this property, and otherwise assumed to be PHB3, which +has a maximum TCE order of 28 bits or 256MB TCEs. + +This is used later in the series. + +Signed-off-by: Russell Currey +--- + arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++++++++++++++++ + arch/powerpc/platforms/powernv/pci.h | 3 +++ + 2 files changed, 19 insertions(+) + +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c +index cde710297a4e..9f40f235b39e 100644 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c +@@ -3751,11 +3751,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, + struct resource r; + const __be64 *prop64; + const __be32 *prop32; ++ struct property *prop; + int len; + unsigned int segno; + u64 phb_id; + void *aux; + long rc; ++ u32 val; + + if (!of_device_is_available(np)) + return; +@@ -3894,6 +3896,20 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, + } + phb->ioda.pe_array = aux + pemap_off; + ++ phb->ioda.max_tce_order = 0; ++ // Get TCE order from the DT. If it's not present, assume P8 ++ if (!of_get_property(np, "ibm,supported-tce-sizes", NULL)) { ++ phb->ioda.max_tce_order = 28; // assume P8 256mb TCEs ++ } else { ++ of_property_for_each_u32(np, "ibm,supported-tce-sizes", prop, ++ prop32, val) { ++ if (val > phb->ioda.max_tce_order) ++ phb->ioda.max_tce_order = val; ++ } ++ pr_debug("PHB%llx Found max TCE order of %d bits\n", ++ phb->opal_id, phb->ioda.max_tce_order); ++ } ++ + /* + * Choose PE number for root bus, which shouldn't have + * M64 resources consumed by its child devices. 
To pick +diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h +index 8b37b28e3831..ca5414055972 100644 +--- a/arch/powerpc/platforms/powernv/pci.h ++++ b/arch/powerpc/platforms/powernv/pci.h +@@ -172,6 +172,9 @@ struct pnv_phb { + struct list_head pe_list; + struct mutex pe_list_mutex; + ++ /* Largest supported TCE order bits */ ++ uint8_t max_tce_order; ++ + /* Reverse map of PEs, indexed by {bus, devfn} */ + unsigned int pe_rmap[0x10000]; + } ioda; +-- +2.17.1 + + +From b3f3546c4b4225093f41a4caa25648718170a093 Mon Sep 17 00:00:00 2001 +From: Russell Currey +Date: Mon, 9 Apr 2018 17:34:37 +1000 +Subject: [PATCH 2/9] powerpc/powernv: DMA operations for discontiguous + allocation + +Cognitive DMA is a new set of DMA operations that solve some issues for +devices that want to address more than 32 bits but can't address the 59 +bits required to enable direct DMA. + +The previous implementation for POWER8/PHB3 worked around this by +configuring a bypass from the default 32-bit address space into 64-bit +address space. This approach does not work for POWER9/PHB4 because +regions of memory are discontiguous and many devices will be unable to +address memory beyond the first node. + +Instead, implement a new set of DMA operations that allocate TCEs as DMA +mappings are requested so that all memory is addressable even when a +one-to-one mapping between real addresses and DMA addresses isn't +possible. These TCEs are the maximum size available on the platform, +which is 256M on PHB3 and 1G on PHB4. + +Devices can now map any region of memory up to the maximum amount they can +address according to the DMA mask set, in chunks of the largest available +TCE size. + +This implementation replaces the need for the existing PHB3 solution and +should be compatible with future PHB versions. 
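+
+As a rough sketch of the address translation this introduces (mirroring
+dma_pseudo_bypass_get_address() in pci-dma.c below; the helper name here is
+hypothetical and shown only for illustration), a returned DMA address is the
+index of the chosen huge TCE concatenated with the offset inside that TCE:
+
+	/* illustrative only - not part of this patch */
+	static dma_addr_t pseudo_bypass_example(u64 tce_index, phys_addr_t phys,
+						unsigned int max_tce_order)
+	{
+		/* offset of the physical address within its huge TCE */
+		u64 offset = phys & ((1ULL << max_tce_order) - 1);
+
+		/* DMA address = TCE index in the high bits, offset in the low bits */
+		return (tce_index << max_tce_order) | offset;
+	}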
+ +Signed-off-by: Russell Currey +--- + arch/powerpc/include/asm/dma-mapping.h | 1 + + arch/powerpc/platforms/powernv/Makefile | 2 +- + arch/powerpc/platforms/powernv/pci-dma.c | 319 ++++++++++++++++++++++ + arch/powerpc/platforms/powernv/pci-ioda.c | 102 +++---- + arch/powerpc/platforms/powernv/pci.h | 7 + + 5 files changed, 381 insertions(+), 50 deletions(-) + create mode 100644 arch/powerpc/platforms/powernv/pci-dma.c + +diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h +index 8fa394520af6..354f435160f3 100644 +--- a/arch/powerpc/include/asm/dma-mapping.h ++++ b/arch/powerpc/include/asm/dma-mapping.h +@@ -74,6 +74,7 @@ static inline unsigned long device_to_mask(struct device *dev) + extern struct dma_map_ops dma_iommu_ops; + #endif + extern const struct dma_map_ops dma_nommu_ops; ++extern const struct dma_map_ops dma_pseudo_bypass_ops; + + static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) + { +diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile +index b540ce8eec55..7cfc821508c3 100644 +--- a/arch/powerpc/platforms/powernv/Makefile ++++ b/arch/powerpc/platforms/powernv/Makefile +@@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o + obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o + + obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o +-obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o ++obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o pci-dma.o + obj-$(CONFIG_CXL_BASE) += pci-cxl.o + obj-$(CONFIG_EEH) += eeh-powernv.o + obj-$(CONFIG_PPC_SCOM) += opal-xscom.o +diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c +new file mode 100644 +index 000000000000..1d5409be343e +--- /dev/null ++++ b/arch/powerpc/platforms/powernv/pci-dma.c +@@ -0,0 +1,319 @@ ++/* ++ * DMA operations supporting pseudo-bypass for PHB3+ ++ * ++ * Author: Russell Currey ++ * ++ * Copyright 2018 IBM Corporation. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include "pci.h" ++ ++/* select and allocate a TCE using the bitmap */ ++static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr) ++{ ++ int tce; ++ __be64 old, new; ++ ++ spin_lock(&pe->tce_alloc_lock); ++ tce = bitmap_find_next_zero_area(pe->tce_bitmap, ++ pe->tce_count, ++ 0, ++ 1, ++ 0); ++ bitmap_set(pe->tce_bitmap, tce, 1); ++ old = pe->tces[tce]; ++ new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); ++ pe->tces[tce] = new; ++ pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n", ++ tce, new, old); ++ spin_unlock(&pe->tce_alloc_lock); ++ ++ return tce; ++} ++ ++/* ++ * The tracking table for assigning TCEs has two entries per TCE. ++ * - @entry1 contains the physical address and the smallest bit indicates ++ * if it's currently valid. ++ * - @entry2 contains the DMA address returned in the upper 34 bits, and a ++ * refcount in the lower 30 bits. 
++ */ ++static dma_addr_t dma_pseudo_bypass_get_address(struct device *dev, ++ phys_addr_t addr) ++{ ++ struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); ++ struct pci_controller *hose = pci_bus_to_host(pdev->bus); ++ struct pnv_phb *phb = hose->private_data; ++ struct pnv_ioda_pe *pe; ++ u64 i, entry1, entry2, dma_prefix, tce, ret; ++ u64 offset = addr & ((1 << phb->ioda.max_tce_order) - 1); ++ ++ pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number]; ++ ++ /* look through the tracking table for a free entry */ ++ for (i = 0; i < pe->tce_count; i++) { ++ entry1 = pe->tce_tracker[i * 2]; ++ entry2 = pe->tce_tracker[i * 2 + 1]; ++ dma_prefix = entry2 >> 34; ++ ++ /* if the address is the same and the entry is valid */ ++ if (entry1 == ((addr - offset) | 1)) { ++ /* all we need to do here is increment the refcount */ ++ ret = cmpxchg(&pe->tce_tracker[i * 2 + 1], ++ entry2, entry2 + 1); ++ if (ret != entry2) { ++ /* conflict, start looking again just in case */ ++ i--; ++ continue; ++ } ++ return (dma_prefix << phb->ioda.max_tce_order) | offset; ++ /* if the entry is invalid then we want to replace it */ ++ } else if (!(entry1 & 1)) { ++ /* set the real address, note that it isn't valid yet */ ++ ret = cmpxchg(&pe->tce_tracker[i * 2], ++ entry1, (addr - offset)); ++ if (ret != entry1) { ++ /* conflict, start looking again */ ++ i--; ++ continue; ++ } ++ ++ /* now we can allocate a TCE */ ++ tce = dma_pseudo_bypass_select_tce(pe, addr - offset); ++ ++ /* set new value, including TCE index and new refcount */ ++ ret = cmpxchg(&pe->tce_tracker[i * 2 + 1], ++ entry2, tce << 34 | 1); ++ if (ret != entry2) { ++ /* ++ * XXX In this case we need to throw out ++ * everything, including the TCE we just ++ * allocated. For now, just leave it. ++ */ ++ i--; ++ continue; ++ } ++ ++ /* now set the valid bit */ ++ ret = cmpxchg(&pe->tce_tracker[i * 2], ++ (addr - offset), (addr - offset) | 1); ++ if (ret != (addr - offset)) { ++ /* ++ * XXX Same situation as above. We'd probably ++ * want to null out entry2 as well. ++ */ ++ i--; ++ continue; ++ } ++ return (tce << phb->ioda.max_tce_order) | offset; ++ /* it's a valid entry but not ours, keep looking */ ++ } else { ++ continue; ++ } ++ } ++ /* If we get here, the table must be full, so error out. */ ++ return -1ULL; ++} ++ ++/* ++ * For the moment, unmapping just decrements the refcount and doesn't actually ++ * remove the TCE. This is because it's very likely that a previously allocated ++ * TCE will be used again, and this saves having to invalidate it. ++ * ++ * TODO implement some kind of garbage collection that clears unused TCE entries ++ * once the table reaches a certain size. 
++ */ ++static void dma_pseudo_bypass_unmap_address(struct device *dev, dma_addr_t dma_addr) ++{ ++ struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); ++ struct pci_controller *hose = pci_bus_to_host(pdev->bus); ++ struct pnv_phb *phb = hose->private_data; ++ struct pnv_ioda_pe *pe; ++ u64 i, entry1, entry2, dma_prefix, refcount; ++ ++ pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number]; ++ ++ for (i = 0; i < pe->tce_count; i++) { ++ entry1 = pe->tce_tracker[i * 2]; ++ entry2 = pe->tce_tracker[i * 2 + 1]; ++ dma_prefix = entry2 >> 34; ++ refcount = entry2 & ((1 << 30) - 1); ++ ++ /* look through entry2 until we find our address */ ++ if (dma_prefix == (dma_addr >> phb->ioda.max_tce_order)) { ++ refcount--; ++ cmpxchg(&pe->tce_tracker[i * 2 + 1], entry2, (dma_prefix << 34) | refcount); ++ if (!refcount) { ++ /* ++ * Here is where we would remove the valid bit ++ * from entry1, clear the entry in the TCE table ++ * and invalidate the TCE - but we want to leave ++ * them until the table fills up (for now). ++ */ ++ } ++ break; ++ } ++ } ++} ++ ++static int dma_pseudo_bypass_dma_supported(struct device *dev, u64 mask) ++{ ++ /* ++ * Normally dma_supported() checks if the mask is capable of addressing ++ * all of memory. Since we map physical memory in chunks that the ++ * device can address, the device will be able to address whatever it ++ * wants - just not all at once. ++ */ ++ return 1; ++} ++ ++static void *dma_pseudo_bypass_alloc_coherent(struct device *dev, ++ size_t size, ++ dma_addr_t *dma_handle, ++ gfp_t flag, ++ unsigned long attrs) ++{ ++ void *ret; ++ struct page *page; ++ int node = dev_to_node(dev); ++ ++ /* ignore region specifiers */ ++ flag &= ~(__GFP_HIGHMEM); ++ ++ page = alloc_pages_node(node, flag, get_order(size)); ++ if (page == NULL) ++ return NULL; ++ ret = page_address(page); ++ memset(ret, 0, size); ++ *dma_handle = dma_pseudo_bypass_get_address(dev, __pa(ret)); ++ ++ return ret; ++} ++ ++static void dma_pseudo_bypass_free_coherent(struct device *dev, ++ size_t size, ++ void *vaddr, ++ dma_addr_t dma_handle, ++ unsigned long attrs) ++{ ++ free_pages((unsigned long)vaddr, get_order(size)); ++} ++ ++static int dma_pseudo_bypass_mmap_coherent(struct device *dev, ++ struct vm_area_struct *vma, ++ void *cpu_addr, ++ dma_addr_t handle, ++ size_t size, ++ unsigned long attrs) ++{ ++ unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr)); ++ ++ return remap_pfn_range(vma, vma->vm_start, ++ pfn + vma->vm_pgoff, ++ vma->vm_end - vma->vm_start, ++ vma->vm_page_prot); ++} ++ ++static inline dma_addr_t dma_pseudo_bypass_map_page(struct device *dev, ++ struct page *page, ++ unsigned long offset, ++ size_t size, ++ enum dma_data_direction dir, ++ unsigned long attrs) ++{ ++ BUG_ON(dir == DMA_NONE); ++ ++ /* XXX I don't know if this is necessary (or even desired) */ ++ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) ++ __dma_sync_page(page, offset, size, dir); ++ ++ return dma_pseudo_bypass_get_address(dev, page_to_phys(page) + offset); ++} ++ ++static inline void dma_pseudo_bypass_unmap_page(struct device *dev, ++ dma_addr_t dma_address, ++ size_t size, ++ enum dma_data_direction direction, ++ unsigned long attrs) ++{ ++ dma_pseudo_bypass_unmap_address(dev, dma_address); ++} ++ ++ ++static int dma_pseudo_bypass_map_sg(struct device *dev, struct scatterlist *sgl, ++ int nents, enum dma_data_direction direction, ++ unsigned long attrs) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ ++ for_each_sg(sgl, sg, nents, i) { ++ sg->dma_address = dma_pseudo_bypass_get_address(dev, 
sg_phys(sg)); ++ sg->dma_length = sg->length; ++ ++ if (attrs & DMA_ATTR_SKIP_CPU_SYNC) ++ continue; ++ ++ __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); ++ } ++ ++ return nents; ++} ++ ++static void dma_pseudo_bypass_unmap_sg(struct device *dev, struct scatterlist *sgl, ++ int nents, enum dma_data_direction direction, ++ unsigned long attrs) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ for_each_sg(sgl, sg, nents, i) { ++ dma_pseudo_bypass_unmap_address(dev, sg->dma_address); ++ } ++} ++ ++static u64 dma_pseudo_bypass_get_required_mask(struct device *dev) ++{ ++ /* ++ * there's no limitation on our end, the driver should just call ++ * set_mask() with as many bits as the device can address. ++ */ ++ return -1ULL; ++} ++ ++static int dma_pseudo_bypass_mapping_error(struct device *dev, dma_addr_t dma_addr) ++{ ++ return dma_addr == -1ULL; ++} ++ ++ ++const struct dma_map_ops dma_pseudo_bypass_ops = { ++ .alloc = dma_pseudo_bypass_alloc_coherent, ++ .free = dma_pseudo_bypass_free_coherent, ++ .mmap = dma_pseudo_bypass_mmap_coherent, ++ .map_sg = dma_pseudo_bypass_map_sg, ++ .unmap_sg = dma_pseudo_bypass_unmap_sg, ++ .dma_supported = dma_pseudo_bypass_dma_supported, ++ .map_page = dma_pseudo_bypass_map_page, ++ .unmap_page = dma_pseudo_bypass_unmap_page, ++ .get_required_mask = dma_pseudo_bypass_get_required_mask, ++ .mapping_error = dma_pseudo_bypass_mapping_error, ++}; ++EXPORT_SYMBOL(dma_pseudo_bypass_ops); +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c +index 9f40f235b39e..b982558a92ac 100644 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1085,6 +1086,9 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) + pe->pbus = NULL; + pe->mve_number = -1; + pe->rid = dev->bus->number << 8 | pdn->devfn; ++ pe->tces = NULL; ++ pe->tce_tracker = NULL; ++ pe->tce_bitmap = NULL; + + pe_info(pe, "Associated device to PE\n"); + +@@ -1566,6 +1570,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) + pe->mve_number = -1; + pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | + pci_iov_virtfn_devfn(pdev, vf_index); ++ pe->tces = NULL; ++ pe->tce_tracker = NULL; ++ pe->tce_bitmap = NULL; + + pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n", + hose->global_number, pdev->bus->number, +@@ -1771,43 +1778,40 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) + return true; + } + +-/* +- * Reconfigure TVE#0 to be usable as 64-bit DMA space. +- * +- * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses. +- * Devices can only access more than that if bit 59 of the PCI address is set +- * by hardware, which indicates TVE#1 should be used instead of TVE#0. +- * Many PCI devices are not capable of addressing that many bits, and as a +- * result are limited to the 4GB of virtual memory made available to 32-bit +- * devices in TVE#0. +- * +- * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit +- * devices by configuring the virtual memory past the first 4GB inaccessible +- * by 64-bit DMAs. This should only be used by devices that want more than +- * 4GB, and only on PEs that have no 32-bit devices. +- * +- * Currently this will only work on PHB3 (POWER8). 
+- */ +-static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe) ++static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe) + { +- u64 window_size, table_size, tce_count, addr; ++ u64 tce_count, table_size, window_size; ++ struct pnv_phb *p = pe->phb; + struct page *table_pages; +- u64 tce_order = 28; /* 256MB TCEs */ + __be64 *tces; +- s64 rc; ++ int rc = -ENOMEM; ++ int bitmap_size, tracker_entries; ++ ++ /* ++ * XXX These are factors for scaling the size of the TCE table, and ++ * the table that tracks these allocations. These should eventually ++ * be kernel command line options with defaults above 1, for situations ++ * where your memory expands after the machine has booted. ++ */ ++ int tce_size_factor = 1; ++ int tracking_table_factor = 1; + + /* +- * Window size needs to be a power of two, but needs to account for +- * shifting memory by the 4GB offset required to skip 32bit space. ++ * The window size covers all of memory (and optionally more), with ++ * enough tracker entries to cover them all being allocated. So we ++ * create enough TCEs to cover all of memory at once. + */ +- window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32)); +- tce_count = window_size >> tce_order; ++ window_size = roundup_pow_of_two(tce_size_factor * memory_hotplug_max()); ++ tracker_entries = (tracking_table_factor * memory_hotplug_max()) >> ++ p->ioda.max_tce_order; ++ tce_count = window_size >> p->ioda.max_tce_order; ++ bitmap_size = BITS_TO_LONGS(tce_count) * sizeof(unsigned long); + table_size = tce_count << 3; + + if (table_size < PAGE_SIZE) + table_size = PAGE_SIZE; + +- table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL, ++ table_pages = alloc_pages_node(p->hose->node, GFP_KERNEL, + get_order(table_size)); + if (!table_pages) + goto err; +@@ -1818,26 +1822,33 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe) + + memset(tces, 0, table_size); + +- for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) { +- tces[(addr + (1ULL << 32)) >> tce_order] = +- cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); +- } ++ pe->tces = tces; ++ pe->tce_count = tce_count; ++ pe->tce_bitmap = kzalloc(bitmap_size, GFP_KERNEL); ++ /* The tracking table has two u64s per TCE */ ++ pe->tce_tracker = vzalloc(sizeof(u64) * 2 * tracker_entries); ++ spin_lock_init(&pe->tce_alloc_lock); ++ ++ /* mark the first 4GB as reserved so this can still be used for 32bit */ ++ bitmap_set(pe->tce_bitmap, 0, 1ULL << (32 - p->ioda.max_tce_order)); ++ ++ pe_info(pe, "pseudo-bypass sizes: tracker %d bitmap %d TCEs %lld\n", ++ tracker_entries, bitmap_size, tce_count); + + rc = opal_pci_map_pe_dma_window(pe->phb->opal_id, + pe->pe_number, +- /* reconfigure window 0 */ + (pe->pe_number << 1) + 0, + 1, + __pa(tces), + table_size, +- 1 << tce_order); ++ 1 << p->ioda.max_tce_order); + if (rc == OPAL_SUCCESS) { +- pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n"); ++ pe_info(pe, "TCE tables configured for pseudo-bypass\n"); + return 0; + } + err: +- pe_err(pe, "Error configuring 64-bit DMA bypass\n"); +- return -EIO; ++ pe_err(pe, "error configuring pseudo-bypass\n"); ++ return rc; + } + + static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +@@ -1848,7 +1859,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) + struct pnv_ioda_pe *pe; + uint64_t top; + bool bypass = false; +- s64 rc; + + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return -ENODEV; +@@ -1865,21 +1875,15 @@ static int 
pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) + } else { + /* + * If the device can't set the TCE bypass bit but still wants +- * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to +- * bypass the 32-bit region and be usable for 64-bit DMAs. +- * The device needs to be able to address all of this space. ++ * to access 4GB or more, we need to use a different set of DMA ++ * operations with an indirect mapping. + */ + if (dma_mask >> 32 && +- dma_mask > (memory_hotplug_max() + (1ULL << 32)) && +- pnv_pci_ioda_pe_single_vendor(pe) && +- phb->model == PNV_PHB_MODEL_PHB3) { +- /* Configure the bypass mode */ +- rc = pnv_pci_ioda_dma_64bit_bypass(pe); +- if (rc) +- return rc; +- /* 4GB offset bypasses 32-bit space */ +- set_dma_offset(&pdev->dev, (1ULL << 32)); +- set_dma_ops(&pdev->dev, &dma_nommu_ops); ++ phb->model != PNV_PHB_MODEL_P7IOC && ++ pnv_pci_ioda_pe_single_vendor(pe)) { ++ if (!pe->tces) ++ pnv_pci_pseudo_bypass_setup(pe); ++ set_dma_ops(&pdev->dev, &dma_pseudo_bypass_ops); + } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) { + /* + * Fail the request if a DMA mask between 32 and 64 bits +diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h +index ca5414055972..9418c6ea189b 100644 +--- a/arch/powerpc/platforms/powernv/pci.h ++++ b/arch/powerpc/platforms/powernv/pci.h +@@ -70,6 +70,13 @@ struct pnv_ioda_pe { + bool tce_bypass_enabled; + uint64_t tce_bypass_base; + ++ /* TCE tables for DMA pseudo-bypass */ ++ __be64 *tces; ++ u64 tce_count; ++ unsigned long *tce_bitmap; ++ u64 *tce_tracker; // 2 u64s per TCE ++ spinlock_t tce_alloc_lock; ++ + /* MSIs. MVE index is identical for for 32 and 64 bit MSI + * and -1 if not supported. (It's actually identical to the + * PE number) +-- +2.17.1 + + +From 2e6abf2b56d40a953eaa39e2bee064bfcc1da6d1 Mon Sep 17 00:00:00 2001 +From: Russell Currey +Date: Wed, 6 Jun 2018 13:36:06 +1000 +Subject: [PATCH 3/9] powerpc/powernv/pci: Track DMA and TCE tables in debugfs + +Add a new debugfs entry to trigger dumping out the tracking table and +TCEs for a given PE, for example PE 0x4 of PHB 2: + +echo 0x4 > /sys/kernel/debug/powerpc/PCI0002/sketchy + +This will result in the table being dumped out in dmesg. 
+ +Signed-off-by: Russell Currey +--- + arch/powerpc/platforms/powernv/pci-ioda.c | 43 +++++++++++++++++++++++ + 1 file changed, 43 insertions(+) + +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c +index b982558a92ac..3598ca8daa7c 100644 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c +@@ -3203,6 +3203,47 @@ static int pnv_pci_diag_data_set(void *data, u64 val) + DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, + pnv_pci_diag_data_set, "%llu\n"); + ++static int pnv_pci_sketchy_set(void *data, u64 val) ++{ ++ struct pci_controller *hose; ++ struct pnv_ioda_pe *pe; ++ struct pnv_phb *phb; ++ u64 entry1, entry2; ++ int i; ++ ++ hose = (struct pci_controller *)data; ++ if (!hose || !hose->private_data) ++ return -ENODEV; ++ ++ phb = hose->private_data; ++ pe = &phb->ioda.pe_array[val]; ++ ++ if (!pe) ++ return -EINVAL; ++ ++ if (!pe->tces || !pe->tce_tracker) ++ return -EIO; ++ ++ for (i = 0; i < pe->tce_count; i++) { ++ if (i > 16 && pe->tces[i] == 0) ++ break; ++ pr_info("%3d: %016llx\n", i, be64_to_cpu(pe->tces[i])); ++ } ++ ++ for (i = 0; i < pe->tce_count; i++) { ++ entry1 = pe->tce_tracker[i * 2]; ++ entry2 = pe->tce_tracker[i * 2 + 1]; ++ if (!entry1) ++ break; ++ pr_info("%3d: %016llx %016llx\n", i, entry1, entry2); ++ } ++ return 0; ++} ++ ++DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_sketchy_fops, NULL, ++ pnv_pci_sketchy_set, "%llu\n"); ++ ++ + #endif /* CONFIG_DEBUG_FS */ + + static void pnv_pci_ioda_create_dbgfs(void) +@@ -3228,6 +3269,8 @@ static void pnv_pci_ioda_create_dbgfs(void) + + debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose, + &pnv_pci_diag_data_fops); ++ debugfs_create_file("sketchy", 0200, phb->dbgfs, hose, ++ &pnv_pci_sketchy_fops); + } + #endif /* CONFIG_DEBUG_FS */ + } +-- +2.17.1 + + +From 8456e9247c21d9fc7838dd5a71435342f7b79f88 Mon Sep 17 00:00:00 2001 +From: Russell Currey +Date: Tue, 19 Jun 2018 16:21:13 +1000 +Subject: [PATCH 4/9] powerpc/powernv/pci: Safety fixes for pseudobypass TCE + allocation + +Signed-off-by: Russell Currey +--- + arch/powerpc/platforms/powernv/pci-dma.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c +index 1d5409be343e..237940a2a052 100644 +--- a/arch/powerpc/platforms/powernv/pci-dma.c ++++ b/arch/powerpc/platforms/powernv/pci-dma.c +@@ -29,8 +29,9 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr + { + int tce; + __be64 old, new; ++ unsigned long flags; + +- spin_lock(&pe->tce_alloc_lock); ++ spin_lock_irqsave(&pe->tce_alloc_lock, flags); + tce = bitmap_find_next_zero_area(pe->tce_bitmap, + pe->tce_count, + 0, +@@ -40,9 +41,10 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr + old = pe->tces[tce]; + new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); + pe->tces[tce] = new; ++ mb(); + pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n", + tce, new, old); +- spin_unlock(&pe->tce_alloc_lock); ++ spin_unlock_irqrestore(&pe->tce_alloc_lock, flags); + + return tce; + } +-- +2.17.1 + + +From b8949ef1b1bb5977ba9fb35f06d1466b6be475a5 Mon Sep 17 00:00:00 2001 +From: Timothy Pearson +Date: Sat, 23 Jun 2018 16:20:48 -0500 +Subject: [PATCH 5/9] powerpc/powernv/pci: Export + pnv_pci_ioda2_tce_invalidate_pe + +Pseudo DMA support requires a method to invalidate the TCE cache +Export pnv_pci_ioda2_tce_invalidate_pe for use by the pseudo DMA +mapper. 
+ +Signed-off-by: Timothy Pearson +--- + arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- + arch/powerpc/platforms/powernv/pci.h | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c +index 3598ca8daa7c..83f9db17e711 100644 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c +@@ -2100,7 +2100,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm, + } + } + +-static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) ++void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) + { + struct pnv_phb *phb = pe->phb; + +diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h +index 9418c6ea189b..bea565c3f302 100644 +--- a/arch/powerpc/platforms/powernv/pci.h ++++ b/arch/powerpc/platforms/powernv/pci.h +@@ -244,6 +244,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, + /* Nvlink functions */ + extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); + extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); ++extern void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe); + extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); + extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, + struct iommu_table *tbl); +-- +2.17.1 + + +From 4d8211098c35d5b7556f756db18dba89c015600d Mon Sep 17 00:00:00 2001 +From: Timothy Pearson +Date: Sat, 23 Jun 2018 16:22:59 -0500 +Subject: [PATCH 6/9] powerpc/powernv/pci: Invalidate TCE cache after DMA map + setup + +Per the IODA2, TCEs must be invalidated after their settings +have been changed. Invalidate the cache after the address +is changed during TCE allocation when using pseudo DMA. + +Signed-off-by: Timothy Pearson +--- + arch/powerpc/platforms/powernv/pci-dma.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c +index 237940a2a052..060dbc168401 100644 +--- a/arch/powerpc/platforms/powernv/pci-dma.c ++++ b/arch/powerpc/platforms/powernv/pci-dma.c +@@ -42,8 +42,7 @@ static int dma_pseudo_bypass_select_tce(struct pnv_ioda_pe *pe, phys_addr_t addr + new = cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE); + pe->tces[tce] = new; + mb(); +- pe_info(pe, "allocating TCE %i 0x%016llx (old 0x%016llx)\n", +- tce, new, old); ++ pnv_pci_ioda2_tce_invalidate_pe(pe); + spin_unlock_irqrestore(&pe->tce_alloc_lock, flags); + + return tce; +-- +2.17.1 + + +From a963913380c91a465509bae341da1e8aac40cdee Mon Sep 17 00:00:00 2001 +From: Timothy Pearson +Date: Sat, 23 Jun 2018 16:25:16 -0500 +Subject: [PATCH 7/9] powerpc/powernv/pci: Don't use the lower 4G TCEs in + pseudo-DMA mode + +Four TCEs are reserved for legacy 32-bit DMA mappings in psuedo DMA +mode. Mark these with an invalid address to avoid their use by +the TCE cache mapper. 
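+
+For context (assuming the order-30, 1 GB TCEs that PHB4 provides, as described
+earlier in this series), the count of four falls out of the legacy 32-bit
+window size:
+
+	reserved entries = 1ULL << (32 - 30) = 4
+
+and an address of -1 can never collide with a live lookup, since
+dma_pseudo_bypass_get_address() compares tracker entries against the
+TCE-aligned physical address with only bit 0 set, which is never all ones.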
+ +Signed-off-by: Timothy Pearson +--- + arch/powerpc/platforms/powernv/pci-ioda.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c +index 83f9db17e711..f4cd6a5c2bc7 100644 +--- a/arch/powerpc/platforms/powernv/pci-ioda.c ++++ b/arch/powerpc/platforms/powernv/pci-ioda.c +@@ -1780,7 +1780,7 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) + + static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe) + { +- u64 tce_count, table_size, window_size; ++ u64 i, tce_count, table_size, window_size; + struct pnv_phb *p = pe->phb; + struct page *table_pages; + __be64 *tces; +@@ -1832,6 +1832,12 @@ static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe) + /* mark the first 4GB as reserved so this can still be used for 32bit */ + bitmap_set(pe->tce_bitmap, 0, 1ULL << (32 - p->ioda.max_tce_order)); + ++ /* make sure reserved first 4GB TCEs are not used by the mapper ++ * set each address to -1, which will never match an incoming request ++ */ ++ for (i = 0; i < 4; i++) ++ pe->tce_tracker[i * 2] = -1; ++ + pe_info(pe, "pseudo-bypass sizes: tracker %d bitmap %d TCEs %lld\n", + tracker_entries, bitmap_size, tce_count); + +-- +2.17.1 + + +From d397370596955d166c76a1f487ef53d8dbf52d9c Mon Sep 17 00:00:00 2001 +From: Paul Mackerras +Date: Tue, 11 Sep 2018 17:00:30 +1000 +Subject: [PATCH 8/9] KVM: PPC: Book3S HV: Allocate a memory area exclusively + for HPTs + +Currently we allocate HPTs (hashed page tables) for guests using the +CMA (contiguous memory allocator) facility. However, there are +situations where the CMA region can get fragmented, notably when +lots of guest pages get pinned for PCI pass-through, which then causes +HPT allocations to fail even if there is sufficient CMA memory +available overall. + +This commit adds the capability to reserve some memory at boot time +exclusively for HPTs for KVM guests. The amount is controlled with +the kvm_hpt_resv_ratio=N kernel command-line option, where N is the +percentage of system memory to reserve. This reserved memory will +be used first, and only when a guest HPT can't be allocated from this +reserved memory will the CMA region be used. 
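+
+For example, to reserve 4% of system memory for guest HPTs instead of the
+default 2%, append the following to the kernel command line (the value is
+parsed by early_parse_kvm_hpt_resv(), added below):
+
+	kvm_hpt_resv_ratio=4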
+ +Signed-off-by: Paul Mackerras +--- + arch/powerpc/include/asm/kvm_host.h | 2 + + arch/powerpc/include/asm/kvm_ppc.h | 7 ++ + arch/powerpc/kernel/setup-common.c | 3 + + arch/powerpc/kernel/setup.h | 6 +- + arch/powerpc/kvm/book3s_64_mmu_hv.c | 25 +++++-- + arch/powerpc/kvm/book3s_hv_builtin.c | 105 ++++++++++++++++++++++++++- + 6 files changed, 136 insertions(+), 12 deletions(-) + +diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h +index 906bcbdfd2a1..053ba320db49 100644 +--- a/arch/powerpc/include/asm/kvm_host.h ++++ b/arch/powerpc/include/asm/kvm_host.h +@@ -258,6 +258,8 @@ struct kvm_hpt_info { + struct revmap_entry *rev; + /* Guest HPT size is 2**(order) bytes */ + u32 order; ++ /* 1 if HPT allocated from reserved region, 0 otherwise */ ++ int resv; + /* 1 if HPT allocated with CMA, 0 otherwise */ + int cma; + }; +diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h +index e991821dd7fa..9625b0dd28cc 100644 +--- a/arch/powerpc/include/asm/kvm_ppc.h ++++ b/arch/powerpc/include/asm/kvm_ppc.h +@@ -210,6 +210,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long tce_value, unsigned long npages); + extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba); ++extern unsigned long kvmhv_alloc_resv_hpt(u32 order); ++extern void kvmhv_release_resv_hpt(unsigned long hpt, u32 order); + extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages); + extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages); + extern int kvmppc_core_init_vm(struct kvm *kvm); +@@ -436,6 +438,8 @@ struct openpic; + + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + extern void kvm_cma_reserve(void) __init; ++extern void kvm_resv_hpt_init(void); ++ + static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) + { + paca_ptrs[cpu]->kvm_hstate.xics_phys = (void __iomem *)addr; +@@ -476,6 +480,9 @@ extern bool kvm_hv_mode_active(void); + static inline void __init kvm_cma_reserve(void) + {} + ++static inline void kvm_resv_hpt_init(void) ++{} ++ + static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) + {} + +diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c +index 93fa0c99681e..38e36c67ab2f 100644 +--- a/arch/powerpc/kernel/setup-common.c ++++ b/arch/powerpc/kernel/setup-common.c +@@ -979,6 +979,9 @@ void __init setup_arch(char **cmdline_p) + /* Initialize the MMU context management stuff. */ + mmu_context_init(); + ++ /* Reserve memory for KVM HPTs */ ++ kvm_resv_hpt_init(); ++ + #ifdef CONFIG_PPC64 + /* Interrupt code needs to be 64K-aligned. */ + if ((unsigned long)_stext & 0xffff) +diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h +index c6a592b67386..6de1fac35774 100644 +--- a/arch/powerpc/kernel/setup.h ++++ b/arch/powerpc/kernel/setup.h +@@ -53,13 +53,15 @@ extern unsigned long spr_default_dscr; + #endif + + /* +- * Having this in kvm_ppc.h makes include dependencies too +- * tricky to solve for setup-common.c so have it here. ++ * Having these in kvm_ppc.h makes include dependencies too ++ * tricky to solve for setup-common.c so have them here. 
+ */ + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + void kvm_cma_reserve(void); ++void kvm_resv_hpt_init(void); + #else + static inline void kvm_cma_reserve(void) { }; ++static inline void kvm_resv_hpt_init(void) { } + #endif + + #ifdef CONFIG_TAU +diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c +index 68e14afecac8..9e607014f4c7 100644 +--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c ++++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c +@@ -81,7 +81,7 @@ struct kvm_resize_hpt { + int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) + { + unsigned long hpt = 0; +- int cma = 0; ++ int resv = 0, cma = 0; + struct page *page = NULL; + struct revmap_entry *rev; + unsigned long npte; +@@ -89,11 +89,17 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) + if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER)) + return -EINVAL; + +- page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); +- if (page) { +- hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); ++ hpt = kvmhv_alloc_resv_hpt(order); ++ if (hpt) { + memset((void *)hpt, 0, (1ul << order)); +- cma = 1; ++ resv = 1; ++ } else { ++ page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); ++ if (page) { ++ hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); ++ memset((void *)hpt, 0, (1ul << order)); ++ cma = 1; ++ } + } + + if (!hpt) +@@ -109,7 +115,9 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) + /* Allocate reverse map array */ + rev = vmalloc(array_size(npte, sizeof(struct revmap_entry))); + if (!rev) { +- if (cma) ++ if (resv) ++ kvmhv_release_resv_hpt(hpt, order); ++ else if (cma) + kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); + else + free_pages(hpt, order - PAGE_SHIFT); +@@ -118,6 +126,7 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) + + info->order = order; + info->virt = hpt; ++ info->resv = resv; + info->cma = cma; + info->rev = rev; + +@@ -191,7 +200,9 @@ void kvmppc_free_hpt(struct kvm_hpt_info *info) + { + vfree(info->rev); + info->rev = NULL; +- if (info->cma) ++ if (info->resv) ++ kvmhv_release_resv_hpt(info->virt, info->order); ++ else if (info->cma) + kvm_free_hpt_cma(virt_to_page(info->virt), + 1 << (info->order - PAGE_SHIFT)); + else if (info->virt) +diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c +index fc6bb9630a9c..3f36b99fb46b 100644 +--- a/arch/powerpc/kvm/book3s_hv_builtin.c ++++ b/arch/powerpc/kvm/book3s_hv_builtin.c +@@ -53,11 +53,109 @@ EXPORT_SYMBOL_GPL(__xive_vm_h_eoi); + + /* + * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) +- * should be power of 2. ++ * only needs to be 256kB. + */ +-#define HPT_ALIGN_PAGES ((1 << 18) >> PAGE_SHIFT) /* 256k */ ++#define HPT_ALIGN_ORDER 18 /* 256k */ ++#define HPT_ALIGN_PAGES ((1 << HPT_ALIGN_ORDER) >> PAGE_SHIFT) ++ ++#define KVM_RESV_CHUNK_ORDER HPT_ALIGN_ORDER ++ + /* +- * By default we reserve 5% of memory for hash pagetable allocation. ++ * By default we reserve 2% of memory exclusively for guest HPT ++ * allocations, plus another 3% in the CMA zone which can be used ++ * either for HPTs or for movable page allocations. ++ * Each guest's HPT will be sized at between 1/128 and 1/64 of its ++ * memory, i.e. up to 1.56%, and allowing for about a 3x memory ++ * overcommit factor gets us to about 5%. 
++ */ ++static unsigned long kvm_hpt_resv_ratio = 2; ++ ++static int __init early_parse_kvm_hpt_resv(char *p) ++{ ++ pr_debug("%s(%s)\n", __func__, p); ++ if (!p) ++ return -EINVAL; ++ return kstrtoul(p, 0, &kvm_hpt_resv_ratio); ++} ++early_param("kvm_hpt_resv_ratio", early_parse_kvm_hpt_resv); ++ ++static unsigned long kvm_resv_addr; ++static unsigned long *kvm_resv_bitmap; ++static unsigned long kvm_resv_chunks; ++static DEFINE_MUTEX(kvm_resv_lock); ++ ++void kvm_resv_hpt_init(void) ++{ ++ unsigned long align = 1ul << KVM_RESV_CHUNK_ORDER; ++ unsigned long size, bm_size; ++ unsigned long addr, bm; ++ unsigned long *bmp; ++ ++ if (!cpu_has_feature(CPU_FTR_HVMODE)) ++ return; ++ ++ size = memblock_phys_mem_size() * kvm_hpt_resv_ratio / 100; ++ size = ALIGN(size, align); ++ if (!size) ++ return; ++ ++ pr_info("KVM: Allocating %lu MiB for hashed page tables\n", ++ size >> 20); ++ ++ addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); ++ if (!addr) { ++ pr_err("KVM: Allocation of reserved memory for HPTs failed\n"); ++ return; ++ } ++ pr_info("KVM: %lu MiB reserved for HPTs at %lx\n", size >> 20, addr); ++ ++ bm_size = BITS_TO_LONGS(size >> KVM_RESV_CHUNK_ORDER) * sizeof(long); ++ bm = __memblock_alloc_base(bm_size, sizeof(long), ++ MEMBLOCK_ALLOC_ACCESSIBLE); ++ if (!bm) { ++ pr_err("KVM: Allocation of reserved memory bitmap failed\n"); ++ return; ++ } ++ bmp = __va(bm); ++ memset(bmp, 0, bm_size); ++ ++ kvm_resv_addr = (unsigned long) __va(addr); ++ kvm_resv_chunks = size >> KVM_RESV_CHUNK_ORDER; ++ kvm_resv_bitmap = bmp; ++} ++ ++unsigned long kvmhv_alloc_resv_hpt(u32 order) ++{ ++ unsigned long nr_chunks = 1ul << (order - KVM_RESV_CHUNK_ORDER); ++ unsigned long chunk; ++ ++ mutex_lock(&kvm_resv_lock); ++ chunk = bitmap_find_next_zero_area(kvm_resv_bitmap, kvm_resv_chunks, ++ 0, nr_chunks, 0); ++ if (chunk < kvm_resv_chunks) ++ bitmap_set(kvm_resv_bitmap, chunk, nr_chunks); ++ mutex_unlock(&kvm_resv_lock); ++ ++ if (chunk < kvm_resv_chunks) ++ return kvm_resv_addr + (chunk << KVM_RESV_CHUNK_ORDER); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(kvmhv_alloc_resv_hpt); ++ ++void kvmhv_release_resv_hpt(unsigned long addr, u32 order) ++{ ++ unsigned long nr_chunks = 1ul << (order - KVM_RESV_CHUNK_ORDER); ++ unsigned long chunk = (addr - kvm_resv_addr) >> KVM_RESV_CHUNK_ORDER; ++ ++ mutex_lock(&kvm_resv_lock); ++ if (chunk + nr_chunks <= kvm_resv_chunks) ++ bitmap_clear(kvm_resv_bitmap, chunk, nr_chunks); ++ mutex_unlock(&kvm_resv_lock); ++} ++EXPORT_SYMBOL_GPL(kvmhv_release_resv_hpt); ++ ++/* ++ * By default we reserve 3% of memory for the CMA zone. + */ + static unsigned long kvm_cma_resv_ratio = 5; + +@@ -106,6 +204,7 @@ void __init kvm_cma_reserve(void) + */ + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return; ++ + /* + * We cannot use memblock_phys_mem_size() here, because + * memblock_analyze() has not been called yet. 
+-- +2.17.1 + + +From 57a21f640a2b6d2e225cac7df35ed0dde7c6293f Mon Sep 17 00:00:00 2001 +From: Timothy Pearson +Date: Sun, 17 Jun 2018 23:59:51 -0500 +Subject: [PATCH 9/9] Fix undefined behaviour from signed integer overflow + +Caught by UBSAN + +Signed-off-by: Timothy Pearson +--- + drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c b/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c +index a029e47c2319..f7e56bec2dd7 100644 +--- a/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c ++++ b/drivers/gpu/drm/amd/powerplay/smumgr/smu7_smumgr.c +@@ -98,7 +98,7 @@ int smu7_copy_bytes_to_smc(struct pp_hwmgr *hwmgr, uint32_t smc_start_address, + + while (byte_count >= 4) { + /* Bytes are written into the SMC addres space with the MSB first. */ +- data = src[0] * 0x1000000 + src[1] * 0x10000 + src[2] * 0x100 + src[3]; ++ data = src[0] * 0x1000000U + src[1] * 0x10000U + src[2] * 0x100U + src[3]; + + result = smu7_set_smc_sram_address(hwmgr, addr, limit); + +-- +2.17.1 +